diff --git a/alpine/BumponTailInstabilityManager.h b/alpine/BumponTailInstabilityManager.h
index 7525f9af1..5e03a0712 100644
--- a/alpine/BumponTailInstabilityManager.h
+++ b/alpine/BumponTailInstabilityManager.h
@@ -274,10 +274,10 @@ class BumponTailInstabilityManager : public AlpineManager<T, Dim> {
 
         this->pcontainer_m->create(nlocal);
 
-        view_type* R = &(this->pcontainer_m->R.getView());
-        samplingR.generate(*R, rand_pool64);
+        view_type R = this->pcontainer_m->R.getView();
+        samplingR.generate(R, rand_pool64);
 
-        view_type* P = &(this->pcontainer_m->P.getView());
+        view_type P = this->pcontainer_m->P.getView();
 
         double mu[Dim];
         double sd[Dim];
@@ -288,12 +288,12 @@ class BumponTailInstabilityManager : public AlpineManager<T, Dim> {
         // sample first nlocBulk with muBulk as mean velocity
         mu[Dim - 1] = muBulk_m;
         Kokkos::parallel_for(Kokkos::RangePolicy<int>(0, nlocBulk),
-                             ippl::random::randn<double, Dim>(*P, rand_pool64, mu, sd));
+                             ippl::random::randn<double, Dim>(P, rand_pool64, mu, sd));
 
         // sample remaining with muBeam as mean velocity
         mu[Dim - 1] = muBeam_m;
         Kokkos::parallel_for(Kokkos::RangePolicy<int>(nlocBulk, nlocal),
-                             ippl::random::randn<double, Dim>(*P, rand_pool64, mu, sd));
+                             ippl::random::randn<double, Dim>(P, rand_pool64, mu, sd));
 
         Kokkos::fence();
         ippl::Comm->barrier();
diff --git a/alpine/LandauDampingManager.h b/alpine/LandauDampingManager.h
index b8491c7a5..23dcc632a 100644
--- a/alpine/LandauDampingManager.h
+++ b/alpine/LandauDampingManager.h
@@ -226,10 +226,10 @@ class LandauDampingManager : public AlpineManager<T, Dim> {
 
         this->pcontainer_m->create(nlocal);
 
-        view_type* R = &(this->pcontainer_m->R.getView());
-        samplingR.generate(*R, rand_pool64);
+        view_type R = this->pcontainer_m->R.getView();
+        samplingR.generate(R, rand_pool64);
 
-        view_type* P = &(this->pcontainer_m->P.getView());
+        view_type P = this->pcontainer_m->P.getView();
 
         double mu[Dim];
         double sd[Dim];
@@ -237,7 +237,7 @@ class LandauDampingManager : public AlpineManager<T, Dim> {
             mu[i] = 0.0;
             sd[i] = 1.0;
         }
-        Kokkos::parallel_for(nlocal, ippl::random::randn<double, Dim>(*P, rand_pool64, mu, sd));
+        Kokkos::parallel_for(nlocal, ippl::random::randn<double, Dim>(P, rand_pool64, mu, sd));
         Kokkos::fence();
         ippl::Comm->barrier();
 
diff --git a/alpine/PenningTrapManager.h b/alpine/PenningTrapManager.h
index b721f3ec8..c7e81de43 100644
--- a/alpine/PenningTrapManager.h
+++ b/alpine/PenningTrapManager.h
@@ -202,14 +202,14 @@ class PenningTrapManager : public AlpineManager<T, Dim> {
 
         this->pcontainer_m->create(nlocal);
 
-        view_type* R = &(this->pcontainer_m->R.getView());
-        samplingR.generate(*R, rand_pool64);
+        view_type R = this->pcontainer_m->R.getView();
+        samplingR.generate(R, rand_pool64);
 
-        view_type* P = &(this->pcontainer_m->P.getView());
+        view_type P = this->pcontainer_m->P.getView();
 
         double muP[Dim] = {0.0, 0.0, 0.0};
         double sdP[Dim] = {1.0, 1.0, 1.0};
-        Kokkos::parallel_for(nlocal, ippl::random::randn<double, Dim>(*P, rand_pool64, muP, sdP));
+        Kokkos::parallel_for(nlocal, ippl::random::randn<double, Dim>(P, rand_pool64, muP, sdP));
 
         Kokkos::fence();
         ippl::Comm->barrier();
diff --git a/src/LinearSolvers/PCG.h b/src/LinearSolvers/PCG.h
index bdb65778d..c02c6e60b 100644
--- a/src/LinearSolvers/PCG.h
+++ b/src/LinearSolvers/PCG.h
@@ -6,9 +6,9 @@
 #ifndef IPPL_PCG_H
 #define IPPL_PCG_H
 
+#include "FEM/FEMVector.h"
 #include "Preconditioner.h"
 #include "SolverAlgorithm.h"
-#include "FEM/FEMVector.h"
 
 namespace ippl {
     template <typename OperatorRet, typename LowerRet, typename UpperRet, typename UpperLowerRet,
@@ -20,12 +20,12 @@ namespace ippl {
 
     public:
         using typename Base::lhs_type, typename Base::rhs_type;
-        using OperatorF    = std::function<OperatorRet(lhs_type)>;
-        using LowerF       = std::function<LowerRet(lhs_type)>;
-        using UpperF       = std::function<UpperRet(lhs_type)>;
-        using UpperLowerF  = std::function<UpperLowerRet(lhs_type)>;
-        using InverseDiagF = std::function<InverseDiagRet(lhs_type)>;
-        using DiagF        = std::function<DiagRet(lhs_type)>;
+        using OperatorF    = std::function<OperatorRet(lhs_type&)>;
+        using LowerF       = std::function<LowerRet(lhs_type&)>;
+        using UpperF       = std::function<UpperRet(lhs_type&)>;
+        using UpperLowerF  = std::function<UpperLowerRet(lhs_type&)>;
+        using InverseDiagF = std::function<InverseDiagRet(lhs_type&)>;
+        using DiagF        = std::function<DiagRet(lhs_type&)>;
         using mesh_type    = typename lhs_type::Mesh_t;
         using layout_type  = typename lhs_type::Layout_t;
 
@@ -75,9 +75,8 @@ namespace ippl {
             // set in main
             [[maybe_unused]] int outer =
                 1,  // This is a dummy default parameter, actual default parameter should be
-            [[maybe_unused]] double omega =
-                1  // This is a dummy default parameter, actual default parameter should be
-                   // set in main
+            [[maybe_unused]] double omega = 1  // This is a dummy default parameter, actual default
+                                               // parameter should be set in main
         ) {}
         /*!
          * Query how many iterations were required to obtain the solution
@@ -90,10 +89,10 @@ namespace ippl {
                                 const ParameterList& params) override {
             constexpr unsigned Dim = lhs_type::dim;
 
-            static IpplTimings::TimerRef cg_ops = IpplTimings::getTimer("CG");
+            static IpplTimings::TimerRef cg_ops    = IpplTimings::getTimer("CG");
             static IpplTimings::TimerRef up_layout = IpplTimings::getTimer("updateLayout");
-            static IpplTimings::TimerRef apply = IpplTimings::getTimer("applyOp");
-            static IpplTimings::TimerRef inner = IpplTimings::getTimer("innerProduct");
+            static IpplTimings::TimerRef apply     = IpplTimings::getTimer("applyOp");
+            static IpplTimings::TimerRef inner     = IpplTimings::getTimer("innerProduct");
 
             IpplTimings::startTimer(cg_ops);
 
@@ -138,14 +137,13 @@ namespace ippl {
             d.setFieldBC(bc);
 
             IpplTimings::startTimer(inner);
-            T delta1          = innerProduct(r, d);
+            T delta1 = innerProduct(r, d);
             IpplTimings::stopTimer(inner);
             T delta0          = delta1;
             residueNorm       = Kokkos::sqrt(delta1);
             const T tolerance = params.get<T>("tolerance") * norm(rhs);
 
             while (iterations_m < maxIterations && residueNorm > tolerance) {
-
                 IpplTimings::startTimer(apply);
                 q = op_m(d);
                 IpplTimings::stopTimer(apply);
@@ -153,7 +151,7 @@ namespace ippl {
                 IpplTimings::startTimer(inner);
                 T alpha = delta1 / innerProduct(d, q);
                 IpplTimings::stopTimer(inner);
-                lhs     = lhs + alpha * d;
+                lhs = lhs + alpha * d;
 
                 // The exact residue is given by
                 // r = rhs - op_m(lhs);
@@ -188,26 +186,28 @@ namespace ippl {
         T residueNorm    = 0;
         int iterations_m = 0;
 
-    private:
+        // Workspaces, allocated once via initializeFields() and reused across
+        // solves. Protected so derived solvers (e.g. PCG) can extend the
+        // workspace set without redeclaring r, d, q as locals on every
+        // operator() call.
         lhs_type r;
         lhs_type d;
         lhs_type q;
     };
 
-
     template <typename OperatorRet, typename LowerRet, typename UpperRet, typename UpperLowerRet,
               typename InverseDiagRet, typename T>
-    class CG<OperatorRet, LowerRet, UpperRet, UpperLowerRet, InverseDiagRet, FEMVector<T>, FEMVector<T> >
-            : public SolverAlgorithm<FEMVector<T>, FEMVector<T>> {
+    class CG<OperatorRet, LowerRet, UpperRet, UpperLowerRet, InverseDiagRet, FEMVector<T>,
+             FEMVector<T>> : public SolverAlgorithm<FEMVector<T>, FEMVector<T>> {
         using Base = SolverAlgorithm<FEMVector<T>, FEMVector<T>>;
 
     public:
         using typename Base::lhs_type, typename Base::rhs_type;
-        using OperatorF    = std::function<OperatorRet(lhs_type)>;
-        using LowerF       = std::function<LowerRet(lhs_type)>;
-        using UpperF       = std::function<UpperRet(lhs_type)>;
-        using UpperLowerF  = std::function<UpperLowerRet(lhs_type)>;
-        using InverseDiagF = std::function<InverseDiagRet(lhs_type)>;
+        using OperatorF    = std::function<OperatorRet(lhs_type&)>;
+        using LowerF       = std::function<LowerRet(lhs_type&)>;
+        using UpperF       = std::function<UpperRet(lhs_type&)>;
+        using UpperLowerF  = std::function<UpperLowerRet(lhs_type&)>;
+        using InverseDiagF = std::function<InverseDiagRet(lhs_type&)>;
 
         virtual ~CG() = default;
 
@@ -249,13 +249,12 @@ namespace ippl {
          * @return Iteration count of last solve
          */
         virtual int getIterationCount() { return iterations_m; }
-        
+
         virtual void operator()(lhs_type& lhs, rhs_type& rhs,
                                 const ParameterList& params) override {
-            
-            //constexpr unsigned Dim             = lhs_type::dim;
-            //typename lhs_type::Mesh_t& mesh     = lhs.get_mesh();
-            //typename lhs_type::Layout_t& layout = lhs.getLayout();
+            // constexpr unsigned Dim             = lhs_type::dim;
+            // typename lhs_type::Mesh_t& mesh     = lhs.get_mesh();
+            // typename lhs_type::Layout_t& layout = lhs.getLayout();
 
             iterations_m            = 0;
             const int maxIterations = params.get<int>("max_iterations");
@@ -263,23 +262,21 @@ namespace ippl {
             // Variable names mostly based on description in
             // https://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf
             lhs_type r = lhs.deepCopy();
-            r = 0;
+            r          = 0;
             lhs_type d = lhs.deepCopy();
-            d = 0;
-            
-           
+            d          = 0;
+
             r = rhs - op_m(lhs);
             r.setHalo(0);
-            d = r; //.deepCopy();
-            //d.setFieldBC(bc);
+            d = r;  //.deepCopy();
+            // d.setFieldBC(bc);
             T delta1          = innerProduct(r, d);
             T delta0          = delta1;
             residueNorm       = Kokkos::sqrt(delta1);
             const T tolerance = params.get<T>("tolerance") * norm(rhs);
 
             lhs_type q = lhs.deepCopy();
-            q = 0;
-            
+            q          = 0;
 
             while (iterations_m < maxIterations && residueNorm > tolerance) {
                 q = op_m(d);
@@ -303,8 +300,6 @@ namespace ippl {
                 residueNorm = Kokkos::sqrt(delta1);
                 d           = r + beta * d;
                 ++iterations_m;
-
-                
             }
         }
 
@@ -316,7 +311,6 @@ namespace ippl {
         int iterations_m = 0;
     };
 
-
     template <typename OperatorRet, typename LowerRet, typename UpperRet, typename UpperLowerRet,
               typename InverseDiagRet, typename DiagRet, typename FieldLHS,
               typename FieldRHS = FieldLHS>
@@ -327,17 +321,27 @@ namespace ippl {
 
     public:
         using typename Base::lhs_type, typename Base::rhs_type;
-        using OperatorF    = std::function<OperatorRet(lhs_type)>;
-        using LowerF       = std::function<LowerRet(lhs_type)>;
-        using UpperF       = std::function<UpperRet(lhs_type)>;
-        using UpperLowerF  = std::function<UpperLowerRet(lhs_type)>;
-        using InverseDiagF = std::function<InverseDiagRet(lhs_type)>;
-        using DiagF        = std::function<DiagRet(lhs_type)>;
+        using OperatorF    = std::function<OperatorRet(lhs_type&)>;
+        using LowerF       = std::function<LowerRet(lhs_type&)>;
+        using UpperF       = std::function<UpperRet(lhs_type&)>;
+        using UpperLowerF  = std::function<UpperLowerRet(lhs_type&)>;
+        using InverseDiagF = std::function<InverseDiagRet(lhs_type&)>;
+        using DiagF        = std::function<DiagRet(lhs_type&)>;
+
+        using mesh_type   = typename lhs_type::Mesh_t;
+        using layout_type = typename lhs_type::Layout_t;
 
         PCG()
             : CG<OperatorRet, LowerRet, UpperRet, UpperLowerRet, InverseDiagRet, DiagRet, FieldLHS,
                  FieldRHS>()
-            , preconditioner_m(nullptr){};
+            , preconditioner_m(nullptr) {};
+
+        void initializeFields(mesh_type& mesh, layout_type& layout) override {
+            CG<OperatorRet, LowerRet, UpperRet, UpperLowerRet, InverseDiagRet, DiagRet, FieldLHS,
+               FieldRHS>::initializeFields(mesh, layout);
+            s.initialize(mesh, layout);
+            pcond_out.initialize(mesh, layout);
+        }
 
         /*!
          * Sets the differential operator for the conjugate gradient algorithm
@@ -365,8 +369,9 @@ namespace ippl {
             // set in main
             double omega = 1.57079632679  // This is a dummy default parameter, actual default
             // parameter should be set in main
-            // default = pi/2 as this was found optimal during hyperparameter scan for test case 
-            // (see https://amas.web.psi.ch/people/aadelmann/ETH-Accel-Lecture-1/projectscompleted/cse/BSc-mbolliger.pdf)
+            // default = pi/2 as this was found optimal during hyperparameter scan for test case
+            // (see
+            // https://amas.web.psi.ch/people/aadelmann/ETH-Accel-Lecture-1/projectscompleted/cse/BSc-mbolliger.pdf)
             ) override {
             if (preconditioner_type == "jacobi") {
                 // Turn on damping parameter
@@ -396,8 +401,7 @@ namespace ippl {
                 preconditioner_m =
                     std::move(std::make_unique<
                               richardson_preconditioner_alt<FieldLHS, OperatorF, InverseDiagF>>(
-                        std::move(op), std::move(inverse_diagonal),
-                        richardson_iterations));
+                        std::move(op), std::move(inverse_diagonal), richardson_iterations));
             } else if (preconditioner_type == "gauss-seidel") {
                 preconditioner_m = std::move(
                     std::make_unique<gs_preconditioner<FieldLHS, LowerF, UpperF, InverseDiagF>>(
@@ -422,19 +426,23 @@ namespace ippl {
                                     "Preconditioner has not been set for PCG solver");
             }
 
-            typename lhs_type::Mesh_t& mesh     = lhs.get_mesh();
-            typename lhs_type::Layout_t& layout = lhs.getLayout();
-
             this->iterations_m      = 0;
             const int maxIterations = params.get<int>("max_iterations");
 
             // Variable names mostly based on description in
             // https://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf
-            lhs_type r(mesh, layout);
-            lhs_type d(mesh, layout);
-            lhs_type s(mesh, layout);
-            lhs_type q(mesh, layout);
-
+            // Field layouts are updated such that we don't keep a stale domain decomposition
+            // if the lhs layout has been changed during the simulation, for example by the load
+            // balancer.
+            this->r.updateLayout(lhs.getLayout());
+            this->d.updateLayout(lhs.getLayout());
+            s.updateLayout(lhs.getLayout());
+            pcond_out.updateLayout(lhs.getLayout());
+            this->q.updateLayout(lhs.getLayout());
+
+            // Each preconditioner's init_fields() is responsible for being
+            // cheap on the steady-state path (refreshing layout in-place, not
+            // reallocating).
             preconditioner_m->init_fields(lhs);
 
             using bc_type  = BConds<lhs_type, Dim>;
@@ -459,20 +467,22 @@ namespace ippl {
                 }
             }
 
-            r = rhs - this->op_m(lhs);
-            d = preconditioner_m->operator()(r).deepCopy();
-            d.setFieldBC(bc);
+            this->r = rhs - this->op_m(lhs);
+            (*preconditioner_m)(this->r, pcond_out);
+            this->d = T(1) * pcond_out;
+            this->d.setFieldBC(bc);
 
-            T delta1          = innerProduct(r, d);
+            T delta1          = innerProduct(this->r, this->d);
             T delta0          = delta1;
             this->residueNorm = Kokkos::sqrt(Kokkos::abs(delta1));
             const T tolerance = params.get<T>("tolerance") * this->residueNorm;
 
-            while (this->iterations_m < maxIterations && this->residueNorm > tolerance) {
-                q       = this->op_m(d);
-                q       = q.deepCopy();
-                T alpha = delta1 / innerProduct(d, q);
-                lhs     = lhs + alpha * d;
+            while (this->iterations_m<maxIterations&& this->residueNorm> tolerance) {
+                // op_m(d) writes its expression into q's existing storage; no
+                // allocation, no per-iteration deep copy.
+                this->q = this->op_m(this->d);
+                T alpha = delta1 / innerProduct(this->d, this->q);
+                lhs     = lhs + alpha * this->d;
 
                 // The exact residue is given by
                 // r = rhs - BaseCG::op_m(lhs);
@@ -481,16 +491,17 @@ namespace ippl {
                 // the correction does not have a significant effect on accuracy;
                 // in some implementations, the correction may be applied every few
                 // iterations to offset accumulated floating point errors
-                r = r - alpha * q;
-                s = preconditioner_m->operator()(r).deepCopy();
+                this->r = this->r - alpha * this->q;
+                // s := M^{-1} r; preconditioner writes into s (NoBcFace).
+                (*preconditioner_m)(this->r, s);
 
                 delta0 = delta1;
-                delta1 = innerProduct(r, s);
+                delta1 = innerProduct(this->r, s);
 
                 T beta            = delta1 / delta0;
                 this->residueNorm = Kokkos::sqrt(Kokkos::abs(delta1));
 
-                d = s + beta * d;
+                this->d = s + beta * this->d;
                 ++this->iterations_m;
             }
 
@@ -502,6 +513,20 @@ namespace ippl {
 
     protected:
         std::unique_ptr<preconditioner<FieldLHS>> preconditioner_m;
+
+        /*
+         * Extends the CG workspace with the preconditioner result buffers s and
+         * pcond_out so operator() does not allocate per solve.
+         * Preconditioner result buffers, allocated once via initializeFields()
+         * pcond_out keeps its default NoBcFace BCs: the preconditioner's internal operator
+         * chain therefore never triggers PeriodicFace::apply MPI calls -- if it
+         * did, the global MPI sequence would diverge from the master code path
+         * (where the preconditioner returned a fresh NoBcFace field) and
+         * intermittent multi-rank halo deadlocks would follow.
+         */
+
+        lhs_type s;
+        lhs_type pcond_out;
     };
 
 };  // namespace ippl
diff --git a/src/LinearSolvers/Preconditioner.h b/src/LinearSolvers/Preconditioner.h
index 09eb876aa..2fda7c4cc 100644
--- a/src/LinearSolvers/Preconditioner.h
+++ b/src/LinearSolvers/Preconditioner.h
@@ -7,13 +7,23 @@
 #ifndef IPPL_PRECONDITIONER_H
 #define IPPL_PRECONDITIONER_H
 
+#include <vector>
+
 #include "Expression/IpplOperations.h"  // get the function apply()
 
 // Expands to a lambda that acts as a wrapper for a differential operator
 // fun: the function for which to create the wrapper, such as ippl::laplace
 // type: the argument type, which should match the LHS type for the solver
+//
+// IMPORTANT: takes the field by *reference*, not by value. A by-value
+// signature would copy the Field on every call -- BareField has shared-
+// ownership Kokkos::View members, so the data isn't actually copied, but
+// the embedded HaloCells::haloData_m buffer that the halo exchange grows
+// via Kokkos::realloc only grows on the copy and never propagates back to
+// the original. The result was a fresh cudaMalloc per op_m() call in
+// multi-rank runs (see nsys profile in ~/prof/pif-pr-verify).
 #define IPPL_SOLVER_OPERATOR_WRAPPER(fun, type) \
-    [](type arg) {                              \
+    [](type& arg) {                             \
         return fun(arg);                        \
     }
 
@@ -32,17 +42,18 @@ namespace ippl {
 
         virtual ~preconditioner() = default;
 
-        // Placeholder for the function operator, actually implemented in the derived classes
-        virtual Field operator()(Field& u) {
-            Field res = u.deepCopy();
-            return res;
+        // Apply the preconditioner: result = M^{-1} u. Concrete preconditioners
+        // override this to write into the caller-provided result buffer; this
+        // avoids per-call Field allocations and per-call deep copies in PCG.
+        // The default (identity) preconditioner copies u into result.
+        virtual void operator()(Field& u, Field& result) {
+            Kokkos::deep_copy(result.getView(), u.getView());
         }
 
-        // Placeholder for setting additional fields, actually implemented in the derived classes
-        virtual void init_fields(Field& b) {
-            Field res = b.deepCopy();
-            return;
-        }
+        // Allocate any scratch fields the preconditioner needs. Called once
+        // by the owning solver after the layout is known. Concrete
+        // preconditioners that need scratch override this.
+        virtual void init_fields(Field& /*b*/) {}
 
         std::string get_type() { return type_m; };
 
@@ -66,14 +77,23 @@ namespace ippl {
             inverse_diagonal_m = std::move(inverse_diagonal);
         }
 
-        Field operator()(Field& u) override {
-            mesh_type& mesh     = u.get_mesh();
-            layout_type& layout = u.getLayout();
-            Field res(mesh, layout);
-
-            res = inverse_diagonal_m(u);
-            res = w_m * res;
-            return res;
+        void operator()(Field& u, Field& result) override {
+            // result = w * D^{-1} * u, written element-wise into the caller-
+            // provided result buffer. Two forms of inverse_diagonal_m are
+            // supported, matching the convention used by the other
+            // preconditioners in this file:
+            //   - returns a scalar (e.g. uniform-mesh Poisson Laplacian):
+            //       result = w * scalar * u
+            //   - returns a Field / expression equal to D^{-1} * u
+            //     (e.g. matrix-free FEM operators):
+            //       result = w * inverse_diagonal_m(u)
+            if constexpr (std::is_same_v<InvDiagF, std::function<double(Field&)>>) {
+                const double scale = w_m * inverse_diagonal_m(u);
+                result             = scale * u;
+            } else {
+                result = inverse_diagonal_m(u);
+                result = w_m * result;
+            }
         }
 
     protected:
@@ -125,12 +145,47 @@ namespace ippl {
             return *this = polynomial_newton_preconditioner(other);
         }
 
-        Field recursive_preconditioner(Field& u, unsigned int level) {
-            mesh_type& mesh     = u.get_mesh();
-            layout_type& layout = u.getLayout();
-            // Define etas if not defined yet
+        // Recursive Newton expansion P_k(u) where:
+        //   P_0(u) = eta_0 * u
+        //   P_k(u) = eta_k * (2 P_{k-1}(u) - P_{k-1}(A P_{k-1}(u)))
+        // Writes the result of level `level` into `out`. Uses one scratch
+        // field per recursion depth (Pr, PA, PAPr) so that no Field is
+        // allocated per call. The two recursive calls at each level execute
+        // sequentially and may reuse scratch at lower depths because the
+        // first call's lower-depth values have already been folded into
+        // Pr_scratch[level] before the second call begins.
+        void recursive_preconditioner(Field& u, unsigned int level, Field& out) {
+            // Timers accumulated across all levels of the recursion: useful
+            // for "how much of pcond is laplace vs combine vs leaf-assign".
+            // Counts include every call at every level (so the call-count
+            // column equals 2^(level_m+1)-1, not the number of operator()
+            // invocations).
+            if (level == 0) {
+                out = eta_m[0] * u;
+                return;
+            }
+            Field& Pr   = Pr_scratch_m[level];
+            Field& PA   = PA_scratch_m[level];
+            Field& PAPr = PAPr_scratch_m[level];
+
+            // Inner recursive call owns its own timing; we don't double-count
+            // it here.
+            recursive_preconditioner(u, level - 1, Pr);
+            PA = op_m(Pr);
+            recursive_preconditioner(PA, level - 1, PAPr);
+            out = eta_m[level] * (2.0 * Pr - PAPr);
+        }
+
+        void operator()(Field& u, Field& result) override {
+            // One outer timer per polynomial_newton apply, complementing the
+            // per-call recursion timers. Lets us see how many outer pcond
+            // calls a solve does vs. the time inside each.
+            recursive_preconditioner(u, level_m, result);
+        }
+
+        void init_fields(Field& b) override {
+            // One-shot precomputation of the eta coefficients.
             if (eta_m == nullptr) {
-                // Precompute the etas for later use
                 eta_m    = new double[level_m + 1];
                 eta_m[0] = 2.0 / ((alpha_m + beta_m) * (1.0 + zeta_m));
                 if (level_m > 0) {
@@ -143,25 +198,31 @@ namespace ippl {
                 }
             }
 
-            Field res(mesh, layout);
-            // Base case
-            if (level == 0) {
-                res = eta_m[0] * u;
-                return res;
+            // First call: allocate one scratch field per recursion depth.
+            // Subsequent calls: refresh layout in place so a load-balance
+            // repartition tracks correctly without freeing/reallocating
+            // when the local extents are unchanged.
+            mesh_type& mesh     = b.get_mesh();
+            layout_type& layout = b.getLayout();
+            if (!fields_initialized_m) {
+                Pr_scratch_m.resize(level_m + 1);
+                PA_scratch_m.resize(level_m + 1);
+                PAPr_scratch_m.resize(level_m + 1);
+                for (unsigned int i = 1; i <= level_m; ++i) {
+                    Pr_scratch_m[i]   = Field(mesh, layout);
+                    PA_scratch_m[i]   = Field(mesh, layout);
+                    PAPr_scratch_m[i] = Field(mesh, layout);
+                }
+                fields_initialized_m = true;
+            } else {
+                for (unsigned int i = 1; i <= level_m; ++i) {
+                    Pr_scratch_m[i].updateLayout(layout);
+                    PA_scratch_m[i].updateLayout(layout);
+                    PAPr_scratch_m[i].updateLayout(layout);
+                }
             }
-            // Recursive case
-            Field PAPr(mesh, layout);
-            Field Pr(mesh, layout);
-
-            Pr   = recursive_preconditioner(u, level - 1);
-            PAPr = op_m(Pr);
-            PAPr = recursive_preconditioner(PAPr, level - 1);
-            res  = eta_m[level] * (2.0 * Pr - PAPr);
-            return res;
         }
 
-        Field operator()(Field& u) override { return recursive_preconditioner(u, level_m); }
-
     protected:
         OperatorF op_m;        // Operator to be preconditioned
         double alpha_m;        // Smallest Eigenvalue
@@ -170,6 +231,10 @@ namespace ippl {
         double zeta_m;  // smallest (alpha + beta) is multiplied by (1+zeta) to avoid clustering of
                         // Eigenvalues
         double* eta_m = nullptr;  // Size is determined at runtime
+        std::vector<Field> Pr_scratch_m;
+        std::vector<Field> PA_scratch_m;
+        std::vector<Field> PAPr_scratch_m;
+        bool fields_initialized_m = false;
     };
 
     /*!
@@ -219,19 +284,37 @@ namespace ippl {
             return *this = polynomial_chebyshev_preconditioner(other);
         }
 
-        Field operator()(Field& r) override {
-            mesh_type& mesh     = r.get_mesh();
-            layout_type& layout = r.getLayout();
+        void operator()(Field& r, Field& result) override {
+            // x_m, x_old_m, A_m, z_m are pre-allocated scratch (init_fields).
+            // Coefficients in rho_m are also computed once.
+            x_old_m = r / theta_m;
+            A_m     = op_m(r);
+            x_m     = 2.0 * rho_m[1] / delta_m * (2.0 * r - A_m / theta_m);
+            if (degree_m == 0) {
+                // result = x_old
+                Kokkos::deep_copy(result.getView(), x_old_m.getView());
+                return;
+            }
 
-            Field res(mesh, layout);
-            Field x(mesh, layout);
-            Field x_old(mesh, layout);
-            Field A(mesh, layout);
-            Field z(mesh, layout);
+            if (degree_m == 1) {
+                // result = x
+                Kokkos::deep_copy(result.getView(), x_m.getView());
+                return;
+            }
+            for (unsigned int i = 2; i < degree_m + 1; ++i) {
+                A_m = op_m(x_m);
+                z_m = 2.0 / delta_m * (r - A_m);
+                // Write the new x value into result (the caller's buffer);
+                // x_old gets a deep copy of the previous x.
+                result = rho_m[i] * (2 * sigma_m * x_m - rho_m[i - 1] * x_old_m + z_m);
+                Kokkos::deep_copy(x_old_m.getView(), x_m.getView());
+                Kokkos::deep_copy(x_m.getView(), result.getView());
+            }
+        }
 
-            // Precompute the coefficients if not done yet
+        void init_fields(Field& b) override {
+            // One-shot precomputation of the rho coefficients.
             if (rho_m == nullptr) {
-                // Start precomputing the coefficients
                 theta_m = (beta_m + alpha_m) / 2.0 * (1.0 + zeta_m);
                 delta_m = (beta_m - alpha_m) / 2.0;
                 sigma_m = theta_m / delta_m;
@@ -241,29 +324,23 @@ namespace ippl {
                 for (unsigned int i = 1; i < degree_m + 1; ++i) {
                     rho_m[i] = 1.0 / (2.0 * sigma_m - rho_m[i - 1]);
                 }
-            }  // End of precomputing the coefficients
-
-            res = r.deepCopy();
-
-            x_old = r / theta_m;
-            A     = op_m(r);
-            x     = 2.0 * rho_m[1] / delta_m * (2.0 * r - A / theta_m);
-
-            if (degree_m == 0) {
-                return x_old;
-            }
-
-            if (degree_m == 1) {
-                return x;
             }
-            for (unsigned int i = 2; i < degree_m + 1; ++i) {
-                A     = op_m(x);
-                z     = 2.0 / delta_m * (r - A);
-                res   = rho_m[i] * (2 * sigma_m * x - rho_m[i - 1] * x_old + z);
-                x_old = x.deepCopy();
-                x     = res.deepCopy();
+            mesh_type& mesh     = b.get_mesh();
+            layout_type& layout = b.getLayout();
+            // First call allocates; subsequent calls refresh the layout so a
+            // repartition is tracked without throwing the storage away.
+            if (!fields_initialized_m) {
+                x_m                  = Field(mesh, layout);
+                x_old_m              = Field(mesh, layout);
+                A_m                  = Field(mesh, layout);
+                z_m                  = Field(mesh, layout);
+                fields_initialized_m = true;
+            } else {
+                x_m.updateLayout(layout);
+                x_old_m.updateLayout(layout);
+                A_m.updateLayout(layout);
+                z_m.updateLayout(layout);
             }
-            return res;
         }
 
     protected:
@@ -276,6 +353,11 @@ namespace ippl {
         unsigned degree_m;
         double zeta_m;
         double* rho_m = nullptr;  // Size is determined at runtime
+        Field x_m;
+        Field x_old_m;
+        Field A_m;
+        Field z_m;
+        bool fields_initialized_m = false;
     };
 
     /*!
@@ -295,22 +377,18 @@ namespace ippl {
             inverse_diagonal_m = std::move(inverse_diagonal);
         }
 
-        Field operator()(Field& r) override {
+        void operator()(Field& r, Field& result) override {
+            // Richardson iteration in-place on the caller-provided result
+            // buffer. ULg_m stays as a member scratch; the inner deep copies
+            // remain because the (upper, diag, inverse, lower) operators may
+            // return Field-valued expressions that alias their input.
 
-            // In the FEM solver, which uses the preconditioner, 
-            // we re-use a resultField to avoid allocating new
-            // memory at every iteration.
-            // In order for the operator calls to not rewrite
-            // on this same field over and over when calling 
-            // the operators (upper, diag, inverse, lower, etc)
-            // we need deep copies to the preconditioner fields.
-
-            g_m = 0;
+            result = 0;
             for (unsigned int j = 0; j < innerloops_m; ++j) {
-                ULg_m = upper_and_lower_m(g_m);
-                ULg_m = ULg_m.deepCopy();
-                g_m   = r - ULg_m;
-         
+                ULg_m  = upper_and_lower_m(result);
+                ULg_m  = ULg_m.deepCopy();
+                result = r - ULg_m;
+
                 // The inverse diagonal is applied to the
                 // vector itself to return the result usually.
                 // However, the operator for FEM already
@@ -318,21 +396,23 @@ namespace ippl {
                 // due to the matrix-free evaluation.
                 // Therefore, we need this if to differentiate
                 // the two cases.
-                if constexpr (std::is_same_v<InvDiagF, std::function<double(Field)>>) {
-                    g_m = inverse_diagonal_m(g_m) * g_m;
+                if constexpr (std::is_same_v<InvDiagF, std::function<double(Field&)>>) {
+                    result = inverse_diagonal_m(result) * result;
                 } else {
-                    g_m = inverse_diagonal_m(g_m).deepCopy();
+                    result = inverse_diagonal_m(result).deepCopy();
                 }
             }
-            return g_m;
         }
 
         void init_fields(Field& b) override {
             layout_type& layout = b.getLayout();
             mesh_type& mesh     = b.get_mesh();
-
-            ULg_m = Field(mesh, layout);
-            g_m = Field(mesh, layout);
+            if (!fields_initialized_m) {
+                ULg_m                = Field(mesh, layout);
+                fields_initialized_m = true;
+            } else {
+                ULg_m.updateLayout(layout);
+            }
         }
 
     protected:
@@ -340,7 +420,7 @@ namespace ippl {
         InvDiagF inverse_diagonal_m;
         unsigned innerloops_m;
         Field ULg_m;
-        Field g_m;
+        bool fields_initialized_m = false;
     };
 
     /*!
@@ -357,29 +437,26 @@ namespace ippl {
         using layout_type             = typename Field::Layout_t;
 
         richardson_preconditioner_alt(OperatorF&& op, InvDiagF&& inverse_diagonal,
-                                  unsigned innerloops = 5)
+                                      unsigned innerloops = 5)
             : preconditioner<Field>("Richardson_alt")
             , innerloops_m(innerloops) {
-            op_m  = std::move(op);
+            op_m               = std::move(op);
             inverse_diagonal_m = std::move(inverse_diagonal);
         }
 
-        Field operator()(Field& r) override {
-            // In the FEM solver, which uses the preconditioner, 
-            // we re-use a resultField to avoid allocating new
-            // memory at every iteration.
-            // In order for the operator calls to not rewrite
-            // on this same field over and over when calling 
-            // the operators (upper, diag, inverse, lower, etc)
-            // we need deep copies to the preconditioner fields.
+        void operator()(Field& r, Field& result) override {
+            // result holds the running iterate; Ag_m and g_old_m are scratch
+            // members. The inner deep copies remain because the operators
+            // (op_m, inverse_diagonal_m) may return Field-valued expressions
+            // that alias their input.
 
-            g_m = 0;
+            result  = 0;
             g_old_m = 0;
 
             for (unsigned int j = 0; j < innerloops_m; ++j) {
-                Ag_m = op_m(g_m);
-                Ag_m = Ag_m.deepCopy();
-                g_m  = r - Ag_m;
+                Ag_m   = op_m(result);
+                Ag_m   = Ag_m.deepCopy();
+                result = r - Ag_m;
 
                 // The inverse diagonal is applied to the
                 // vector itself to return the result usually.
@@ -388,23 +465,26 @@ namespace ippl {
                 // due to the matrix-free evaluation.
                 // Therefore, we need this if to differentiate
                 // the two cases.
-                if constexpr (std::is_same_v<InvDiagF, std::function<double(Field)>>) {
-                    g_m = g_old_m + inverse_diagonal_m(g_m) * g_m;
+                if constexpr (std::is_same_v<InvDiagF, std::function<double(Field&)>>) {
+                    result = g_old_m + inverse_diagonal_m(result) * result;
                 } else {
-                    g_m = g_old_m + inverse_diagonal_m(g_m);
+                    result = g_old_m + inverse_diagonal_m(result);
                 }
-                g_old_m = g_m.deepCopy();
+                Kokkos::deep_copy(g_old_m.getView(), result.getView());
             }
-            return g_m;
         }
 
         void init_fields(Field& b) override {
             layout_type& layout = b.getLayout();
             mesh_type& mesh     = b.get_mesh();
-
-            Ag_m = Field(mesh, layout);
-            g_m = Field(mesh, layout);
-            g_old_m = Field(mesh, layout);
+            if (!fields_initialized_m) {
+                Ag_m                 = Field(mesh, layout);
+                g_old_m              = Field(mesh, layout);
+                fields_initialized_m = true;
+            } else {
+                Ag_m.updateLayout(layout);
+                g_old_m.updateLayout(layout);
+            }
         }
 
     protected:
@@ -412,8 +492,8 @@ namespace ippl {
         InvDiagF inverse_diagonal_m;
         unsigned innerloops_m;
         Field Ag_m;
-        Field g_m;
         Field g_old_m;
+        bool fields_initialized_m = false;
     };
 
     /*!
@@ -435,30 +515,27 @@ namespace ippl {
             inverse_diagonal_m = std::move(inverse_diagonal);
         }
 
-        Field operator()(Field& b) override {
-            layout_type& layout = b.getLayout();
-            mesh_type& mesh     = b.get_mesh();
-
-            Field x(mesh, layout);
+        void operator()(Field& b, Field& result) override {
+            // The running iterate lives in result; UL_m and r_m are scratch
+            // members. The inner deep copies remain because the (upper, lower,
+            // inverse) operators may return Field-valued expressions that
+            // alias their input.
 
-            x = 0;  // Initial guess
-
-            // In the FEM solver, which uses the preconditioner, 
-            // we re-use a resultField to avoid allocating new
-            // memory at every iteration.
-            // In order for the operator calls to not rewrite
-            // on this same field over and over when calling 
-            // the operators (upper, diag, inverse, lower, etc)
-            // we need deep copies to the preconditioner fields.
+            result = 0;  // Initial guess
 
             for (unsigned int k = 0; k < outerloops_m; ++k) {
-                UL_m = upper_m(x);
+                UL_m = upper_m(result);
                 UL_m = UL_m.deepCopy();
                 r_m  = b - UL_m;
                 for (unsigned int j = 0; j < innerloops_m; ++j) {
-                    UL_m = lower_m(x);
-                    UL_m = UL_m.deepCopy();
-                    x    = r_m - UL_m;
+                    UL_m   = lower_m(result);
+                    UL_m   = UL_m.deepCopy();
+                    result = r_m - UL_m;
+                    if constexpr (std::is_same_v<InvDiagF, std::function<double(Field&)>>) {
+                        result = inverse_diagonal_m(result) * result;
+                    } else {
+                        result = inverse_diagonal_m(result).deepCopy();
+                    }
                     // The inverse diagonal is applied to the
                     // vector itself to return the result usually.
                     // However, the operator for FEM already
@@ -466,19 +543,19 @@ namespace ippl {
                     // due to the matrix-free evaluation.
                     // Therefore, we need this if to differentiate
                     // the two cases.
-                    if constexpr (std::is_same_v<InvDiagF, std::function<double(Field)>>) {
-                        x = inverse_diagonal_m(x) * x;
-                    } else {
-                        x = inverse_diagonal_m(x).deepCopy();
-                    }
                 }
-                UL_m = lower_m(x);
+                UL_m = lower_m(result);
                 UL_m = UL_m.deepCopy();
                 r_m  = b - UL_m;
                 for (unsigned int j = 0; j < innerloops_m; ++j) {
-                    UL_m = upper_m(x);
-                    UL_m = UL_m.deepCopy();
-                    x    = r_m - UL_m;
+                    UL_m   = upper_m(result);
+                    UL_m   = UL_m.deepCopy();
+                    result = r_m - UL_m;
+                    if constexpr (std::is_same_v<InvDiagF, std::function<double(Field&)>>) {
+                        result = inverse_diagonal_m(result) * result;
+                    } else {
+                        result = inverse_diagonal_m(result).deepCopy();
+                    }
                     // The inverse diagonal is applied to the
                     // vector itself to return the result usually.
                     // However, the operator for FEM already
@@ -486,22 +563,21 @@ namespace ippl {
                     // due to the matrix-free evaluation.
                     // Therefore, we need this if to differentiate
                     // the two cases.
-                    if constexpr (std::is_same_v<InvDiagF, std::function<double(Field)>>) {
-                        x = inverse_diagonal_m(x) * x;
-                    } else {
-                        x = inverse_diagonal_m(x).deepCopy();
-                    }
                 }
             }
-            return x;
         }
 
         void init_fields(Field& b) override {
             layout_type& layout = b.getLayout();
             mesh_type& mesh     = b.get_mesh();
-
-            UL_m = Field(mesh, layout);
-            r_m  = Field(mesh, layout);
+            if (!fields_initialized_m) {
+                UL_m                 = Field(mesh, layout);
+                r_m                  = Field(mesh, layout);
+                fields_initialized_m = true;
+            } else {
+                UL_m.updateLayout(layout);
+                r_m.updateLayout(layout);
+            }
         }
 
     protected:
@@ -512,6 +588,7 @@ namespace ippl {
         unsigned outerloops_m;
         Field UL_m;
         Field r_m;
+        bool fields_initialized_m = false;
     };
 
     /*!
@@ -536,29 +613,12 @@ namespace ippl {
             diagonal_m         = std::move(diagonal);
         }
 
-        Field operator()(Field& b) override {
-            static IpplTimings::TimerRef initTimer = IpplTimings::getTimer("SSOR Init");
-            IpplTimings::startTimer(initTimer);
-
-            double D;
-
-            layout_type& layout = b.getLayout();
-            mesh_type& mesh     = b.get_mesh();
-
-            Field x(mesh, layout);
-
-            x = 0;  // Initial guess
-
-            IpplTimings::stopTimer(initTimer);
-
-            static IpplTimings::TimerRef loopTimer = IpplTimings::getTimer("SSOR loop");
-            IpplTimings::startTimer(loopTimer);
-
-            // In the FEM solver, which uses the preconditioner, 
+        void operator()(Field& b, Field& result) override {
+            // In the FEM solver, which uses the preconditioner,
             // we re-use a resultField to avoid allocating new
             // memory at every iteration.
             // In order for the operator calls to not rewrite
-            // on this same field over and over when calling 
+            // on this same field over and over when calling
             // the operators (upper, diag, inverse, lower, etc)
             // we need deep copies to the preconditioner fields.
 
@@ -569,53 +629,65 @@ namespace ippl {
             // due to the matrix-free evaluation.
             // Therefore, we need this if to differentiate
             // the two cases.
+
+            double D;
+
+            // In order for the operator calls to not rewrite on this same field
+            // over and over when calling the operators (upper, diag, inverse,
+            // lower, etc) we need deep copies to the preconditioner fields."
+
+            result = 0;  // Initial guess
+
             for (unsigned int k = 0; k < outerloops_m; ++k) {
-                if constexpr (std::is_same_v<InvDiagF, std::function<double(Field)>>) {
-                    UL_m = upper_m(x);
-                    D    = diagonal_m(x);
-                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * D * x;
+                if constexpr (std::is_same_v<InvDiagF, std::function<double(Field&)>>) {
+                    UL_m = upper_m(result);
+                    D    = diagonal_m(result);
+                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * D * result;
 
                     for (unsigned int j = 0; j < innerloops_m; ++j) {
-                        UL_m = lower_m(x);
-                        x    = r_m - omega_m * UL_m;
-                        x    = inverse_diagonal_m(x) * x;
+                        UL_m   = lower_m(result);
+                        result = r_m - omega_m * UL_m;
+                        result = inverse_diagonal_m(result) * result;
                     }
-                    UL_m = lower_m(x);
-                    D    = diagonal_m(x);
-                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * D * x;
+                    UL_m = lower_m(result);
+                    D    = diagonal_m(result);
+                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * D * result;
                     for (unsigned int j = 0; j < innerloops_m; ++j) {
-                        UL_m = upper_m(x);
-                        x    = r_m - omega_m * UL_m;
-                        x    = inverse_diagonal_m(x) * x;
+                        UL_m   = upper_m(result);
+                        result = r_m - omega_m * UL_m;
+                        result = inverse_diagonal_m(result) * result;
                     }
                 } else {
-                    UL_m = upper_m(x).deepCopy();
-                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(x);
+                    UL_m = upper_m(result).deepCopy();
+                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(result);
 
                     for (unsigned int j = 0; j < innerloops_m; ++j) {
-                        UL_m = lower_m(x).deepCopy();
-                        x    = r_m - omega_m * UL_m;
-                        x    = inverse_diagonal_m(x).deepCopy();
+                        UL_m   = lower_m(result).deepCopy();
+                        result = r_m - omega_m * UL_m;
+                        result = inverse_diagonal_m(result).deepCopy();
                     }
-                    UL_m = lower_m(x).deepCopy();
-                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(x);
+                    UL_m = lower_m(result).deepCopy();
+                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(result);
                     for (unsigned int j = 0; j < innerloops_m; ++j) {
-                        UL_m = upper_m(x).deepCopy();
-                        x    = r_m - omega_m * UL_m;
-                        x    = inverse_diagonal_m(x).deepCopy();
+                        UL_m   = upper_m(result).deepCopy();
+                        result = r_m - omega_m * UL_m;
+                        result = inverse_diagonal_m(result).deepCopy();
                     }
                 }
             }
-            IpplTimings::stopTimer(loopTimer);
-            return x;
         }
 
         void init_fields(Field& b) override {
             layout_type& layout = b.getLayout();
             mesh_type& mesh     = b.get_mesh();
-
-            UL_m = Field(mesh, layout);
-            r_m  = Field(mesh, layout);
+            if (!fields_initialized_m) {
+                UL_m                 = Field(mesh, layout);
+                r_m                  = Field(mesh, layout);
+                fields_initialized_m = true;
+            } else {
+                UL_m.updateLayout(layout);
+                r_m.updateLayout(layout);
+            }
         }
 
     protected:
@@ -628,6 +700,7 @@ namespace ippl {
         double omega_m;
         Field UL_m;
         Field r_m;
+        bool fields_initialized_m = false;
     };
 
     /*!
diff --git a/src/PoissonSolvers/PoissonCG.h b/src/PoissonSolvers/PoissonCG.h
index 7c32f4257..db4c2d1dd 100644
--- a/src/PoissonSolvers/PoissonCG.h
+++ b/src/PoissonSolvers/PoissonCG.h
@@ -11,13 +11,12 @@
 #include "Poisson.h"
 namespace ippl {
 
-// Expands to a lambda that acts as a wrapper for a differential operator
-// fun: the function for which to create the wrapper, such as ippl::laplace
-// type: the argument type, which should match the LHS type for the solver
-#define IPPL_SOLVER_OPERATOR_WRAPPER(fun, type) \
-    [](type arg) {                              \
-        return fun(arg);                        \
-    }
+// IPPL_SOLVER_OPERATOR_WRAPPER is defined once in LinearSolvers/Preconditioner.h
+// (re-exported through this header via PCG.h). Defining it again here used to
+// silently shadow that definition with a by-value lambda, which copies the
+// Field on every op_m() call and reintroduces the per-iteration cudaMalloc
+// in the halo exchange (the realloc never propagates back to the original
+// Field). Keep a single by-reference definition.
 
     template <typename FieldLHS, typename FieldRHS = FieldLHS>
     class PoissonCG : public Poisson<FieldLHS, FieldRHS> {