diff --git a/alpine/BumponTailInstabilityManager.h b/alpine/BumponTailInstabilityManager.h index 7525f9af1..5e03a0712 100644 --- a/alpine/BumponTailInstabilityManager.h +++ b/alpine/BumponTailInstabilityManager.h @@ -274,10 +274,10 @@ class BumponTailInstabilityManager : public AlpineManager { this->pcontainer_m->create(nlocal); - view_type* R = &(this->pcontainer_m->R.getView()); - samplingR.generate(*R, rand_pool64); + view_type R = this->pcontainer_m->R.getView(); + samplingR.generate(R, rand_pool64); - view_type* P = &(this->pcontainer_m->P.getView()); + view_type P = this->pcontainer_m->P.getView(); double mu[Dim]; double sd[Dim]; @@ -288,12 +288,12 @@ class BumponTailInstabilityManager : public AlpineManager { // sample first nlocBulk with muBulk as mean velocity mu[Dim - 1] = muBulk_m; Kokkos::parallel_for(Kokkos::RangePolicy(0, nlocBulk), - ippl::random::randn(*P, rand_pool64, mu, sd)); + ippl::random::randn(P, rand_pool64, mu, sd)); // sample remaining with muBeam as mean velocity mu[Dim - 1] = muBeam_m; Kokkos::parallel_for(Kokkos::RangePolicy(nlocBulk, nlocal), - ippl::random::randn(*P, rand_pool64, mu, sd)); + ippl::random::randn(P, rand_pool64, mu, sd)); Kokkos::fence(); ippl::Comm->barrier(); diff --git a/alpine/LandauDampingManager.h b/alpine/LandauDampingManager.h index b8491c7a5..23dcc632a 100644 --- a/alpine/LandauDampingManager.h +++ b/alpine/LandauDampingManager.h @@ -226,10 +226,10 @@ class LandauDampingManager : public AlpineManager { this->pcontainer_m->create(nlocal); - view_type* R = &(this->pcontainer_m->R.getView()); - samplingR.generate(*R, rand_pool64); + view_type R = this->pcontainer_m->R.getView(); + samplingR.generate(R, rand_pool64); - view_type* P = &(this->pcontainer_m->P.getView()); + view_type P = this->pcontainer_m->P.getView(); double mu[Dim]; double sd[Dim]; @@ -237,7 +237,7 @@ class LandauDampingManager : public AlpineManager { mu[i] = 0.0; sd[i] = 1.0; } - Kokkos::parallel_for(nlocal, ippl::random::randn(*P, rand_pool64, mu, sd)); + Kokkos::parallel_for(nlocal, ippl::random::randn(P, rand_pool64, mu, sd)); Kokkos::fence(); ippl::Comm->barrier(); diff --git a/alpine/PenningTrapManager.h b/alpine/PenningTrapManager.h index b721f3ec8..c7e81de43 100644 --- a/alpine/PenningTrapManager.h +++ b/alpine/PenningTrapManager.h @@ -202,14 +202,14 @@ class PenningTrapManager : public AlpineManager { this->pcontainer_m->create(nlocal); - view_type* R = &(this->pcontainer_m->R.getView()); - samplingR.generate(*R, rand_pool64); + view_type R = this->pcontainer_m->R.getView(); + samplingR.generate(R, rand_pool64); - view_type* P = &(this->pcontainer_m->P.getView()); + view_type P = this->pcontainer_m->P.getView(); double muP[Dim] = {0.0, 0.0, 0.0}; double sdP[Dim] = {1.0, 1.0, 1.0}; - Kokkos::parallel_for(nlocal, ippl::random::randn(*P, rand_pool64, muP, sdP)); + Kokkos::parallel_for(nlocal, ippl::random::randn(P, rand_pool64, muP, sdP)); Kokkos::fence(); ippl::Comm->barrier(); diff --git a/src/LinearSolvers/PCG.h b/src/LinearSolvers/PCG.h index bdb65778d..c02c6e60b 100644 --- a/src/LinearSolvers/PCG.h +++ b/src/LinearSolvers/PCG.h @@ -6,9 +6,9 @@ #ifndef IPPL_PCG_H #define IPPL_PCG_H +#include "FEM/FEMVector.h" #include "Preconditioner.h" #include "SolverAlgorithm.h" -#include "FEM/FEMVector.h" namespace ippl { template ; - using LowerF = std::function; - using UpperF = std::function; - using UpperLowerF = std::function; - using InverseDiagF = std::function; - using DiagF = std::function; + using OperatorF = std::function; + using LowerF = std::function; + using UpperF = std::function; + using UpperLowerF = std::function; + using InverseDiagF = std::function; + using DiagF = std::function; using mesh_type = typename lhs_type::Mesh_t; using layout_type = typename lhs_type::Layout_t; @@ -75,9 +75,8 @@ namespace ippl { // set in main [[maybe_unused]] int outer = 1, // This is a dummy default parameter, actual default parameter should be - [[maybe_unused]] double omega = - 1 // This is a dummy default parameter, actual default parameter should be - // set in main + [[maybe_unused]] double omega = 1 // This is a dummy default parameter, actual default + // parameter should be set in main ) {} /*! * Query how many iterations were required to obtain the solution @@ -90,10 +89,10 @@ namespace ippl { const ParameterList& params) override { constexpr unsigned Dim = lhs_type::dim; - static IpplTimings::TimerRef cg_ops = IpplTimings::getTimer("CG"); + static IpplTimings::TimerRef cg_ops = IpplTimings::getTimer("CG"); static IpplTimings::TimerRef up_layout = IpplTimings::getTimer("updateLayout"); - static IpplTimings::TimerRef apply = IpplTimings::getTimer("applyOp"); - static IpplTimings::TimerRef inner = IpplTimings::getTimer("innerProduct"); + static IpplTimings::TimerRef apply = IpplTimings::getTimer("applyOp"); + static IpplTimings::TimerRef inner = IpplTimings::getTimer("innerProduct"); IpplTimings::startTimer(cg_ops); @@ -138,14 +137,13 @@ namespace ippl { d.setFieldBC(bc); IpplTimings::startTimer(inner); - T delta1 = innerProduct(r, d); + T delta1 = innerProduct(r, d); IpplTimings::stopTimer(inner); T delta0 = delta1; residueNorm = Kokkos::sqrt(delta1); const T tolerance = params.get("tolerance") * norm(rhs); while (iterations_m < maxIterations && residueNorm > tolerance) { - IpplTimings::startTimer(apply); q = op_m(d); IpplTimings::stopTimer(apply); @@ -153,7 +151,7 @@ namespace ippl { IpplTimings::startTimer(inner); T alpha = delta1 / innerProduct(d, q); IpplTimings::stopTimer(inner); - lhs = lhs + alpha * d; + lhs = lhs + alpha * d; // The exact residue is given by // r = rhs - op_m(lhs); @@ -188,26 +186,28 @@ namespace ippl { T residueNorm = 0; int iterations_m = 0; - private: + // Workspaces, allocated once via initializeFields() and reused across + // solves. Protected so derived solvers (e.g. PCG) can extend the + // workspace set without redeclaring r, d, q as locals on every + // operator() call. lhs_type r; lhs_type d; lhs_type q; }; - template - class CG, FEMVector > - : public SolverAlgorithm, FEMVector> { + class CG, + FEMVector> : public SolverAlgorithm, FEMVector> { using Base = SolverAlgorithm, FEMVector>; public: using typename Base::lhs_type, typename Base::rhs_type; - using OperatorF = std::function; - using LowerF = std::function; - using UpperF = std::function; - using UpperLowerF = std::function; - using InverseDiagF = std::function; + using OperatorF = std::function; + using LowerF = std::function; + using UpperF = std::function; + using UpperLowerF = std::function; + using InverseDiagF = std::function; virtual ~CG() = default; @@ -249,13 +249,12 @@ namespace ippl { * @return Iteration count of last solve */ virtual int getIterationCount() { return iterations_m; } - + virtual void operator()(lhs_type& lhs, rhs_type& rhs, const ParameterList& params) override { - - //constexpr unsigned Dim = lhs_type::dim; - //typename lhs_type::Mesh_t& mesh = lhs.get_mesh(); - //typename lhs_type::Layout_t& layout = lhs.getLayout(); + // constexpr unsigned Dim = lhs_type::dim; + // typename lhs_type::Mesh_t& mesh = lhs.get_mesh(); + // typename lhs_type::Layout_t& layout = lhs.getLayout(); iterations_m = 0; const int maxIterations = params.get("max_iterations"); @@ -263,23 +262,21 @@ namespace ippl { // Variable names mostly based on description in // https://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf lhs_type r = lhs.deepCopy(); - r = 0; + r = 0; lhs_type d = lhs.deepCopy(); - d = 0; - - + d = 0; + r = rhs - op_m(lhs); r.setHalo(0); - d = r; //.deepCopy(); - //d.setFieldBC(bc); + d = r; //.deepCopy(); + // d.setFieldBC(bc); T delta1 = innerProduct(r, d); T delta0 = delta1; residueNorm = Kokkos::sqrt(delta1); const T tolerance = params.get("tolerance") * norm(rhs); lhs_type q = lhs.deepCopy(); - q = 0; - + q = 0; while (iterations_m < maxIterations && residueNorm > tolerance) { q = op_m(d); @@ -303,8 +300,6 @@ namespace ippl { residueNorm = Kokkos::sqrt(delta1); d = r + beta * d; ++iterations_m; - - } } @@ -316,7 +311,6 @@ namespace ippl { int iterations_m = 0; }; - template @@ -327,17 +321,27 @@ namespace ippl { public: using typename Base::lhs_type, typename Base::rhs_type; - using OperatorF = std::function; - using LowerF = std::function; - using UpperF = std::function; - using UpperLowerF = std::function; - using InverseDiagF = std::function; - using DiagF = std::function; + using OperatorF = std::function; + using LowerF = std::function; + using UpperF = std::function; + using UpperLowerF = std::function; + using InverseDiagF = std::function; + using DiagF = std::function; + + using mesh_type = typename lhs_type::Mesh_t; + using layout_type = typename lhs_type::Layout_t; PCG() : CG() - , preconditioner_m(nullptr){}; + , preconditioner_m(nullptr) {}; + + void initializeFields(mesh_type& mesh, layout_type& layout) override { + CG::initializeFields(mesh, layout); + s.initialize(mesh, layout); + pcond_out.initialize(mesh, layout); + } /*! * Sets the differential operator for the conjugate gradient algorithm @@ -365,8 +369,9 @@ namespace ippl { // set in main double omega = 1.57079632679 // This is a dummy default parameter, actual default // parameter should be set in main - // default = pi/2 as this was found optimal during hyperparameter scan for test case - // (see https://amas.web.psi.ch/people/aadelmann/ETH-Accel-Lecture-1/projectscompleted/cse/BSc-mbolliger.pdf) + // default = pi/2 as this was found optimal during hyperparameter scan for test case + // (see + // https://amas.web.psi.ch/people/aadelmann/ETH-Accel-Lecture-1/projectscompleted/cse/BSc-mbolliger.pdf) ) override { if (preconditioner_type == "jacobi") { // Turn on damping parameter @@ -396,8 +401,7 @@ namespace ippl { preconditioner_m = std::move(std::make_unique< richardson_preconditioner_alt>( - std::move(op), std::move(inverse_diagonal), - richardson_iterations)); + std::move(op), std::move(inverse_diagonal), richardson_iterations)); } else if (preconditioner_type == "gauss-seidel") { preconditioner_m = std::move( std::make_unique>( @@ -422,19 +426,23 @@ namespace ippl { "Preconditioner has not been set for PCG solver"); } - typename lhs_type::Mesh_t& mesh = lhs.get_mesh(); - typename lhs_type::Layout_t& layout = lhs.getLayout(); - this->iterations_m = 0; const int maxIterations = params.get("max_iterations"); // Variable names mostly based on description in // https://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf - lhs_type r(mesh, layout); - lhs_type d(mesh, layout); - lhs_type s(mesh, layout); - lhs_type q(mesh, layout); - + // Field layouts are updated such that we don't keep a stale domain decomposition + // if the lhs layout has been changed during the simulation, for example by the load + // balancer. + this->r.updateLayout(lhs.getLayout()); + this->d.updateLayout(lhs.getLayout()); + s.updateLayout(lhs.getLayout()); + pcond_out.updateLayout(lhs.getLayout()); + this->q.updateLayout(lhs.getLayout()); + + // Each preconditioner's init_fields() is responsible for being + // cheap on the steady-state path (refreshing layout in-place, not + // reallocating). preconditioner_m->init_fields(lhs); using bc_type = BConds; @@ -459,20 +467,22 @@ namespace ippl { } } - r = rhs - this->op_m(lhs); - d = preconditioner_m->operator()(r).deepCopy(); - d.setFieldBC(bc); + this->r = rhs - this->op_m(lhs); + (*preconditioner_m)(this->r, pcond_out); + this->d = T(1) * pcond_out; + this->d.setFieldBC(bc); - T delta1 = innerProduct(r, d); + T delta1 = innerProduct(this->r, this->d); T delta0 = delta1; this->residueNorm = Kokkos::sqrt(Kokkos::abs(delta1)); const T tolerance = params.get("tolerance") * this->residueNorm; - while (this->iterations_m < maxIterations && this->residueNorm > tolerance) { - q = this->op_m(d); - q = q.deepCopy(); - T alpha = delta1 / innerProduct(d, q); - lhs = lhs + alpha * d; + while (this->iterations_mresidueNorm> tolerance) { + // op_m(d) writes its expression into q's existing storage; no + // allocation, no per-iteration deep copy. + this->q = this->op_m(this->d); + T alpha = delta1 / innerProduct(this->d, this->q); + lhs = lhs + alpha * this->d; // The exact residue is given by // r = rhs - BaseCG::op_m(lhs); @@ -481,16 +491,17 @@ namespace ippl { // the correction does not have a significant effect on accuracy; // in some implementations, the correction may be applied every few // iterations to offset accumulated floating point errors - r = r - alpha * q; - s = preconditioner_m->operator()(r).deepCopy(); + this->r = this->r - alpha * this->q; + // s := M^{-1} r; preconditioner writes into s (NoBcFace). + (*preconditioner_m)(this->r, s); delta0 = delta1; - delta1 = innerProduct(r, s); + delta1 = innerProduct(this->r, s); T beta = delta1 / delta0; this->residueNorm = Kokkos::sqrt(Kokkos::abs(delta1)); - d = s + beta * d; + this->d = s + beta * this->d; ++this->iterations_m; } @@ -502,6 +513,20 @@ namespace ippl { protected: std::unique_ptr> preconditioner_m; + + /* + * Extends the CG workspace with the preconditioner result buffers s and + * pcond_out so operator() does not allocate per solve. + * Preconditioner result buffers, allocated once via initializeFields() + * pcond_out keeps its default NoBcFace BCs: the preconditioner's internal operator + * chain therefore never triggers PeriodicFace::apply MPI calls -- if it + * did, the global MPI sequence would diverge from the master code path + * (where the preconditioner returned a fresh NoBcFace field) and + * intermittent multi-rank halo deadlocks would follow. + */ + + lhs_type s; + lhs_type pcond_out; }; }; // namespace ippl diff --git a/src/LinearSolvers/Preconditioner.h b/src/LinearSolvers/Preconditioner.h index 09eb876aa..2fda7c4cc 100644 --- a/src/LinearSolvers/Preconditioner.h +++ b/src/LinearSolvers/Preconditioner.h @@ -7,13 +7,23 @@ #ifndef IPPL_PRECONDITIONER_H #define IPPL_PRECONDITIONER_H +#include + #include "Expression/IpplOperations.h" // get the function apply() // Expands to a lambda that acts as a wrapper for a differential operator // fun: the function for which to create the wrapper, such as ippl::laplace // type: the argument type, which should match the LHS type for the solver +// +// IMPORTANT: takes the field by *reference*, not by value. A by-value +// signature would copy the Field on every call -- BareField has shared- +// ownership Kokkos::View members, so the data isn't actually copied, but +// the embedded HaloCells::haloData_m buffer that the halo exchange grows +// via Kokkos::realloc only grows on the copy and never propagates back to +// the original. The result was a fresh cudaMalloc per op_m() call in +// multi-rank runs (see nsys profile in ~/prof/pif-pr-verify). #define IPPL_SOLVER_OPERATOR_WRAPPER(fun, type) \ - [](type arg) { \ + [](type& arg) { \ return fun(arg); \ } @@ -32,17 +42,18 @@ namespace ippl { virtual ~preconditioner() = default; - // Placeholder for the function operator, actually implemented in the derived classes - virtual Field operator()(Field& u) { - Field res = u.deepCopy(); - return res; + // Apply the preconditioner: result = M^{-1} u. Concrete preconditioners + // override this to write into the caller-provided result buffer; this + // avoids per-call Field allocations and per-call deep copies in PCG. + // The default (identity) preconditioner copies u into result. + virtual void operator()(Field& u, Field& result) { + Kokkos::deep_copy(result.getView(), u.getView()); } - // Placeholder for setting additional fields, actually implemented in the derived classes - virtual void init_fields(Field& b) { - Field res = b.deepCopy(); - return; - } + // Allocate any scratch fields the preconditioner needs. Called once + // by the owning solver after the layout is known. Concrete + // preconditioners that need scratch override this. + virtual void init_fields(Field& /*b*/) {} std::string get_type() { return type_m; }; @@ -66,14 +77,23 @@ namespace ippl { inverse_diagonal_m = std::move(inverse_diagonal); } - Field operator()(Field& u) override { - mesh_type& mesh = u.get_mesh(); - layout_type& layout = u.getLayout(); - Field res(mesh, layout); - - res = inverse_diagonal_m(u); - res = w_m * res; - return res; + void operator()(Field& u, Field& result) override { + // result = w * D^{-1} * u, written element-wise into the caller- + // provided result buffer. Two forms of inverse_diagonal_m are + // supported, matching the convention used by the other + // preconditioners in this file: + // - returns a scalar (e.g. uniform-mesh Poisson Laplacian): + // result = w * scalar * u + // - returns a Field / expression equal to D^{-1} * u + // (e.g. matrix-free FEM operators): + // result = w * inverse_diagonal_m(u) + if constexpr (std::is_same_v>) { + const double scale = w_m * inverse_diagonal_m(u); + result = scale * u; + } else { + result = inverse_diagonal_m(u); + result = w_m * result; + } } protected: @@ -125,12 +145,47 @@ namespace ippl { return *this = polynomial_newton_preconditioner(other); } - Field recursive_preconditioner(Field& u, unsigned int level) { - mesh_type& mesh = u.get_mesh(); - layout_type& layout = u.getLayout(); - // Define etas if not defined yet + // Recursive Newton expansion P_k(u) where: + // P_0(u) = eta_0 * u + // P_k(u) = eta_k * (2 P_{k-1}(u) - P_{k-1}(A P_{k-1}(u))) + // Writes the result of level `level` into `out`. Uses one scratch + // field per recursion depth (Pr, PA, PAPr) so that no Field is + // allocated per call. The two recursive calls at each level execute + // sequentially and may reuse scratch at lower depths because the + // first call's lower-depth values have already been folded into + // Pr_scratch[level] before the second call begins. + void recursive_preconditioner(Field& u, unsigned int level, Field& out) { + // Timers accumulated across all levels of the recursion: useful + // for "how much of pcond is laplace vs combine vs leaf-assign". + // Counts include every call at every level (so the call-count + // column equals 2^(level_m+1)-1, not the number of operator() + // invocations). + if (level == 0) { + out = eta_m[0] * u; + return; + } + Field& Pr = Pr_scratch_m[level]; + Field& PA = PA_scratch_m[level]; + Field& PAPr = PAPr_scratch_m[level]; + + // Inner recursive call owns its own timing; we don't double-count + // it here. + recursive_preconditioner(u, level - 1, Pr); + PA = op_m(Pr); + recursive_preconditioner(PA, level - 1, PAPr); + out = eta_m[level] * (2.0 * Pr - PAPr); + } + + void operator()(Field& u, Field& result) override { + // One outer timer per polynomial_newton apply, complementing the + // per-call recursion timers. Lets us see how many outer pcond + // calls a solve does vs. the time inside each. + recursive_preconditioner(u, level_m, result); + } + + void init_fields(Field& b) override { + // One-shot precomputation of the eta coefficients. if (eta_m == nullptr) { - // Precompute the etas for later use eta_m = new double[level_m + 1]; eta_m[0] = 2.0 / ((alpha_m + beta_m) * (1.0 + zeta_m)); if (level_m > 0) { @@ -143,25 +198,31 @@ namespace ippl { } } - Field res(mesh, layout); - // Base case - if (level == 0) { - res = eta_m[0] * u; - return res; + // First call: allocate one scratch field per recursion depth. + // Subsequent calls: refresh layout in place so a load-balance + // repartition tracks correctly without freeing/reallocating + // when the local extents are unchanged. + mesh_type& mesh = b.get_mesh(); + layout_type& layout = b.getLayout(); + if (!fields_initialized_m) { + Pr_scratch_m.resize(level_m + 1); + PA_scratch_m.resize(level_m + 1); + PAPr_scratch_m.resize(level_m + 1); + for (unsigned int i = 1; i <= level_m; ++i) { + Pr_scratch_m[i] = Field(mesh, layout); + PA_scratch_m[i] = Field(mesh, layout); + PAPr_scratch_m[i] = Field(mesh, layout); + } + fields_initialized_m = true; + } else { + for (unsigned int i = 1; i <= level_m; ++i) { + Pr_scratch_m[i].updateLayout(layout); + PA_scratch_m[i].updateLayout(layout); + PAPr_scratch_m[i].updateLayout(layout); + } } - // Recursive case - Field PAPr(mesh, layout); - Field Pr(mesh, layout); - - Pr = recursive_preconditioner(u, level - 1); - PAPr = op_m(Pr); - PAPr = recursive_preconditioner(PAPr, level - 1); - res = eta_m[level] * (2.0 * Pr - PAPr); - return res; } - Field operator()(Field& u) override { return recursive_preconditioner(u, level_m); } - protected: OperatorF op_m; // Operator to be preconditioned double alpha_m; // Smallest Eigenvalue @@ -170,6 +231,10 @@ namespace ippl { double zeta_m; // smallest (alpha + beta) is multiplied by (1+zeta) to avoid clustering of // Eigenvalues double* eta_m = nullptr; // Size is determined at runtime + std::vector Pr_scratch_m; + std::vector PA_scratch_m; + std::vector PAPr_scratch_m; + bool fields_initialized_m = false; }; /*! @@ -219,19 +284,37 @@ namespace ippl { return *this = polynomial_chebyshev_preconditioner(other); } - Field operator()(Field& r) override { - mesh_type& mesh = r.get_mesh(); - layout_type& layout = r.getLayout(); + void operator()(Field& r, Field& result) override { + // x_m, x_old_m, A_m, z_m are pre-allocated scratch (init_fields). + // Coefficients in rho_m are also computed once. + x_old_m = r / theta_m; + A_m = op_m(r); + x_m = 2.0 * rho_m[1] / delta_m * (2.0 * r - A_m / theta_m); + if (degree_m == 0) { + // result = x_old + Kokkos::deep_copy(result.getView(), x_old_m.getView()); + return; + } - Field res(mesh, layout); - Field x(mesh, layout); - Field x_old(mesh, layout); - Field A(mesh, layout); - Field z(mesh, layout); + if (degree_m == 1) { + // result = x + Kokkos::deep_copy(result.getView(), x_m.getView()); + return; + } + for (unsigned int i = 2; i < degree_m + 1; ++i) { + A_m = op_m(x_m); + z_m = 2.0 / delta_m * (r - A_m); + // Write the new x value into result (the caller's buffer); + // x_old gets a deep copy of the previous x. + result = rho_m[i] * (2 * sigma_m * x_m - rho_m[i - 1] * x_old_m + z_m); + Kokkos::deep_copy(x_old_m.getView(), x_m.getView()); + Kokkos::deep_copy(x_m.getView(), result.getView()); + } + } - // Precompute the coefficients if not done yet + void init_fields(Field& b) override { + // One-shot precomputation of the rho coefficients. if (rho_m == nullptr) { - // Start precomputing the coefficients theta_m = (beta_m + alpha_m) / 2.0 * (1.0 + zeta_m); delta_m = (beta_m - alpha_m) / 2.0; sigma_m = theta_m / delta_m; @@ -241,29 +324,23 @@ namespace ippl { for (unsigned int i = 1; i < degree_m + 1; ++i) { rho_m[i] = 1.0 / (2.0 * sigma_m - rho_m[i - 1]); } - } // End of precomputing the coefficients - - res = r.deepCopy(); - - x_old = r / theta_m; - A = op_m(r); - x = 2.0 * rho_m[1] / delta_m * (2.0 * r - A / theta_m); - - if (degree_m == 0) { - return x_old; - } - - if (degree_m == 1) { - return x; } - for (unsigned int i = 2; i < degree_m + 1; ++i) { - A = op_m(x); - z = 2.0 / delta_m * (r - A); - res = rho_m[i] * (2 * sigma_m * x - rho_m[i - 1] * x_old + z); - x_old = x.deepCopy(); - x = res.deepCopy(); + mesh_type& mesh = b.get_mesh(); + layout_type& layout = b.getLayout(); + // First call allocates; subsequent calls refresh the layout so a + // repartition is tracked without throwing the storage away. + if (!fields_initialized_m) { + x_m = Field(mesh, layout); + x_old_m = Field(mesh, layout); + A_m = Field(mesh, layout); + z_m = Field(mesh, layout); + fields_initialized_m = true; + } else { + x_m.updateLayout(layout); + x_old_m.updateLayout(layout); + A_m.updateLayout(layout); + z_m.updateLayout(layout); } - return res; } protected: @@ -276,6 +353,11 @@ namespace ippl { unsigned degree_m; double zeta_m; double* rho_m = nullptr; // Size is determined at runtime + Field x_m; + Field x_old_m; + Field A_m; + Field z_m; + bool fields_initialized_m = false; }; /*! @@ -295,22 +377,18 @@ namespace ippl { inverse_diagonal_m = std::move(inverse_diagonal); } - Field operator()(Field& r) override { + void operator()(Field& r, Field& result) override { + // Richardson iteration in-place on the caller-provided result + // buffer. ULg_m stays as a member scratch; the inner deep copies + // remain because the (upper, diag, inverse, lower) operators may + // return Field-valued expressions that alias their input. - // In the FEM solver, which uses the preconditioner, - // we re-use a resultField to avoid allocating new - // memory at every iteration. - // In order for the operator calls to not rewrite - // on this same field over and over when calling - // the operators (upper, diag, inverse, lower, etc) - // we need deep copies to the preconditioner fields. - - g_m = 0; + result = 0; for (unsigned int j = 0; j < innerloops_m; ++j) { - ULg_m = upper_and_lower_m(g_m); - ULg_m = ULg_m.deepCopy(); - g_m = r - ULg_m; - + ULg_m = upper_and_lower_m(result); + ULg_m = ULg_m.deepCopy(); + result = r - ULg_m; + // The inverse diagonal is applied to the // vector itself to return the result usually. // However, the operator for FEM already @@ -318,21 +396,23 @@ namespace ippl { // due to the matrix-free evaluation. // Therefore, we need this if to differentiate // the two cases. - if constexpr (std::is_same_v>) { - g_m = inverse_diagonal_m(g_m) * g_m; + if constexpr (std::is_same_v>) { + result = inverse_diagonal_m(result) * result; } else { - g_m = inverse_diagonal_m(g_m).deepCopy(); + result = inverse_diagonal_m(result).deepCopy(); } } - return g_m; } void init_fields(Field& b) override { layout_type& layout = b.getLayout(); mesh_type& mesh = b.get_mesh(); - - ULg_m = Field(mesh, layout); - g_m = Field(mesh, layout); + if (!fields_initialized_m) { + ULg_m = Field(mesh, layout); + fields_initialized_m = true; + } else { + ULg_m.updateLayout(layout); + } } protected: @@ -340,7 +420,7 @@ namespace ippl { InvDiagF inverse_diagonal_m; unsigned innerloops_m; Field ULg_m; - Field g_m; + bool fields_initialized_m = false; }; /*! @@ -357,29 +437,26 @@ namespace ippl { using layout_type = typename Field::Layout_t; richardson_preconditioner_alt(OperatorF&& op, InvDiagF&& inverse_diagonal, - unsigned innerloops = 5) + unsigned innerloops = 5) : preconditioner("Richardson_alt") , innerloops_m(innerloops) { - op_m = std::move(op); + op_m = std::move(op); inverse_diagonal_m = std::move(inverse_diagonal); } - Field operator()(Field& r) override { - // In the FEM solver, which uses the preconditioner, - // we re-use a resultField to avoid allocating new - // memory at every iteration. - // In order for the operator calls to not rewrite - // on this same field over and over when calling - // the operators (upper, diag, inverse, lower, etc) - // we need deep copies to the preconditioner fields. + void operator()(Field& r, Field& result) override { + // result holds the running iterate; Ag_m and g_old_m are scratch + // members. The inner deep copies remain because the operators + // (op_m, inverse_diagonal_m) may return Field-valued expressions + // that alias their input. - g_m = 0; + result = 0; g_old_m = 0; for (unsigned int j = 0; j < innerloops_m; ++j) { - Ag_m = op_m(g_m); - Ag_m = Ag_m.deepCopy(); - g_m = r - Ag_m; + Ag_m = op_m(result); + Ag_m = Ag_m.deepCopy(); + result = r - Ag_m; // The inverse diagonal is applied to the // vector itself to return the result usually. @@ -388,23 +465,26 @@ namespace ippl { // due to the matrix-free evaluation. // Therefore, we need this if to differentiate // the two cases. - if constexpr (std::is_same_v>) { - g_m = g_old_m + inverse_diagonal_m(g_m) * g_m; + if constexpr (std::is_same_v>) { + result = g_old_m + inverse_diagonal_m(result) * result; } else { - g_m = g_old_m + inverse_diagonal_m(g_m); + result = g_old_m + inverse_diagonal_m(result); } - g_old_m = g_m.deepCopy(); + Kokkos::deep_copy(g_old_m.getView(), result.getView()); } - return g_m; } void init_fields(Field& b) override { layout_type& layout = b.getLayout(); mesh_type& mesh = b.get_mesh(); - - Ag_m = Field(mesh, layout); - g_m = Field(mesh, layout); - g_old_m = Field(mesh, layout); + if (!fields_initialized_m) { + Ag_m = Field(mesh, layout); + g_old_m = Field(mesh, layout); + fields_initialized_m = true; + } else { + Ag_m.updateLayout(layout); + g_old_m.updateLayout(layout); + } } protected: @@ -412,8 +492,8 @@ namespace ippl { InvDiagF inverse_diagonal_m; unsigned innerloops_m; Field Ag_m; - Field g_m; Field g_old_m; + bool fields_initialized_m = false; }; /*! @@ -435,30 +515,27 @@ namespace ippl { inverse_diagonal_m = std::move(inverse_diagonal); } - Field operator()(Field& b) override { - layout_type& layout = b.getLayout(); - mesh_type& mesh = b.get_mesh(); - - Field x(mesh, layout); + void operator()(Field& b, Field& result) override { + // The running iterate lives in result; UL_m and r_m are scratch + // members. The inner deep copies remain because the (upper, lower, + // inverse) operators may return Field-valued expressions that + // alias their input. - x = 0; // Initial guess - - // In the FEM solver, which uses the preconditioner, - // we re-use a resultField to avoid allocating new - // memory at every iteration. - // In order for the operator calls to not rewrite - // on this same field over and over when calling - // the operators (upper, diag, inverse, lower, etc) - // we need deep copies to the preconditioner fields. + result = 0; // Initial guess for (unsigned int k = 0; k < outerloops_m; ++k) { - UL_m = upper_m(x); + UL_m = upper_m(result); UL_m = UL_m.deepCopy(); r_m = b - UL_m; for (unsigned int j = 0; j < innerloops_m; ++j) { - UL_m = lower_m(x); - UL_m = UL_m.deepCopy(); - x = r_m - UL_m; + UL_m = lower_m(result); + UL_m = UL_m.deepCopy(); + result = r_m - UL_m; + if constexpr (std::is_same_v>) { + result = inverse_diagonal_m(result) * result; + } else { + result = inverse_diagonal_m(result).deepCopy(); + } // The inverse diagonal is applied to the // vector itself to return the result usually. // However, the operator for FEM already @@ -466,19 +543,19 @@ namespace ippl { // due to the matrix-free evaluation. // Therefore, we need this if to differentiate // the two cases. - if constexpr (std::is_same_v>) { - x = inverse_diagonal_m(x) * x; - } else { - x = inverse_diagonal_m(x).deepCopy(); - } } - UL_m = lower_m(x); + UL_m = lower_m(result); UL_m = UL_m.deepCopy(); r_m = b - UL_m; for (unsigned int j = 0; j < innerloops_m; ++j) { - UL_m = upper_m(x); - UL_m = UL_m.deepCopy(); - x = r_m - UL_m; + UL_m = upper_m(result); + UL_m = UL_m.deepCopy(); + result = r_m - UL_m; + if constexpr (std::is_same_v>) { + result = inverse_diagonal_m(result) * result; + } else { + result = inverse_diagonal_m(result).deepCopy(); + } // The inverse diagonal is applied to the // vector itself to return the result usually. // However, the operator for FEM already @@ -486,22 +563,21 @@ namespace ippl { // due to the matrix-free evaluation. // Therefore, we need this if to differentiate // the two cases. - if constexpr (std::is_same_v>) { - x = inverse_diagonal_m(x) * x; - } else { - x = inverse_diagonal_m(x).deepCopy(); - } } } - return x; } void init_fields(Field& b) override { layout_type& layout = b.getLayout(); mesh_type& mesh = b.get_mesh(); - - UL_m = Field(mesh, layout); - r_m = Field(mesh, layout); + if (!fields_initialized_m) { + UL_m = Field(mesh, layout); + r_m = Field(mesh, layout); + fields_initialized_m = true; + } else { + UL_m.updateLayout(layout); + r_m.updateLayout(layout); + } } protected: @@ -512,6 +588,7 @@ namespace ippl { unsigned outerloops_m; Field UL_m; Field r_m; + bool fields_initialized_m = false; }; /*! @@ -536,29 +613,12 @@ namespace ippl { diagonal_m = std::move(diagonal); } - Field operator()(Field& b) override { - static IpplTimings::TimerRef initTimer = IpplTimings::getTimer("SSOR Init"); - IpplTimings::startTimer(initTimer); - - double D; - - layout_type& layout = b.getLayout(); - mesh_type& mesh = b.get_mesh(); - - Field x(mesh, layout); - - x = 0; // Initial guess - - IpplTimings::stopTimer(initTimer); - - static IpplTimings::TimerRef loopTimer = IpplTimings::getTimer("SSOR loop"); - IpplTimings::startTimer(loopTimer); - - // In the FEM solver, which uses the preconditioner, + void operator()(Field& b, Field& result) override { + // In the FEM solver, which uses the preconditioner, // we re-use a resultField to avoid allocating new // memory at every iteration. // In order for the operator calls to not rewrite - // on this same field over and over when calling + // on this same field over and over when calling // the operators (upper, diag, inverse, lower, etc) // we need deep copies to the preconditioner fields. @@ -569,53 +629,65 @@ namespace ippl { // due to the matrix-free evaluation. // Therefore, we need this if to differentiate // the two cases. + + double D; + + // In order for the operator calls to not rewrite on this same field + // over and over when calling the operators (upper, diag, inverse, + // lower, etc) we need deep copies to the preconditioner fields." + + result = 0; // Initial guess + for (unsigned int k = 0; k < outerloops_m; ++k) { - if constexpr (std::is_same_v>) { - UL_m = upper_m(x); - D = diagonal_m(x); - r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * D * x; + if constexpr (std::is_same_v>) { + UL_m = upper_m(result); + D = diagonal_m(result); + r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * D * result; for (unsigned int j = 0; j < innerloops_m; ++j) { - UL_m = lower_m(x); - x = r_m - omega_m * UL_m; - x = inverse_diagonal_m(x) * x; + UL_m = lower_m(result); + result = r_m - omega_m * UL_m; + result = inverse_diagonal_m(result) * result; } - UL_m = lower_m(x); - D = diagonal_m(x); - r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * D * x; + UL_m = lower_m(result); + D = diagonal_m(result); + r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * D * result; for (unsigned int j = 0; j < innerloops_m; ++j) { - UL_m = upper_m(x); - x = r_m - omega_m * UL_m; - x = inverse_diagonal_m(x) * x; + UL_m = upper_m(result); + result = r_m - omega_m * UL_m; + result = inverse_diagonal_m(result) * result; } } else { - UL_m = upper_m(x).deepCopy(); - r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(x); + UL_m = upper_m(result).deepCopy(); + r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(result); for (unsigned int j = 0; j < innerloops_m; ++j) { - UL_m = lower_m(x).deepCopy(); - x = r_m - omega_m * UL_m; - x = inverse_diagonal_m(x).deepCopy(); + UL_m = lower_m(result).deepCopy(); + result = r_m - omega_m * UL_m; + result = inverse_diagonal_m(result).deepCopy(); } - UL_m = lower_m(x).deepCopy(); - r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(x); + UL_m = lower_m(result).deepCopy(); + r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(result); for (unsigned int j = 0; j < innerloops_m; ++j) { - UL_m = upper_m(x).deepCopy(); - x = r_m - omega_m * UL_m; - x = inverse_diagonal_m(x).deepCopy(); + UL_m = upper_m(result).deepCopy(); + result = r_m - omega_m * UL_m; + result = inverse_diagonal_m(result).deepCopy(); } } } - IpplTimings::stopTimer(loopTimer); - return x; } void init_fields(Field& b) override { layout_type& layout = b.getLayout(); mesh_type& mesh = b.get_mesh(); - - UL_m = Field(mesh, layout); - r_m = Field(mesh, layout); + if (!fields_initialized_m) { + UL_m = Field(mesh, layout); + r_m = Field(mesh, layout); + fields_initialized_m = true; + } else { + UL_m.updateLayout(layout); + r_m.updateLayout(layout); + } } protected: @@ -628,6 +700,7 @@ namespace ippl { double omega_m; Field UL_m; Field r_m; + bool fields_initialized_m = false; }; /*! diff --git a/src/PoissonSolvers/PoissonCG.h b/src/PoissonSolvers/PoissonCG.h index 7c32f4257..db4c2d1dd 100644 --- a/src/PoissonSolvers/PoissonCG.h +++ b/src/PoissonSolvers/PoissonCG.h @@ -11,13 +11,12 @@ #include "Poisson.h" namespace ippl { -// Expands to a lambda that acts as a wrapper for a differential operator -// fun: the function for which to create the wrapper, such as ippl::laplace -// type: the argument type, which should match the LHS type for the solver -#define IPPL_SOLVER_OPERATOR_WRAPPER(fun, type) \ - [](type arg) { \ - return fun(arg); \ - } +// IPPL_SOLVER_OPERATOR_WRAPPER is defined once in LinearSolvers/Preconditioner.h +// (re-exported through this header via PCG.h). Defining it again here used to +// silently shadow that definition with a by-value lambda, which copies the +// Field on every op_m() call and reintroduces the per-iteration cudaMalloc +// in the halo exchange (the realloc never propagates back to the original +// Field). Keep a single by-reference definition. template class PoissonCG : public Poisson {