From ac07afa89a93fc2d5b4f33c9e47d0a1329f45b9b Mon Sep 17 00:00:00 2001
From: Fischill Paul <paul.fischill@inf.ethz.ch>
Date: Sun, 31 May 2026 22:39:50 +0200
Subject: [PATCH 01/14] Extract particle communication update infrastructure

Split the communication and particle-update infrastructure from PR501 on top of the PCG split. This brings in reusable communication buffers, page-granular archive allocation, particle attribute serialization hooks, packed particle send IDs, particle sorting buffers, and the rewritten ParticleSpatialLayout update path.

Keep this branch independent from the later interpolation, FFT, NUFFT, and PIF splits by dropping those APIs from the extracted ParticleAttrib changes. Add particle update regression coverage and update existing tests for live-view and page-sized buffer semantics.

Validated with a Debug Serial Kokkos 5.0.0 build: full 1-rank ctest passes, plus ParticleSendRecv, ParticleUpdate, and ParticleUpdateNonuniform pass under mpiexec -n 2.
---
 PR501_SPLIT_MAP.md                            |  51 +
 src/Communicate/Archive.h                     |  57 +-
 src/Communicate/Archive.hpp                   | 373 +++++--
 src/Communicate/BufferHandler.hpp             |  30 +-
 src/Communicate/Buffers.cpp                   |   4 +-
 src/Communicate/Buffers.hpp                   |  10 +-
 src/Communicate/Communicator.cpp              |  36 +-
 src/Communicate/Communicator.h                |  54 +-
 src/Communicate/CommunicatorLogging.cpp       |  35 +-
 src/Communicate/Environment.cpp               |  18 +-
 src/Communicate/Environment.h                 |   3 +
 src/Communicate/LogEntry.cpp                  |  25 +-
 src/Communicate/LogEntry.h                    |   5 +
 src/Communicate/LoggingBufferHandler.h        |   1 -
 src/Field/BareField.hpp                       |   4 +-
 src/Field/HaloCells.h                         |   7 +-
 src/Field/HaloCells.hpp                       |  17 +-
 src/FieldLayout/FieldLayout.h                 |  44 +-
 src/FieldLayout/FieldLayout.hpp               |  28 +-
 src/FieldLayout/SubFieldLayout.hpp            |   2 +-
 src/Ippl.h                                    |   3 +-
 src/Particle/ParticleAttrib.h                 |  77 +-
 src/Particle/ParticleAttrib.hpp               | 100 +-
 src/Particle/ParticleAttribBase.h             |  11 +-
 src/Particle/ParticleBase.h                   |  17 +-
 src/Particle/ParticleBase.hpp                 | 103 +-
 src/Particle/ParticleSort.h                   | 331 +++++++
 src/Particle/ParticleSpatialLayout.h          | 115 ++-
 src/Particle/ParticleSpatialLayout.hpp        | 623 ++++++------
 src/Particle/SortBuffer.h                     | 185 ++++
 src/Utility/BufferView.h                      | 229 +++++
 src/Utility/IpplTimings.cpp                   | 159 ++-
 src/Utility/IpplTimings.h                     |  79 +-
 src/Utility/ParallelDispatch.h                | 129 ++-
 src/Utility/ParameterList.h                   |   2 +
 src/Utility/Timer.cpp                         |  23 +-
 src/Utility/Timer.h                           |   9 +-
 src/Utility/Tuning.h                          | 213 ++++
 src/Utility/TypeUtils.h                       |  79 +-
 src/Utility/ViewUtils.h                       |   3 +-
 unit_tests/Communicate/BufferHandler.cpp      |  24 +-
 unit_tests/Particle/CMakeLists.txt            |   2 +
 unit_tests/Particle/ParticleBase.cpp          |   1 +
 unit_tests/Particle/ParticleSendRecv.cpp      |  10 +-
 unit_tests/Particle/ParticleUpdate.cpp        | 830 ++++++++++++++++
 .../Particle/ParticleUpdateNonuniform.cpp     | 937 ++++++++++++++++++
 46 files changed, 4400 insertions(+), 698 deletions(-)
 create mode 100644 src/Particle/ParticleSort.h
 create mode 100644 src/Particle/SortBuffer.h
 create mode 100644 src/Utility/BufferView.h
 create mode 100644 src/Utility/Tuning.h
 create mode 100644 unit_tests/Particle/ParticleUpdate.cpp
 create mode 100644 unit_tests/Particle/ParticleUpdateNonuniform.cpp

diff --git a/PR501_SPLIT_MAP.md b/PR501_SPLIT_MAP.md
index ba0e88300..90dd59c21 100644
--- a/PR501_SPLIT_MAP.md
+++ b/PR501_SPLIT_MAP.md
@@ -363,3 +363,54 @@ Assessment:
 - This is a good standalone first PR.
 - It has a narrow file set and clear motivation: remove repeated allocation work from the PCG/preconditioner loop.
 - Recommended next validation before opening: run the relevant solver integration tests or CSCS GPU tests that originally showed PCG allocation/slowdown symptoms.
+
+### 2026-05-31: PR 2 Prototype - Communication And Particle Update Infrastructure
+
+Status: extracted on top of `pr501-pcg`.
+
+Branch:
+
+```text
+pr501-communication-particle-update
+```
+
+Included PR501 areas:
+
+- Communication buffer/archive updates from `9507a796` and `f2820064`
+- Particle layout rewrite and sort buffers from `2878b90b`
+- Particle update regression tests from the particle portion of `f0560f76`
+- Local integration fixes needed to keep this split independent from the later interpolation/FFT/NUFFT PRs
+
+Deliberately excluded:
+
+- Higher-order interpolation/gather/scatter APIs
+- FFT backend and NUFFT transform integration
+- PIF example/application changes
+
+Validation:
+
+```text
+cmake -S . -B build-pr501-communication-debug -DCMAKE_BUILD_TYPE=Debug -DIPPL_ENABLE_UNIT_TESTS=ON -DIPPL_ENABLE_FFT=ON -DIPPL_DEFAULT_TEST_PROCS=1
+cmake --build build-pr501-communication-debug --parallel 8
+ctest --test-dir build-pr501-communication-debug --output-on-failure
+/opt/homebrew/Cellar/open-mpi/5.0.8/bin/mpiexec -n 2 build-pr501-communication-debug/unit_tests/Particle/ParticleSendRecv
+/opt/homebrew/Cellar/open-mpi/5.0.8/bin/mpiexec -n 2 build-pr501-communication-debug/unit_tests/Particle/ParticleUpdate
+/opt/homebrew/Cellar/open-mpi/5.0.8/bin/mpiexec -n 2 build-pr501-communication-debug/unit_tests/Particle/ParticleUpdateNonuniform
+```
+
+Result:
+
+```text
+100% tests passed, 0 tests failed out of 36
+Total Test time (real) = 38.64 sec
+
+2-rank ParticleSendRecv: passed
+2-rank ParticleUpdate: passed
+2-rank ParticleUpdateNonuniform: passed
+```
+
+Assessment:
+
+- This is a coherent second PR after `pr501-pcg`.
+- It is still larger than the PCG split, but it avoids the major algorithmic dependencies from interpolation, FFT, NUFFT, and PIF.
+- Recommended next validation before opening: run the same particle tests on CUDA/HIP builds and larger MPI rank counts, especially the original `ParticleSendRecv` failure configurations.
diff --git a/src/Communicate/Archive.h b/src/Communicate/Archive.h
index 6ec74f946..2e8561e36 100644
--- a/src/Communicate/Archive.h
+++ b/src/Communicate/Archive.h
@@ -9,6 +9,9 @@
 //   that they have type char and thus contain raw bytes, unlike other typed buffers
 //   such as detail::FieldBufferData used by HaloCells.
 //
+//   On CUDA/HIP the internal buffer is allocated directly via cudaMalloc/hipMalloc
+//   so that the pointer is page-aligned (4K) and compatible with MPI IPC.
+//
 #ifndef IPPL_ARCHIVE_H
 #define IPPL_ARCHIVE_H
 
@@ -32,6 +35,7 @@ namespace ippl {
             using pointer_type = typename buffer_type::pointer_type;
 
             Archive(size_type size = 0);
+            ~Archive();
 
             /*!
              * Serialize.
@@ -40,6 +44,10 @@ namespace ippl {
             template <typename T, class... ViewArgs>
             void serialize(const Kokkos::View<T*, ViewArgs...>& view, size_type nsends);
 
+            template <typename T, class... ViewArgs, typename HashView>
+            void serialize(const Kokkos::View<T*, ViewArgs...>& view, const HashView& hash,
+                           size_type nsends);
+
             /*!
              * Serialize vector attributes
              *
@@ -52,6 +60,10 @@ namespace ippl {
             void serialize(const Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view,
                            size_type nsends);
 
+            template <typename T, unsigned Dim, class... ViewArgs, typename HashView>
+            void serialize(const Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view,
+                           const HashView& hash, size_type nsends);
+
             /*!
              * Deserialize.
              * @param view to put data to
@@ -59,6 +71,14 @@ namespace ippl {
             template <typename T, class... ViewArgs>
             void deserialize(Kokkos::View<T*, ViewArgs...>& view, size_type nrecvs);
 
+            template <typename T, class... ViewArgs>
+            void deserialize(Kokkos::View<T*, ViewArgs...>& view, size_type offset,
+                             size_type nrecvs);
+
+            template <typename T, unsigned Dim, class... ViewArgs>
+            void deserialize(Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view, size_type offset,
+                             size_type nrecvs);
+
             /*!
              * Deserialize vector attributes
              *
@@ -71,33 +91,50 @@ namespace ippl {
             void deserialize(Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view, size_type nrecvs);
 
             /*!
-             * @returns a pointer to the data of the buffer
+             * @returns a pointer to the data of the buffer.
+             *          On GPU this is a page-aligned device pointer from cudaMalloc/hipMalloc.
              */
-            pointer_type getBuffer() { return buffer_m.data(); }
+            pointer_type getBuffer() { return bufferData(); }
 
             /*!
-             * @returns the size of the buffer
+             * @returns the number of bytes written so far
              */
             size_type getSize() const { return writepos_m; }
 
-            size_type getBufferSize() const { return buffer_m.size(); }
-
-            void resizeBuffer(size_type size) { Kokkos::resize(buffer_m, size); }
+            /*!
+             * @returns the total capacity of the buffer in bytes
+             */
+            size_type getBufferSize() const { return bufferSize(); }
 
-            void reallocBuffer(size_type size) { Kokkos::realloc(buffer_m, size); }
+            void resizeBuffer(size_type size);
+            void reallocBuffer(size_type size);
 
             void resetWritePos() { writepos_m = 0; }
             void resetReadPos() { readpos_m = 0; }
 
-            ~Archive() = default;
-
         private:
             //! write position for serialization
             size_type writepos_m;
             //! read position for deserialization
             size_type readpos_m;
-            //! serialized data
+
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+            //! Raw device pointer from cudaMalloc/hipMalloc (page-aligned, IPC-safe)
+            pointer_type buffer_ptr_m = nullptr;
+            size_type buffer_size_m   = 0;
+
+            pointer_type bufferData() const { return buffer_ptr_m; }
+            size_type bufferSize() const { return buffer_size_m; }
+
+            void gpuAlloc(size_type size);
+            void gpuFree();
+#else
+            //! serialized data (standard Kokkos view on CPU)
             buffer_type buffer_m;
+
+            pointer_type bufferData() const { return buffer_m.data(); }
+            size_type bufferSize() const { return buffer_m.size(); }
+#endif
         };
     }  // namespace detail
 }  // namespace ippl
diff --git a/src/Communicate/Archive.hpp b/src/Communicate/Archive.hpp
index a55c33c7d..19e7b2423 100644
--- a/src/Communicate/Archive.hpp
+++ b/src/Communicate/Archive.hpp
@@ -4,6 +4,12 @@
 //
 #include "Archive.h"
 
+#if defined(KOKKOS_ENABLE_CUDA)
+#include <cuda_runtime.h>
+#elif defined(KOKKOS_ENABLE_HIP)
+#include <hip/hip_runtime.h>
+#endif
+
 namespace ippl {
     namespace detail {
         KOKKOS_INLINE_FUNCTION void copyBytes(char* dst, const char* src, size_t size) {
@@ -12,117 +18,364 @@ namespace ippl {
             }
         }
 
+        template <typename T, typename HashView, typename BufferPtr>
+        struct SerializeHashFunctor {
+            const T* view_data;
+            HashView hash;
+            BufferPtr buf;
+            size_t elem_size;
+            size_t wpos;
+
+            KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const {
+                const char* src = reinterpret_cast<const char*>(view_data + hash(i));
+                char* dst       = buf + i * elem_size + wpos;
+                copyBytes(dst, src, elem_size);
+            }
+        };
+
+        template <typename T, unsigned Dim, typename HashView, typename BufferPtr>
+        struct SerializeHashVectorFunctor {
+            const Vector<T, Dim>* view_data;
+            HashView hash;
+            BufferPtr buf;
+            size_t elem_size;
+            size_t wpos;
+
+            KOKKOS_INLINE_FUNCTION void operator()(const size_t i, const size_t d) const {
+                const Vector<T, Dim>* vec = view_data + hash(i);
+                const T value             = (*vec)[d];
+                const char* src           = reinterpret_cast<const char*>(&value);
+                char* dst                 = buf + (Dim * i + d) * elem_size + wpos;
+                copyBytes(dst, src, elem_size);
+            }
+        };
+
+        // =================================================================
+        // GPU buffer management
+        // =================================================================
+
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+
+        template <class... Properties>
+        void Archive<Properties...>::gpuAlloc(size_type size) {
+            if (size == 0) return;
+#if defined(KOKKOS_ENABLE_HIP)
+            // HSA IPC (used by Cray MPICH for large-message GPU transfers)
+            // requires allocation sizes to be multiples of the GPU page
+            // granularity (64 KB on MI250X / MI300X).  Without this,
+            // hsa_amd_ipc_memory_attach fails with INVALID_ARGUMENT.
+            static constexpr size_type kGranularity = 65536;
+            size = ((size + kGranularity - 1) / kGranularity) * kGranularity;
+#endif
+            void* ptr = nullptr;
+#if defined(KOKKOS_ENABLE_CUDA)
+            cudaMalloc(&ptr, size);
+#else
+            hipMalloc(&ptr, size);
+#endif
+            buffer_ptr_m  = static_cast<pointer_type>(ptr);
+            buffer_size_m = size;
+        }
+
+        template <class... Properties>
+        void Archive<Properties...>::gpuFree() {
+            if (buffer_ptr_m) {
+#if defined(KOKKOS_ENABLE_CUDA)
+                cudaFree(buffer_ptr_m);
+#else
+                hipFree(buffer_ptr_m);
+#endif
+                buffer_ptr_m  = nullptr;
+                buffer_size_m = 0;
+            }
+        }
+
+        template <class... Properties>
+        Archive<Properties...>::Archive(size_type size)
+            : writepos_m(0)
+            , readpos_m(0) {
+            gpuAlloc(size);
+        }
+
+        template <class... Properties>
+        Archive<Properties...>::~Archive() {
+            gpuFree();
+        }
+
+        template <class... Properties>
+        void Archive<Properties...>::resizeBuffer(size_type size) {
+            if (size <= buffer_size_m) return;
+
+#if defined(KOKKOS_ENABLE_HIP)
+            static constexpr size_type kGranularity = 65536;
+            size = ((size + kGranularity - 1) / kGranularity) * kGranularity;
+#endif
+            pointer_type new_ptr = nullptr;
+            void* vptr           = nullptr;
+#if defined(KOKKOS_ENABLE_CUDA)
+            cudaMalloc(&vptr, size);
+#else
+            hipMalloc(&vptr, size);
+#endif
+            new_ptr = static_cast<pointer_type>(vptr);
+
+            if (buffer_ptr_m && buffer_size_m > 0) {
+#if defined(KOKKOS_ENABLE_CUDA)
+                cudaMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, cudaMemcpyDeviceToDevice);
+                cudaFree(buffer_ptr_m);
+#else
+                hipMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, hipMemcpyDeviceToDevice);
+                hipFree(buffer_ptr_m);
+#endif
+            }
+
+            buffer_ptr_m  = new_ptr;
+            buffer_size_m = size;
+        }
+
+        template <class... Properties>
+        void Archive<Properties...>::reallocBuffer(size_type size) {
+            gpuFree();
+            gpuAlloc(size);
+        }
+
+#else  // CPU path
+
         template <class... Properties>
         Archive<Properties...>::Archive(size_type size)
             : writepos_m(0)
             , readpos_m(0)
             , buffer_m("buffer", size) {}
 
-        // -----------------------------------
-        // Scalar serialize
+        template <class... Properties>
+        Archive<Properties...>::~Archive() = default;
+
+        template <class... Properties>
+        void Archive<Properties...>::resizeBuffer(size_type size) {
+            Kokkos::resize(buffer_m, size);
+        }
+
+        template <class... Properties>
+        void Archive<Properties...>::reallocBuffer(size_type size) {
+            Kokkos::realloc(buffer_m, size);
+        }
+
+#endif  // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
+
+        // =================================================================
+        // Serialize — scalar
+        // =================================================================
+
         template <class... Properties>
         template <typename T, class... ViewArgs>
         void Archive<Properties...>::serialize(const Kokkos::View<T*, ViewArgs...>& view,
                                                size_type nsends) {
-            constexpr size_t size = sizeof(T);
-            char* dst_ptr         = (char*)(buffer_m.data()) + writepos_m;
-            char* src_ptr         = (char*)(view.data());
-            assert(writepos_m + (nsends * size) <= buffer_m.size());
-            // construct temp views of the src/dst buffers of the correct size (bytes)
-            using src_view_type =
-                Kokkos::View<char*, typename Kokkos::View<T*, ViewArgs...>::memory_space,
-                             Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-            using dst_view_type =
-                Kokkos::View<char*, typename buffer_type::memory_space,
-                             Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-            src_view_type src_view(src_ptr, size * nsends);
-            dst_view_type dst_view(dst_ptr, size * nsends);
-            Kokkos::deep_copy(dst_view, src_view);
+            using exec_space  = typename Kokkos::View<T*, ViewArgs...>::execution_space;
+            using policy_type = Kokkos::RangePolicy<exec_space>;
+
+            size_t size    = sizeof(T);
+            auto base      = bufferData();
+            auto writepos  = writepos_m;
+            Kokkos::parallel_for(
+                "Archive::serialize()", policy_type(0, nsends), KOKKOS_LAMBDA(const size_type i) {
+                    const char* src = reinterpret_cast<const char*>(view.data() + i);
+                    char* dst       = base + i * size + writepos;
+                    copyBytes(dst, src, size);
+                });
             Kokkos::fence();
-            writepos_m += (nsends * size);
+            writepos_m += size * nsends;
         }
 
-        // -----------------------------------
-        // Vector serialize
+        // =================================================================
+        // Serialize — scalar with hash
+        // =================================================================
+
+        template <class... Properties>
+        template <typename T, class... ViewArgs, typename HashView>
+        void Archive<Properties...>::serialize(const Kokkos::View<T*, ViewArgs...>& view,
+                                               const HashView& hash, size_type nsends) {
+            using exec_space  = HashView::execution_space;
+            using policy_type = Kokkos::RangePolicy<exec_space>;
+            using BufferPtr   = pointer_type;
+
+            SerializeHashFunctor<T, HashView, BufferPtr> f{view.data(), hash, bufferData(),
+                                                           sizeof(T), writepos_m};
+
+            Kokkos::parallel_for("Archive::serialize(hash)", policy_type(0, nsends), f);
+            Kokkos::fence();
+            writepos_m += sizeof(T) * nsends;
+        }
+
+        // =================================================================
+        // Serialize — vector
+        // =================================================================
+
         template <class... Properties>
         template <typename T, unsigned Dim, class... ViewArgs>
         void Archive<Properties...>::serialize(
             const Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view, size_type nsends) {
-            constexpr size_t size         = sizeof(T);
-            char* dst_ptr                 = (char*)(buffer_m.data());
-            ippl::Vector<T, Dim>* src_ptr = view.data();
-            auto wp                       = writepos_m;
-            // The Kokkos range policies expect int64
-            // so we have to explicitly specify size_type (uint64)
             using exec_space = typename Kokkos::View<T*, ViewArgs...>::execution_space;
+
+            size_t size    = sizeof(T);
+            auto base      = bufferData();
+            auto writepos  = writepos_m;
+            // Default index type for range policies is int64,
+            // so we have to explicitly specify size_type (uint64)
             using mdrange_t =
                 Kokkos::MDRangePolicy<Kokkos::Rank<2>, Kokkos::IndexType<size_type>, exec_space>;
             Kokkos::parallel_for(
-                "Archive::serialize()", mdrange_t({0, 0}, {(long int)nsends, Dim}),
+                "Archive::serialize()",
+                // The constructor for Kokkos range policies always
+                // expects int64 regardless of index type provided
+                // by template parameters, so the typecast is necessary
+                // to avoid compiler warnings
+                mdrange_t({0, 0}, {(long int)nsends, Dim}),
                 KOKKOS_LAMBDA(const size_type i, const size_t d) {
-                    const char* src = reinterpret_cast<const char*>(&src_ptr[i][d]);
-                    char* dst       = dst_ptr + (Dim * i + d) * size + wp;
+                    const T value   = view.data()[i][d];
+                    const char* src = reinterpret_cast<const char*>(&value);
+                    char* dst       = base + (Dim * i + d) * size + writepos;
                     copyBytes(dst, src, size);
                 });
+            Kokkos::fence();
+            writepos_m += Dim * size * nsends;
+        }
+
+        // =================================================================
+        // Serialize — vector with hash
+        // =================================================================
 
+        template <class... Properties>
+        template <typename T, unsigned Dim, class... ViewArgs, typename HashView>
+        void Archive<Properties...>::serialize(
+            const Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view, const HashView& hash,
+            size_type nsends) {
+            using exec_space = typename HashView::execution_space;
+            size_t size      = sizeof(T);
+            using BufferPtr  = pointer_type;
+            using mdrange_t =
+                Kokkos::MDRangePolicy<Kokkos::Rank<2>, Kokkos::IndexType<size_type>, exec_space>;
+
+            SerializeHashVectorFunctor<T, Dim, HashView, BufferPtr> f{
+                view.data(), hash, bufferData(), size, writepos_m};
+
+            Kokkos::parallel_for("Archive::serialize(hash, vector)",
+                                 mdrange_t({0, 0}, {(long int)nsends, Dim}), f);
             Kokkos::fence();
             writepos_m += Dim * size * nsends;
         }
 
-        // -----------------------------------
-        // Scalar Deserialize
+        // =================================================================
+        // Deserialize — scalar
+        // =================================================================
+
         template <class... Properties>
         template <typename T, class... ViewArgs>
         void Archive<Properties...>::deserialize(Kokkos::View<T*, ViewArgs...>& view,
                                                  size_type nrecvs) {
-            // if we have to enlarge the destination view
+            using exec_space  = typename Kokkos::View<T*, ViewArgs...>::execution_space;
+            using policy_type = Kokkos::RangePolicy<exec_space>;
+
+            size_t size = sizeof(T);
             if (nrecvs > view.extent(0)) {
                 Kokkos::realloc(view, nrecvs);
             }
-            //
-            constexpr size_t size = sizeof(T);
-            char* src_ptr         = (char*)(buffer_m.data()) + readpos_m;
-            char* dst_ptr         = (char*)(view.data());
-            assert(readpos_m + (nrecvs * size) <= buffer_m.size());
-            // construct temp views of the src/dst buffers of the correct size (bytes)
-            using src_view_type =
-                Kokkos::View<char*, typename buffer_type::memory_space,
-                             Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-            using dst_view_type =
-                Kokkos::View<char*, typename Kokkos::View<T*, ViewArgs...>::memory_space,
-                             Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-            src_view_type src_view(src_ptr, size * nrecvs);
-            dst_view_type dst_view(dst_ptr, size * nrecvs);
-            Kokkos::deep_copy(dst_view, src_view);
+            auto base    = bufferData();
+            auto readpos = readpos_m;
+            Kokkos::parallel_for(
+                "Archive::deserialize()", policy_type(0, nrecvs), KOKKOS_LAMBDA(const size_type i) {
+                    const char* src = base + i * size + readpos;
+                    char* dst       = reinterpret_cast<char*>(view.data() + i);
+                    copyBytes(dst, src, size);
+                });
+            // Wait for deserialization kernel to complete
+            // (as with serialization kernels)
             Kokkos::fence();
-            readpos_m += (nrecvs * size);
+            readpos_m += size * nrecvs;
         }
 
-        // -----------------------------------
-        // Vector Deserialize
+        // =================================================================
+        // Deserialize — vector
+        // =================================================================
+
         template <class... Properties>
         template <typename T, unsigned Dim, class... ViewArgs>
         void Archive<Properties...>::deserialize(Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view,
-                                                 size_type nrecvs) 
-        {
-            // if we have to enlarge the destination view
+                                                 size_type nrecvs) {
+            using exec_space = typename Kokkos::View<T*, ViewArgs...>::execution_space;
+
+            size_t size = sizeof(T);
             if (nrecvs > view.extent(0)) {
                 Kokkos::realloc(view, nrecvs);
             }
-            //
-            constexpr size_t size         = sizeof(T);
-            char* src_ptr                 = (char*)(buffer_m.data());
-            ippl::Vector<T, Dim>* dst_ptr = view.data();
-            auto rp                       = readpos_m;
-            using exec_space              = typename Kokkos::View<T*, ViewArgs...>::execution_space;
             using mdrange_t =
                 Kokkos::MDRangePolicy<Kokkos::Rank<2>, Kokkos::IndexType<size_type>, exec_space>;
+            auto base    = bufferData();
+            auto readpos = readpos_m;
             Kokkos::parallel_for(
                 "Archive::deserialize()", mdrange_t({0, 0}, {(long int)nrecvs, Dim}),
                 KOKKOS_LAMBDA(const size_type i, const size_t d) {
-                    const char* src = src_ptr + (Dim * i + d) * size + rp;
-                    char* dst       = reinterpret_cast<char*>(&dst_ptr[i][d]);
+                    const char* src = base + (Dim * i + d) * size + readpos;
+                    T value;
+                    char* dst = reinterpret_cast<char*>(&value);
+                    copyBytes(dst, src, size);
+                    view.data()[i](d) = value;
+                });
+            Kokkos::fence();
+            readpos_m += Dim * size * nrecvs;
+        }
+
+        // =================================================================
+        // Deserialize — scalar with offset
+        // =================================================================
+
+        template <class... Properties>
+        template <typename T, class... ViewArgs>
+        void Archive<Properties...>::deserialize(Kokkos::View<T*, ViewArgs...>& view,
+                                                 size_type offset, size_type nrecvs) {
+            using exec_space  = typename Kokkos::View<T*, ViewArgs...>::execution_space;
+            using policy_type = Kokkos::RangePolicy<exec_space>;
+            size_t size       = sizeof(T);
+            if (offset + nrecvs > view.extent(0)) {
+                Kokkos::resize(view, offset + nrecvs);
+            }
+            auto base    = bufferData();
+            auto readpos = readpos_m;
+            Kokkos::parallel_for(
+                "Archive::deserialize(offset)", policy_type(0, nrecvs),
+                KOKKOS_LAMBDA(const size_type i) {
+                    const char* src = base + i * size + readpos;
+                    char* dst       = reinterpret_cast<char*>(view.data() + offset + i);
+                    copyBytes(dst, src, size);
+                });
+            Kokkos::fence();
+            readpos_m += size * nrecvs;
+        }
+
+        // =================================================================
+        // Deserialize — vector with offset
+        // =================================================================
+
+        template <class... Properties>
+        template <typename T, unsigned Dim, class... ViewArgs>
+        void Archive<Properties...>::deserialize(Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view,
+                                                 size_type offset, size_type nrecvs) {
+            using exec_space = typename Kokkos::View<T*, ViewArgs...>::execution_space;
+            size_t size      = sizeof(T);
+            if (offset + nrecvs > view.extent(0)) {
+                Kokkos::resize(view, offset + nrecvs);
+            }
+            using mdrange_t =
+                Kokkos::MDRangePolicy<Kokkos::Rank<2>, Kokkos::IndexType<size_type>, exec_space>;
+            auto base    = bufferData();
+            auto readpos = readpos_m;
+            Kokkos::parallel_for(
+                "Archive::deserialize(offset, vector)", mdrange_t({0, 0}, {(long int)nrecvs, Dim}),
+                KOKKOS_LAMBDA(const size_type i, const size_t d) {
+                    const char* src = base + (Dim * i + d) * size + readpos;
+                    T value;
+                    char* dst = reinterpret_cast<char*>(&value);
                     copyBytes(dst, src, size);
+                    view.data()[offset + i](d) = value;
                 });
             Kokkos::fence();
             readpos_m += Dim * size * nrecvs;
diff --git a/src/Communicate/BufferHandler.hpp b/src/Communicate/BufferHandler.hpp
index c6d57f0a3..e27369c06 100644
--- a/src/Communicate/BufferHandler.hpp
+++ b/src/Communicate/BufferHandler.hpp
@@ -1,6 +1,8 @@
 #ifndef IPPL_BUFFER_HANDLER_HPP
 #define IPPL_BUFFER_HANDLER_HPP
 
+#include <algorithm>
+
 namespace ippl {
 
     template <typename MemorySpace>
@@ -11,6 +13,15 @@ namespace ippl {
     DefaultBufferHandler<MemorySpace>::getBuffer(size_type size, double overallocation) {
         size_type requiredSize = static_cast<size_type>(size * overallocation);
 
+        // Round up to page granularity to avoid 0-byte or tiny allocations
+        // and to align with the GPU-aware MPI registration cache.
+        constexpr size_type PAGE_SIZE = 4096;
+        if (requiredSize < PAGE_SIZE) {
+            requiredSize = PAGE_SIZE;
+        } else {
+            requiredSize = ((requiredSize + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;
+        }
+
         auto freeBuffer = findFreeBuffer(requiredSize);
         if (freeBuffer != nullptr) {
             return getFreeBuffer(freeBuffer);
@@ -120,17 +131,12 @@ namespace ippl {
     template <typename MemorySpace>
     typename DefaultBufferHandler<MemorySpace>::buffer_type
     DefaultBufferHandler<MemorySpace>::reallocateLargestFreeBuffer(size_type requiredSize) {
-        auto largest_it    = std::prev(free_buffers.end());
-        buffer_type buffer = *largest_it;
-
-        freeSize_m -= buffer->getBufferSize();
-        usedSize_m += requiredSize;
-
-        free_buffers.erase(buffer);
-        buffer->reallocBuffer(requiredSize);
-
-        used_buffers.insert(buffer);
-        return buffer;
+        // Always allocate a
+        // fresh buffer instead of reallocating the largest free one: a
+        // free + alloc cycle can release the device pointer that a GPU-aware
+        // MPI's registration cache still holds. Keeping the old buffers in the free
+        // pool and returning a new buffer at a new address sidesteps this.
+        return allocateNewBuffer(requiredSize);
     }
 
     template <typename MemorySpace>
@@ -145,4 +151,4 @@ namespace ippl {
 
 }  // namespace ippl
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/Communicate/Buffers.cpp b/src/Communicate/Buffers.cpp
index 026633fec..e99100bc2 100644
--- a/src/Communicate/Buffers.cpp
+++ b/src/Communicate/Buffers.cpp
@@ -32,13 +32,13 @@ namespace ippl {
         }
 
         void Communicator::deleteAllBuffers() {
-            buffer_handlers_m->forAll([]<typename BufferHandler>(BufferHandler&& bh) {
+            getBufferHandler().forAll([]<typename BufferHandler>(BufferHandler&& bh) {
                 bh.deleteAllBuffers();
             });
         }
 
         void Communicator::freeAllBuffers() {
-            buffer_handlers_m->forAll([]<typename BufferHandler>(BufferHandler&& bh) {
+            getBufferHandler().forAll([]<typename BufferHandler>(BufferHandler&& bh) {
                 bh.freeAllBuffers();
             });
         }
diff --git a/src/Communicate/Buffers.hpp b/src/Communicate/Buffers.hpp
index 98c0b513f..597461b4d 100644
--- a/src/Communicate/Buffers.hpp
+++ b/src/Communicate/Buffers.hpp
@@ -20,21 +20,23 @@
 //   exchanging particle data between ranks.
 //
 
+#include "Utility/IpplTimings.h"
 namespace ippl {
     namespace mpi {
 
         template <typename MemorySpace, typename T>
         Communicator::buffer_type<MemorySpace> Communicator::getBuffer(size_type size,
                                                                        double overallocation) {
-            auto& buffer_handler = buffer_handlers_m->get<MemorySpace>();
+            auto& buffer_handler = getBufferHandler().get<MemorySpace>();
 
-            return buffer_handler.getBuffer(size * sizeof(T),
-                                            std::max(overallocation, defaultOveralloc_m));
+            auto buf = buffer_handler.getBuffer(size * sizeof(T),
+                                                std::max(overallocation, defaultOveralloc_m));
+            return buf;
         }
 
         template <typename MemorySpace>
         void Communicator::freeBuffer(Communicator::buffer_type<MemorySpace> buffer) {
-            auto& buffer_handler = buffer_handlers_m->get<MemorySpace>();
+            auto& buffer_handler = getBufferHandler().get<MemorySpace>();
 
             buffer_handler.freeBuffer(buffer);
         }
diff --git a/src/Communicate/Communicator.cpp b/src/Communicate/Communicator.cpp
index 2c33f939f..e1a8b777b 100644
--- a/src/Communicate/Communicator.cpp
+++ b/src/Communicate/Communicator.cpp
@@ -3,25 +3,27 @@
 
 namespace ippl::mpi {
 
+    namespace {
+        // Populate rank_m / size_m from the live communicator.
+        void cacheRankAndSize(const MPI_Comm& comm, int& rank, int& size) {
+            MPI_Comm_rank(comm, &rank);
+            MPI_Comm_size(comm, &size);
+        }
+    }  // namespace
+
     Communicator::Communicator()
-        : buffer_handlers_m(get_buffer_handler_instance())
-        , comm_m(new MPI_Comm(MPI_COMM_WORLD)) {
-        MPI_Comm_rank(*comm_m, &rank_m);
-        MPI_Comm_size(*comm_m, &size_m);
+        : comm_m(new MPI_Comm(MPI_COMM_WORLD)) {
+        cacheRankAndSize(*comm_m, rank_m, size_m);
     }
 
     Communicator::Communicator(MPI_Comm comm) {
-        buffer_handlers_m = get_buffer_handler_instance();
-        comm_m            = std::make_shared<MPI_Comm>(comm);
-        MPI_Comm_rank(*comm_m, &rank_m);
-        MPI_Comm_size(*comm_m, &size_m);
+        comm_m = std::make_shared<MPI_Comm>(comm);
+        cacheRankAndSize(*comm_m, rank_m, size_m);
     }
 
     Communicator& Communicator::operator=(MPI_Comm comm) {
-        buffer_handlers_m = get_buffer_handler_instance();
-        comm_m            = std::make_shared<MPI_Comm>(comm);
-        MPI_Comm_rank(*comm_m, &rank_m);
-        MPI_Comm_size(*comm_m, &size_m);
+        comm_m = std::make_shared<MPI_Comm>(comm);
+        cacheRankAndSize(*comm_m, rank_m, size_m);
         return *this;
     }
 
@@ -41,14 +43,4 @@ namespace ippl::mpi {
         return (flag != 0);
     }
 
-    // ---------------------------------------
-    // singleton access to buffer manager
-    // ---------------------------------------
-    std::shared_ptr<Communicator::buffer_handler_type> Communicator::get_buffer_handler_instance() {
-        static std::shared_ptr<Communicator::buffer_handler_type> comm_buff_handler_ptr{nullptr};
-        if (comm_buff_handler_ptr == nullptr) {
-            comm_buff_handler_ptr = std::make_shared<Communicator::buffer_handler_type>();
-        }
-        return comm_buff_handler_ptr;
-    }
 }  // namespace ippl::mpi
diff --git a/src/Communicate/Communicator.h b/src/Communicate/Communicator.h
index 9819ab3bd..7685d20d5 100644
--- a/src/Communicate/Communicator.h
+++ b/src/Communicate/Communicator.h
@@ -15,6 +15,7 @@
 
 ////////////////////////////////////////////////
 // For message size check; see below
+
 #include <climits>
 #include <cstdlib>
 
@@ -27,7 +28,6 @@
 
 namespace ippl {
     namespace mpi {
-
         class Communicator : public TagMaker {
         public:
             Communicator();
@@ -157,16 +157,21 @@ namespace ippl {
 
             const MPI_Comm& getCommunicator() const noexcept { return *comm_m; }
 
+            // MPI uses int for byte counts; messages exceeding INT_MAX must
+            // be split. This shared check aborts the run with a single
+            // diagnostic instead of silently truncating.
+            void assertMessageSize(size_type msize) const {
+                if (msize > static_cast<size_type>(INT_MAX)) {
+                    std::cerr << "Communicator: message size " << msize
+                              << " bytes exceeds INT_MAX (" << INT_MAX << ")\n";
+                    MPI_Abort(*comm_m, -1);
+                }
+            }
+
             template <class Buffer, typename Archive>
             void recv(int src, int tag, Buffer& buffer, Archive& ar, size_type msize,
                       size_type nrecvs) {
-                // Temporary fix. MPI communication seems to have problems when the
-                // count argument exceeds the range of int, so large messages should
-                // be split into smaller messages
-                if (msize > INT_MAX) {
-                    std::cerr << "Message size exceeds range of int" << std::endl;
-                    this->abort();
-                }
+                assertMessageSize(msize);
                 MPI_Status status;
                 MPI_Recv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &status);
 
@@ -176,20 +181,27 @@ namespace ippl {
             template <class Buffer, typename Archive>
             void isend(int dest, int tag, Buffer& buffer, Archive& ar, MPI_Request& request,
                        size_type nsends) {
-                if (ar.getSize() > INT_MAX) {
-                    std::cerr << "Message size exceeds range of int" << std::endl;
-                    this->abort();
-                }
+                assertMessageSize(ar.getSize());
                 buffer.serialize(ar, nsends);
                 MPI_Isend(ar.getBuffer(), ar.getSize(), MPI_BYTE, dest, tag, *comm_m, &request);
             }
 
+            template <typename Archive>
+            void isend(int dest, int tag, Archive& ar, MPI_Request& request) {
+                assertMessageSize(ar.getSize());
+                MPI_Isend(ar.getBuffer(), ar.getSize(), MPI_BYTE, dest, tag, *comm_m, &request);
+            }
+
+            template <typename Archive>
+            void recv(int src, int tag, Archive& ar, size_type msize) {
+                assertMessageSize(msize);
+                MPI_Status status;
+                MPI_Recv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &status);
+            }
+
             template <typename Archive>
             void irecv(int src, int tag, Archive& ar, MPI_Request& request, size_type msize) {
-                if (msize > INT_MAX) {
-                    std::cerr << "Message size exceeds range of int" << std::endl;
-                    this->abort();
-                }
+                assertMessageSize(msize);
                 MPI_Irecv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &request);
             }
 
@@ -201,7 +213,11 @@ namespace ippl {
             std::vector<LogEntry> gatherLogsFromAllRanks(const std::vector<LogEntry>& localLogs);
             void writeLogsToFile(const std::vector<LogEntry>& allLogs, const std::string& filename);
 
-            std::shared_ptr<buffer_handler_type> buffer_handlers_m;
+            static buffer_handler_type& getBufferHandler() {
+                static buffer_handler_type handler;
+                return handler;
+            }
+
             double defaultOveralloc_m = 1.0;
 
             /////////////////////////////////////////////////////////////////////////////////////
@@ -210,12 +226,10 @@ namespace ippl {
             std::shared_ptr<MPI_Comm> comm_m;
             int size_m;
             int rank_m;
-
-        public:
-            std::shared_ptr<buffer_handler_type> get_buffer_handler_instance();
         };
 
     }  // namespace mpi
+
 }  // namespace ippl
 
 #include "Communicate/Collectives.hpp"
diff --git a/src/Communicate/CommunicatorLogging.cpp b/src/Communicate/CommunicatorLogging.cpp
index 541ff4074..39c0197e8 100644
--- a/src/Communicate/CommunicatorLogging.cpp
+++ b/src/Communicate/CommunicatorLogging.cpp
@@ -7,8 +7,16 @@
 
 #include "Communicate/Communicator.h"
 #include "Communicate/LogEntry.h"
+#include "Communicate/LoggingBufferHandler.h"
 
 namespace ippl::mpi {
+
+    template <class MemorySpace>
+    struct is_a_logger : std::false_type {};
+
+    template <class MemorySpace>
+    struct is_a_logger<LoggingBufferHandler<MemorySpace>> : std::true_type {};
+
     void Communicator::printLogs(const std::string& filename) {
         std::vector<LogEntry> localLogs = gatherLocalLogs();
 
@@ -24,20 +32,17 @@ namespace ippl::mpi {
         }
     }
 
-    template <class MemorySpace>
-    struct is_a_logger : std::false_type {};
-
-    template <class MemorySpace>
-    struct is_a_logger<LoggingBufferHandler<MemorySpace> > : std::true_type {};
-
     std::vector<LogEntry> Communicator::gatherLocalLogs() {
         std::vector<LogEntry> localLogs;
-        if constexpr (is_a_logger<buffer_handler_type>::value) {
-            buffer_handlers_m->forAll([&](auto& loggingHandler) {
-                const auto& logs = loggingHandler.getLogs();
+
+        getBufferHandler().forAll([&](auto& handler) {
+            using handler_t = std::decay_t<decltype(handler)>;
+            if constexpr (is_a_logger<handler_t>::value) {
+                const auto& logs = handler.getLogs();
                 localLogs.insert(localLogs.end(), logs.begin(), logs.end());
-            });
-        }
+            }
+        });
+
         return localLogs;
     }
 
@@ -88,11 +93,9 @@ namespace ippl::mpi {
         size_t offset = 0;
 
         while (offset < buffer.size()) {
-            LogEntry logEntry = LogEntry::deserialize(buffer, offset);
-
-            logs.push_back(logEntry);
-
-            offset += logEntry.serialize().size();
+            // deserializeAdvance walks offset past the consumed bytes -
+            // avoids the O(N^2) blowup of re-serializing each entry
+            logs.push_back(LogEntry::deserializeAdvance(buffer, offset));
         }
         return logs;
     }
diff --git a/src/Communicate/Environment.cpp b/src/Communicate/Environment.cpp
index 8e19df365..706c821f1 100644
--- a/src/Communicate/Environment.cpp
+++ b/src/Communicate/Environment.cpp
@@ -9,9 +9,23 @@ namespace ippl {
     namespace mpi {
 
         Environment::Environment(int& argc, char**& argv, const MPI_Comm& comm)
-            : comm_m(comm) {
+            : comm_m(comm)
+            , threadMultiple_m(false) {
             if (!initialized()) {
-                MPI_Init(&argc, &argv);
+                int provided = MPI_THREAD_SINGLE;
+                int rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+                if (rc != MPI_SUCCESS) {
+                    std::cerr << "MPI_Init_thread failed (rc=" << rc << ")" << std::endl;
+                    std::exit(EXIT_FAILURE);
+                }
+                threadMultiple_m = (provided >= MPI_THREAD_MULTIPLE);
+                if (!threadMultiple_m) {
+                    int rank = 0;
+                    MPI_Comm_rank(comm_m, &rank);
+                    if (rank == 0) {
+                        std::cerr << "MPI doesn't support MPI_THREAD_MULTIPLE!" << std::endl;
+                    }
+                }
             }
         }
 
diff --git a/src/Communicate/Environment.h b/src/Communicate/Environment.h
index fd03a9ee4..bad195df4 100644
--- a/src/Communicate/Environment.h
+++ b/src/Communicate/Environment.h
@@ -23,10 +23,13 @@ namespace ippl {
 
             static bool finalized();
 
+            bool threadMultiple() noexcept { return threadMultiple_m; }
+
             void abort(int errorcode = -1) noexcept { MPI_Abort(comm_m, errorcode); }
 
         private:
             MPI_Comm comm_m;
+            bool threadMultiple_m;
         };
     }  // namespace mpi
 }  // namespace ippl
diff --git a/src/Communicate/LogEntry.cpp b/src/Communicate/LogEntry.cpp
index 444221d6d..66539e879 100644
--- a/src/Communicate/LogEntry.cpp
+++ b/src/Communicate/LogEntry.cpp
@@ -40,28 +40,31 @@ namespace ippl {
         return buffer;
     }
 
-    LogEntry LogEntry::deserialize(const std::vector<char>& buffer, size_t offset) {
+    LogEntry LogEntry::deserializeAdvance(const std::vector<char>& buffer, size_t& offset) {
         LogEntry entry;
-        size_t current_pos = offset;
 
-        entry.methodName  = deserializeString(buffer, current_pos);
-        entry.usedSize    = deserializeBasicType<size_t>(buffer, current_pos);
-        entry.freeSize    = deserializeBasicType<size_t>(buffer, current_pos);
-        entry.memorySpace = deserializeString(buffer, current_pos);
-        entry.rank        = deserializeBasicType<int>(buffer, current_pos);
+        entry.methodName  = deserializeString(buffer, offset);
+        entry.usedSize    = deserializeBasicType<size_t>(buffer, offset);
+        entry.freeSize    = deserializeBasicType<size_t>(buffer, offset);
+        entry.memorySpace = deserializeString(buffer, offset);
+        entry.rank        = deserializeBasicType<int>(buffer, offset);
 
-        auto duration   = deserializeBasicType<long long>(buffer, current_pos);
+        auto duration   = deserializeBasicType<long long>(buffer, offset);
         entry.timestamp = std::chrono::time_point<std::chrono::high_resolution_clock>(
             std::chrono::high_resolution_clock::duration(duration));
 
-        size_t mapSize = deserializeBasicType<size_t>(buffer, current_pos);
+        size_t mapSize = deserializeBasicType<size_t>(buffer, offset);
         for (size_t i = 0; i < mapSize; ++i) {
-            std::string key       = deserializeString(buffer, current_pos);
-            std::string value     = deserializeString(buffer, current_pos);
+            std::string key       = deserializeString(buffer, offset);
+            std::string value     = deserializeString(buffer, offset);
             entry.parameters[key] = value;
         }
 
         return entry;
     }
 
+    LogEntry LogEntry::deserialize(const std::vector<char>& buffer, size_t offset) {
+        return deserializeAdvance(buffer, offset);
+    }
+
 }  // namespace ippl
diff --git a/src/Communicate/LogEntry.h b/src/Communicate/LogEntry.h
index a853c3431..1929915c3 100644
--- a/src/Communicate/LogEntry.h
+++ b/src/Communicate/LogEntry.h
@@ -20,6 +20,11 @@ namespace ippl {
 
         std::vector<char> serialize() const;
         static LogEntry deserialize(const std::vector<char>& buffer, size_t offset = 0);
+
+        /// Variant of deserialize() that advances the caller's offset past
+        /// the just-read entry. Lets callers walk a byte-stream of entries
+        /// without re-serializing each one to compute its size.
+        static LogEntry deserializeAdvance(const std::vector<char>& buffer, size_t& offset);
     };
 
     template <typename T>
diff --git a/src/Communicate/LoggingBufferHandler.h b/src/Communicate/LoggingBufferHandler.h
index f6a0bd5d6..95fb418cd 100644
--- a/src/Communicate/LoggingBufferHandler.h
+++ b/src/Communicate/LoggingBufferHandler.h
@@ -1,7 +1,6 @@
 #ifndef IPPL_LOGGING_BUFFER_HANDLER_H
 #define IPPL_LOGGING_BUFFER_HANDLER_H
 
-#include <chrono>
 #include <map>
 #include <memory>
 #include <string>
diff --git a/src/Field/BareField.hpp b/src/Field/BareField.hpp
index 695576af3..14a36eb45 100644
--- a/src/Field/BareField.hpp
+++ b/src/Field/BareField.hpp
@@ -146,7 +146,7 @@ namespace ippl {
     template <typename T, unsigned Dim, class... ViewArgs>
     void BareField<T, Dim, ViewArgs...>::fillHalo() {
         if (layout_m->comm.size() > 1) {
-            halo_m.fillHalo(dview_m, layout_m);
+            halo_m.fillHalo(dview_m, layout_m, nghost_m);
         }
         if (layout_m->isAllPeriodic_m) {
             using Op = typename detail::HaloCells<T, Dim, ViewArgs...>::assign;
@@ -157,7 +157,7 @@ namespace ippl {
     template <typename T, unsigned Dim, class... ViewArgs>
     void BareField<T, Dim, ViewArgs...>::accumulateHalo() {
         if (layout_m->comm.size() > 1) {
-            halo_m.accumulateHalo(dview_m, layout_m);
+            halo_m.accumulateHalo(dview_m, layout_m, nghost_m);
         }
         if (layout_m->isAllPeriodic_m) {
             using Op = typename detail::HaloCells<T, Dim, ViewArgs...>::rhs_plus_assign;
diff --git a/src/Field/HaloCells.h b/src/Field/HaloCells.h
index c4d87b9bd..6e7606083 100644
--- a/src/Field/HaloCells.h
+++ b/src/Field/HaloCells.h
@@ -57,7 +57,7 @@ namespace ippl {
              * @param view the original field data
              * @param layout the field layout storing the domain decomposition
              */
-            void accumulateHalo(view_type& view, Layout_t* layout);
+            void accumulateHalo(view_type& view, Layout_t* layout, int nghost);
 
             /*!
              * Send halo data to internal cells for only the physical cells
@@ -75,7 +75,7 @@ namespace ippl {
              * @param view the original field data
              * @param layout the field layout storing the domain decomposition
              */
-            void fillHalo(view_type&, Layout_t* layout);
+            void fillHalo(view_type&, Layout_t* layout, int nghost);
 
             /*!
              * Pack the field data to be sent into a contiguous array.
@@ -139,7 +139,8 @@ namespace ippl {
              * unpack function call
              */
             template <class Op>
-            void exchangeBoundaries(view_type& view, Layout_t* layout, SendOrder order, int nghost = 1);
+            void exchangeBoundaries(view_type& view, Layout_t* layout, SendOrder order,
+                                    int nghost);
 
             /*!
              * Extract the subview of the original data. This does not copy.
diff --git a/src/Field/HaloCells.hpp b/src/Field/HaloCells.hpp
index eb56642ce..d99ea638f 100644
--- a/src/Field/HaloCells.hpp
+++ b/src/Field/HaloCells.hpp
@@ -7,6 +7,7 @@
 #include <vector>
 
 #include "Utility/IpplException.h"
+#include "Utility/ParallelDispatch.h"
 
 #include "Communicate/Communicator.h"
 
@@ -16,8 +17,8 @@ namespace ippl {
         HaloCells<T, Dim, ViewArgs...>::HaloCells() {}
 
         template <typename T, unsigned Dim, class... ViewArgs>
-        void HaloCells<T, Dim, ViewArgs...>::accumulateHalo(view_type& view, Layout_t* layout) {
-            exchangeBoundaries<lhs_plus_assign>(view, layout, HALO_TO_INTERNAL);
+        void HaloCells<T, Dim, ViewArgs...>::accumulateHalo(view_type& view, Layout_t* layout, int nghost) {
+            exchangeBoundaries<lhs_plus_assign>(view, layout, HALO_TO_INTERNAL, nghost);
         }
 
         template <typename T, unsigned Dim, class... ViewArgs>
@@ -25,8 +26,8 @@ namespace ippl {
             exchangeBoundaries<lhs_plus_assign>(view, layout, HALO_TO_INTERNAL_NOGHOST, nghost);
         }
         template <typename T, unsigned Dim, class... ViewArgs>
-        void HaloCells<T, Dim, ViewArgs...>::fillHalo(view_type& view, Layout_t* layout) {
-            exchangeBoundaries<assign>(view, layout, INTERNAL_TO_HALO);
+        void HaloCells<T, Dim, ViewArgs...>::fillHalo(view_type& view, Layout_t* layout, int nghost) {
+            exchangeBoundaries<assign>(view, layout, INTERNAL_TO_HALO, nghost);
         }
 
         template <typename T, unsigned Dim, class... ViewArgs>
@@ -45,9 +46,9 @@ namespace ippl {
             auto ldom = layout->getLocalNDIndex();
             for (const auto& axis : ldom) {
                 if ((axis.length() == 1) && (Dim != 1)) {
-                    throw std::runtime_error(
-                        "HaloCells: Cannot do neighbour exchange when domain decomposition "
-                        "contains planes!");
+                    throw IpplException(
+                        "HaloCells::exchangeBoundaries",
+                        "Cannot do neighbour exchange when domain decomposition contains planes.");
                 }
             }
 
@@ -62,7 +63,7 @@ namespace ippl {
                 totalRequests += componentNeighbors.size();
             }
 
-            int me=Comm->rank();
+            int me = Comm->rank();
 
             using memory_space = typename view_type::memory_space;
             using buffer_type  = mpi::Communicator::buffer_type<memory_space>;
diff --git a/src/FieldLayout/FieldLayout.h b/src/FieldLayout/FieldLayout.h
index 780040e8b..c8f1c1e2b 100644
--- a/src/FieldLayout/FieldLayout.h
+++ b/src/FieldLayout/FieldLayout.h
@@ -201,7 +201,7 @@ namespace ippl {
         FieldLayout(const mpi::Communicator& = MPI_COMM_WORLD);
 
         FieldLayout(mpi::Communicator, const NDIndex<Dim>& domain, std::array<bool, Dim> decomp,
-                    bool isAllPeriodic = false);
+                    bool isAllPeriodic = false, int nghost = 1);
 
         // Destructor: Everything deletes itself automatically ... the base
         // class destructors inform all the FieldLayoutUser's we're going away.
@@ -213,38 +213,29 @@ namespace ippl {
         // FieldLayout constructors:
 
         void initialize(const NDIndex<Dim>& domain, std::array<bool, Dim> decomp,
-                        bool isAllPeriodic = false);
+                        bool isAllPeriodic = false, int nghost = 1);
 
         // Return the domain.
         const NDIndex<Dim>& getDomain() const { return gDomain_m; }
 
-        // Compare FieldLayouts to see if they represent the same domain; if
-        // dimensionalities are different, the NDIndex operator==() will return
-        // false:
+        // Compare FieldLayouts. Different dimensionalities or different global
+        // domains are not equal; same global domain but different per-rank
+        // local-domain decompositions are also not equal.
         template <unsigned Dim2>
         bool operator==(const FieldLayout<Dim2>& x) const {
-
-            // Throw exception if the domains are not the same
-            if (gDomain_m != x.getDomain()) {
-                throw std::runtime_error("FieldLayout: only FieldLayouts with the same global domain should be compared");
-            }
-
-            return gDomain_m == x.getDomain();
-        }
-
-        bool operator==(const FieldLayout<Dim>& x) const {
-
-            // Throw exception if the domains are not the same
-            if (gDomain_m != x.getDomain()) {
-                throw std::runtime_error("FieldLayout: only FieldLayouts with the same global domain should be compared");
-            }
-
-            for (unsigned int i = 0; i < Dim; ++i) {
-                if (hLocalDomains_m(comm.rank())[i] != x.getLocalNDIndex()[i]) {
+            if constexpr (Dim != Dim2) {
+                return false;
+            } else {
+                if (gDomain_m != x.getDomain()) {
                     return false;
                 }
+                for (unsigned int i = 0; i < Dim; ++i) {
+                    if (hLocalDomains_m(comm.rank())[i] != x.getLocalNDIndex()[i]) {
+                        return false;
+                    }
+                }
+                return true;
             }
-            return true;
         }
 
         // for the requested dimension, report if the distribution is
@@ -330,7 +321,7 @@ namespace ippl {
          * Finds all neighboring ranks based on the field layout
          * @param nghost number of ghost cells (default 1)
          */
-        void findNeighbors(int nghost = 1);
+        void findNeighbors(int nghost);
 
         /*!
          * Adds a neighbor to the neighbor list
@@ -389,6 +380,9 @@ namespace ippl {
         // Minimum width of all the local domains for each dimension
         unsigned int minWidth_m[Dim];
 
+        // Nghost needed for computing send/receive ranges
+        int nghost_m;
+
         void calcWidths();
     };
 
diff --git a/src/FieldLayout/FieldLayout.hpp b/src/FieldLayout/FieldLayout.hpp
index 2bcb1bcd1..ad6ae5a53 100644
--- a/src/FieldLayout/FieldLayout.hpp
+++ b/src/FieldLayout/FieldLayout.hpp
@@ -37,7 +37,8 @@ namespace ippl {
     FieldLayout<Dim>::FieldLayout(const mpi::Communicator& communicator)
         : comm(communicator)
         , dLocalDomains_m("local domains (device)", 0)
-        , hLocalDomains_m(Kokkos::create_mirror_view(dLocalDomains_m)) {
+        , hLocalDomains_m(Kokkos::create_mirror_view(dLocalDomains_m))
+        , nghost_m(1) {
         for (unsigned int d = 0; d < Dim; ++d) {
             minWidth_m[d] = 0;
         }
@@ -45,9 +46,9 @@ namespace ippl {
 
     template <unsigned Dim>
     FieldLayout<Dim>::FieldLayout(mpi::Communicator communicator, const NDIndex<Dim>& domain,
-                                  std::array<bool, Dim> isParallel, bool isAllPeriodic)
+                                  std::array<bool, Dim> isParallel, bool isAllPeriodic, int nghost)
         : FieldLayout(communicator) {
-        initialize(domain, isParallel, isAllPeriodic);
+        initialize(domain, isParallel, isAllPeriodic, nghost);
     }
 
     template <unsigned Dim>
@@ -60,7 +61,7 @@ namespace ippl {
             hLocalDomains_m(i) = domains[i];
         }
 
-        findNeighbors();
+        findNeighbors(nghost_m);
 
         Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m);
 
@@ -69,7 +70,7 @@ namespace ippl {
 
     template <unsigned Dim>
     void FieldLayout<Dim>::initialize(const NDIndex<Dim>& domain, std::array<bool, Dim> isParallel,
-                                      bool isAllPeriodic) {
+                                      bool isAllPeriodic, int nghost) {
         int nRanks = comm.size();
 
         gDomain_m = domain;
@@ -78,11 +79,17 @@ namespace ippl {
 
         isParallelDim_m = isParallel;
 
+        nghost_m = nghost;
+
         if (nRanks < 2) {
             Kokkos::resize(dLocalDomains_m, nRanks);
             Kokkos::resize(hLocalDomains_m, nRanks);
             hLocalDomains_m(0) = domain;
             Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m);
+            // Even on a single rank we must populate minWidth_m so that
+            // getDistribution(d) returns the right answer; without this the
+            // serial-build path silently reports every dim as parallel.
+            calcWidths();
             return;
         }
 
@@ -95,8 +102,11 @@ namespace ippl {
         }
 
         if (totparelems < nRanks) {
-            throw std::runtime_error("FieldLayout:initialize: domain can only be partitioned in to "
-                + std::to_string(totparelems) + " local domains, but there are " + std::to_string(nRanks) + " ranks, decrease the number of ranks or increase the domain.");
+            throw std::runtime_error(
+                "FieldLayout:initialize: domain can only be partitioned in to "
+                + std::to_string(totparelems) + " local domains, but there are "
+                + std::to_string(nRanks)
+                + " ranks, decrease the number of ranks or increase the domain.");
         }
 
         Kokkos::resize(dLocalDomains_m, nRanks);
@@ -105,7 +115,7 @@ namespace ippl {
         detail::Partitioner<Dim> partitioner;
         partitioner.split(domain, hLocalDomains_m, isParallel, nRanks);
 
-        findNeighbors();
+        findNeighbors(nghost);
 
         Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m);
 
@@ -288,7 +298,7 @@ namespace ippl {
             // 0 - touching the lower axis value
             // 1 - touching the upper axis value
             // 2 - parallel to the axis
-            if (intersect[d].length() == 1) {
+            if (intersect[d].length() == static_cast<size_t>(nghost)) {
                 if (gnd[d].first() != intersect[d].first()) {
                     index += digit;
                 }
diff --git a/src/FieldLayout/SubFieldLayout.hpp b/src/FieldLayout/SubFieldLayout.hpp
index 53ee24c11..50a83f9e2 100644
--- a/src/FieldLayout/SubFieldLayout.hpp
+++ b/src/FieldLayout/SubFieldLayout.hpp
@@ -102,7 +102,7 @@ namespace ippl {
             }
         }
 
-        this->findNeighbors();
+        this->findNeighbors(this->nghost_m);
 
         Kokkos::deep_copy(this->dLocalDomains_m, this->hLocalDomains_m);
 
diff --git a/src/Ippl.h b/src/Ippl.h
index 174c72a29..a606c655c 100644
--- a/src/Ippl.h
+++ b/src/Ippl.h
@@ -10,7 +10,6 @@
 #include "Types/IpplTypes.h"
 
 #include "Utility/Inform.h"
-#include "Utility/ParallelDispatch.h"
 
 #include "Communicate/Communicator.h"
 #include "Communicate/Environment.h"
@@ -46,6 +45,8 @@ namespace ippl {
     }  // namespace detail
 }  // namespace ippl
 
+#include "Utility/ParallelDispatch.h"
+
 // FIMXE remove (only for backwards compatibility)
 #include "IpplCore.h"
 
diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h
index 3d41b9029..7a720475f 100644
--- a/src/Particle/ParticleAttrib.h
+++ b/src/Particle/ParticleAttrib.h
@@ -20,8 +20,12 @@
 
 #include "Expression/IpplExpressions.h"
 
+#ifdef IPPL_ENABLE_FFT
+#include "FFT/FFT.h"
+#endif
 #include "Interpolation/CIC.h"
 #include "Particle/ParticleAttribBase.h"
+#include "Particle/SortBuffer.h"
 
 namespace ippl {
 
@@ -73,32 +77,46 @@ namespace ippl {
         void unpack(size_type) override;
 
         void serialize(detail::Archive<memory_space>& ar, size_type nsends) override {
-            ar.serialize(buf_m, nsends);
+            ar.serialize(dview_m, nsends);
+        }
+
+        void serialize(detail::Archive<memory_space>& ar, const hash_type& hash,
+                       size_type nsends) override {
+            ar.serialize(dview_m, hash, nsends);
         }
 
         void deserialize(detail::Archive<memory_space>& ar, size_type nrecvs) override {
             ar.deserialize(buf_m, nrecvs);
         }
 
-        virtual ~ParticleAttrib() = default;
-        
+        void deserialize(detail::Archive<memory_space>& ar, size_type offset,
+                         size_type nrecvs) override {
+            this->resize(offset + nrecvs);
+            ar.deserialize(dview_m, offset, nrecvs);
+        }
+
+        KOKKOS_INLINE_FUNCTION virtual ~ParticleAttrib() = default;
+
         size_type size() const override { return dview_m.extent(0); }
-        
+
         size_type packedSize(const size_type count) const override {
             return count * sizeof(value_type);
         }
-        
-        void resize(size_type n) { Kokkos::resize(dview_m, n); }
 
         /*!
-         * @brief Reallocate the underlying view with a new size.
+         * @brief Resize the underlying view, preserving existing entries on grow.
          *
-         * This function reallocates the device view to a new size. Existing data is
-         * discarded and should not be relied upon after this call. Note that this function does not
-         * apply overallocation. For use from outside, call `ParticleAttrib::alloc(size_type)`
-         * instead.
+         * Capacity-only operation; does not change the live particle count.
+         * Overallocation is the caller's responsibility (`alloc()` / `create()`
+         * apply it when sizing this view).
+         */
+        void resize(size_type n) { Kokkos::resize(dview_m, n); }
+
+        /*!
+         * @brief Reallocate the underlying view, discarding existing entries.
          *
-         * @param n The new size to allocate in the internal view.
+         * Capacity-only operation. Does not apply the default overallocation
+         * factor — call `alloc()` instead from the outside.
          */
         void realloc(size_type n) { Kokkos::realloc(dview_m, n); }
 
@@ -112,11 +130,22 @@ namespace ippl {
 
         KOKKOS_INLINE_FUNCTION T& operator()(const size_t i) const { return dview_m(i); }
 
-        view_type& getView() { return dview_m; }
-
-        const view_type& getView() const { return dview_m; }
+        /*!
+         * Returns a subview of the underlying storage covering only the live
+         * particle range [0, size()).
+         */
+        view_type getView() {
+            return Kokkos::subview(dview_m,
+                                   Kokkos::make_pair(size_type(0),
+                                                     static_cast<size_type>(*(this->localNum_mp))));
+        }
+        const view_type getView() const {
+            return Kokkos::subview(dview_m,
+                                   Kokkos::make_pair(size_type(0),
+                                                     static_cast<size_type>(*(this->localNum_mp))));
+        }
 
-        host_mirror_type getHostMirror() const { return Kokkos::create_mirror(dview_m); }
+        host_mirror_type getHostMirror() const { return Kokkos::create_mirror(getView()); }
 
         void set_name(const std::string& name_) override {
             size_t len = name_.size();
@@ -227,9 +256,21 @@ namespace ippl {
         void internalCopy(const hash_type& indices) override;
 
     private:
-        view_type dview_m;
-        view_type buf_m;
+        view_type dview_m{"ParticleAttrib::dview", 0};
+        view_type buf_m{"ParticleAttrib::buf", 0};
     };
+
+    namespace detail {
+        template <typename Attrib>
+        struct AttribTraits;
+
+        template <typename T, class... Props>
+        struct AttribTraits<ParticleAttrib<T, Props...>> {
+            using value_type = T;
+            using view_type =
+                std::decay_t<decltype(std::declval<ParticleAttrib<T, Props...>>().getView())>;
+        };
+    }  // namespace detail
 }  // namespace ippl
 
 #include "Particle/ParticleAttrib.hpp"
diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp
index 1422bb9dd..955f83db5 100644
--- a/src/Particle/ParticleAttrib.hpp
+++ b/src/Particle/ParticleAttrib.hpp
@@ -17,8 +17,15 @@
 
 #include "Communicate/DataTypes.h"
 
+#include "Utility/BufferView.h"
 #include "Utility/IpplTimings.h"
 
+#ifdef IPPL_ENABLE_FFT
+#include "FFT/FFT.h"
+#endif
+#include "Particle/ParticleSort.h"
+#include "Particle/SortBuffer.h"
+
 namespace ippl {
 
     template <typename T, class... Properties>
@@ -71,17 +78,12 @@ namespace ippl {
         Kokkos::parallel_for(
             "ParticleAttrib::pack()", policy_type(0, size),
             KOKKOS_LAMBDA(const size_t i) { buf(i) = dview(hash(i)); });
-        Kokkos::fence();
     }
 
     template <typename T, class... Properties>
     void ParticleAttrib<T, Properties...>::unpack(size_type nrecvs) {
-        auto size          = dview_m.extent(0);
         size_type required = *(this->localNum_mp) + nrecvs;
-        if (size < required) {
-            int overalloc = Comm->getDefaultOverallocation();
-            this->resize(required * overalloc);
-        }
+        this->resize(required);
 
         size_type count   = *(this->localNum_mp);
         auto buf = buf_m;
@@ -94,7 +96,6 @@ namespace ippl {
     }
 
     template <typename T, class... Properties>
-    // KOKKOS_INLINE_FUNCTION
     ParticleAttrib<T, Properties...>& ParticleAttrib<T, Properties...>::operator=(T x) {
         auto dview = dview_m;
         using policy_type = Kokkos::RangePolicy<execution_space>;
@@ -106,7 +107,6 @@ namespace ippl {
 
     template <typename T, class... Properties>
     template <typename E, size_t N>
-    // KOKKOS_INLINE_FUNCTION
     ParticleAttrib<T, Properties...>& ParticleAttrib<T, Properties...>::operator=(
         detail::Expression<E, N> const& expr) {
         using capture_type = detail::CapturedExpression<E, N>;
@@ -147,11 +147,11 @@ namespace ippl {
         const NDIndex<Dim>& lDom       = layout.getLocalNDIndex();
         const int nghost               = f.getNghost();
 
-        //using policy_type = Kokkos::RangePolicy<execution_space>;
         const bool useHashView = hash_array.extent(0) > 0;
         if (useHashView && (iteration_policy.end() > hash_array.extent(0))) {
             Inform m("scatter");
-            m << "Hash array was passed to scatter, but size does not match iteration policy." << endl;
+            m << "Hash array was passed to scatter, but size does not match iteration policy."
+              << endl;
             ippl::Comm->abort();
         }
         auto dview = dview_m;
@@ -159,10 +159,8 @@ namespace ippl {
         Kokkos::parallel_for(
             "ParticleAttrib::scatter", iteration_policy,
             KOKKOS_LAMBDA(const size_t idx) {
-                // map index to possible hash_map
                 size_t mapped_idx = useHashView ? hash_array(idx) : idx;
 
-                // find nearest grid point
                 vector_type l                        = (ppview(mapped_idx) - origin) * invdx + 0.5;
                 Vector<int, Field::dim> index        = l;
                 Vector<PositionType, Field::dim> whi = l - index;
@@ -170,7 +168,6 @@ namespace ippl {
 
                 Vector<size_t, Field::dim> args = index - lDom.first() + nghost;
 
-                // scatter
                 const value_type& val = dview(mapped_idx);
                 detail::scatterToField(std::make_index_sequence<1 << Field::dim>{}, view, wlo, whi,
                                        args, val);
@@ -219,7 +216,6 @@ namespace ippl {
         Kokkos::parallel_for(
             "ParticleAttrib::gather", policy_type(0, *(this->localNum_mp)),
             KOKKOS_LAMBDA(const size_t idx) {
-                // find nearest grid point
                 vector_type l                        = (ppview(idx) - origin) * invdx + 0.5;
                 Vector<int, Field::dim> index        = l;
                 Vector<PositionType, Field::dim> whi = l - index;
@@ -227,9 +223,8 @@ namespace ippl {
 
                 Vector<size_t, Field::dim> args = index - lDom.first() + nghost;
 
-                // gather
-                value_type gathered = detail::gatherFromField(std::make_index_sequence<1 << Field::dim>{},
-                                                              view, wlo, whi, args);
+                value_type gathered = detail::gatherFromField(
+                    std::make_index_sequence<1 << Field::dim>{}, view, wlo, whi, args);
                 if (addToAttribute) {
                     dview(idx) += gathered;
                 } else {
@@ -240,10 +235,8 @@ namespace ippl {
     }
 
     template <typename T, class... Properties>
-    void ParticleAttrib<T, Properties...>::applyPermutation(
-        const hash_type& permutation) {
-
-        const auto view = this->getView();
+    void ParticleAttrib<T, Properties...>::applyPermutation(const hash_type& permutation) {
+        const auto view = this->getView();  // trimmed to localNum_mp
         const auto size = this->getParticleCount();
 
         view_type temp("copy", size);
@@ -255,31 +248,28 @@ namespace ippl {
 
         Kokkos::fence();
 
-        Kokkos::deep_copy(Kokkos::subview(view, Kokkos::make_pair<size_type, size_type>(0, size)), temp);
+        Kokkos::deep_copy(view, temp);
     }
 
-    template<typename T, class... Properties>
-    void ParticleAttrib<T, Properties...>::internalCopy(
-        const hash_type &indices) {
+    template <typename T, class... Properties>
+    void ParticleAttrib<T, Properties...>::internalCopy(const hash_type& indices) {
         auto copySize = indices.size();
-        create(copySize);
 
-        auto view = this->getView();
-        const auto size = this->getParticleCount();
+        // Snapshot the current count BEFORE create() increments localNum_mp.
+        const size_type oldSize = *(this->localNum_mp);
 
-        using policy_type = Kokkos::RangePolicy<execution_space>;
+        create(copySize);  // localNum_mp becomes oldSize + copySize
+
+        auto view = this->getView();
         Kokkos::parallel_for(
-            "Copy to temp", policy_type(0, copySize),
-            KOKKOS_LAMBDA(const size_type &i) {
-            view(size + i) = view(i);
-        });
+            "internalCopy", Kokkos::RangePolicy<execution_space>(0, copySize),
+            KOKKOS_LAMBDA(const size_type& i) { view(oldSize + i) = view(indices(i)); });
 
         Kokkos::fence();
     }
 
     /*
-     * Non-class function
-     *
+     * Non-class functions
      */
 
     /**
@@ -287,7 +277,7 @@ namespace ippl {
      *
      * This overload preserves legacy functionality by providing a default iteration policy.
      * It calls the member scatter() with a default Kokkos::RangePolicy.
-     * 
+     *
      * @note The default behaviour is to scatter all particles without any custom index mapping.
      *
      * @tparam Attrib1 The type of the particle attribute.
@@ -298,19 +288,21 @@ namespace ippl {
      * @param f The field onto which the data is scattered.
      * @param pp The ParticleAttrib representing particle positions.
      */
-    template <typename Attrib1, typename Field, typename Attrib2, 
-                typename policy_type = Kokkos::RangePolicy<typename Field::execution_space>>
+    template <typename Attrib1, typename Field, typename Attrib2,
+              typename policy_type = Kokkos::RangePolicy<typename Field::execution_space>>
     inline void scatter(const Attrib1& attrib, Field& f, const Attrib2& pp) {
-        attrib.scatter(f, pp, policy_type(0, attrib.getParticleCount())); 
+        attrib.scatter(f, pp, policy_type(0, attrib.getParticleCount()));
     }
 
     /**
-     * @brief Non-class interface for scattering with a custom iteration policy and optional index array.
+     * @brief Non-class interface for scattering with a custom iteration policy and optional index
+     * array.
      *
      * This overload allows the caller to specify a custom `Kokkos::range_policy` and an optional
      * `ippl::hash_type` array. It forwards the parameters to the member scatter() function.
-     * 
-     * @note See ParticleAttrib::scatter() for more information on the custom iteration functionality.
+     *
+     * @note See ParticleAttrib::scatter() for more information on the custom iteration
+     * functionality.
      *
      * @tparam Attrib1 The type of the particle attribute.
      * @tparam Field The type of the field.
@@ -322,33 +314,34 @@ namespace ippl {
      * @param iteration_policy A custom `Kokkos::range_policy` defining the iteration range.
      * @param hash_array An optional `ippl::hash_type` array for index mapping.
      */
-    template <typename Attrib1, typename Field, typename Attrib2, 
-                typename policy_type = Kokkos::RangePolicy<typename Field::execution_space>>
-    inline void scatter(const Attrib1& attrib, Field& f, const Attrib2& pp, 
-                        policy_type iteration_policy, typename Attrib1::hash_type hash_array = {}) {
+    template <typename Attrib1, typename Field, typename Attrib2,
+              typename policy_type = Kokkos::RangePolicy<typename Field::execution_space>>
+    inline void scatter(const Attrib1& attrib, Field& f, const Attrib2& pp,
+                        policy_type iteration_policy,
+                        typename Attrib1::hash_type hash_array = {}) {
         attrib.scatter(f, pp, iteration_policy, hash_array);
     }
 
     /**
      * @brief Non-class interface for gathering field data into a particle attribute.
-     * 
+     *
      * This interface calls the member ParticleAttrib::gather() function with the provided
      * parameters and preserving legacy behavior by assigning `addToAttribute` a default value.
-     * 
+     *
      * @note See ParticleAttrib::gather() for more information on the behavior of `addToAttribute`.
-     * 
+     *
      * @tparam Attrib1 The type of the particle attribute.
      * @tparam Field The type of the field.
      * @tparam Attrib2 The type of the particle position attribute.
      * @param attrib The particle attribute to gather data into.
      * @param f The field from which data is gathered.
      * @param pp The ParticleAttrib representing particle positions.
-     * @param addToAttribute If true, the gathered field value is added to the current attribute value;
-     *                       otherwise, the attribute value is overwritten.
+     * @param addToAttribute If true, the gathered field value is added to the current attribute
+     * value; otherwise, the attribute value is overwritten.
      */
     template <typename Attrib1, typename Field, typename Attrib2>
-    inline void gather(Attrib1& attrib, Field& f, const Attrib2& pp, 
-                        const bool addToAttribute = false) {
+    inline void gather(Attrib1& attrib, Field& f, const Attrib2& pp,
+                       const bool addToAttribute = false) {
         attrib.gather(f, pp, addToAttribute);
     }
 
@@ -374,4 +367,5 @@ namespace ippl {
     DefineParticleReduction(Max, max, if (myVal > valL) valL = myVal, std::greater)
     DefineParticleReduction(Min, min, if (myVal < valL) valL = myVal, std::less)
     DefineParticleReduction(Prod, prod, valL *= myVal, std::multiplies)
+
 }  // namespace ippl
diff --git a/src/Particle/ParticleAttribBase.h b/src/Particle/ParticleAttribBase.h
index 4dea4ac8e..4526f5dbe 100644
--- a/src/Particle/ParticleAttribBase.h
+++ b/src/Particle/ParticleAttribBase.h
@@ -73,19 +73,22 @@ namespace ippl {
 
             virtual void unpack(size_type) = 0;
 
-            virtual void serialize(Archive<memory_space>& ar, size_type nsends) = 0;
-
+            virtual void serialize(Archive<memory_space>& ar, size_type nsends)   = 0;
+            virtual void serialize(detail::Archive<memory_space>& ar, const hash_type& hash,
+                                   size_type nsends)                              = 0;
             virtual void deserialize(Archive<memory_space>& ar, size_type nrecvs) = 0;
+            virtual void deserialize(detail::Archive<memory_space>& ar, size_type offset,
+                                     size_type nrecvs)                            = 0;
 
             virtual size_type size() const = 0;
 
-            virtual ~ParticleAttribBase() = default;
+            KOKKOS_INLINE_FUNCTION virtual ~ParticleAttribBase() = default;
 
             void setParticleCount(size_type& num) { localNum_mp = &num; }
             size_type getParticleCount() const { return *localNum_mp; }
 
             virtual void applyPermutation(const hash_type&) = 0;
-            virtual void internalCopy(const hash_type&) = 0;
+            virtual void internalCopy(const hash_type&)     = 0;
 
         protected:
             const size_type* localNum_mp;
diff --git a/src/Particle/ParticleBase.h b/src/Particle/ParticleBase.h
index 9fbed2faa..9793da2a9 100644
--- a/src/Particle/ParticleBase.h
+++ b/src/Particle/ParticleBase.h
@@ -299,9 +299,18 @@ namespace ippl {
         /* This function does not alter the totalNum_m member function. It should only be called
          * during the update function where we know the number of particles remains the same.
          */
+        template <typename memory_space, typename execution_space, typename F,
+                  typename... Properties>
+        void internalDestroy(const F& invalid_functor, const size_type destroyNum);
         template <typename... Properties>
         void internalDestroy(const Kokkos::View<bool*, Properties...>& invalid,
-                             const size_type destroyNum);
+                             const size_type destroyNum) {
+            using view_type       = Kokkos::View<bool*, Properties...>;
+            using memory_space    = typename view_type::memory_space;
+            using execution_space = typename view_type::execution_space;
+            internalDestroy<memory_space, execution_space>(
+                KOKKOS_LAMBDA(size_t i) { return invalid(i); }, destroyNum);
+        }
 
         /*!
          * Sends particles to another rank
@@ -316,6 +325,9 @@ namespace ippl {
         void sendToRank(int rank, int tag, std::vector<MPI_Request>& requests,
                         const HashType& hash);
 
+        template <typename HashType>
+        MPI_Request sendToRank(int rank, int tag, const HashType& hash);
+
         /*!
          * Receives particles from another rank
          * @param rank the source rank
@@ -324,6 +336,9 @@ namespace ippl {
          */
         void recvFromRank(int rank, int tag, size_type nRecvs);
 
+        std::pair<MPI_Request, std::function<void(size_type)>> postRecvFromRank(int rank, int tag,
+                                                                                size_type nRecvs);
+
         /*!
          * Serialize to do MPI calls.
          * @param ar archive
diff --git a/src/Particle/ParticleBase.hpp b/src/Particle/ParticleBase.hpp
index a71a917d6..fbbe72638 100644
--- a/src/Particle/ParticleBase.hpp
+++ b/src/Particle/ParticleBase.hpp
@@ -62,8 +62,6 @@ namespace ippl {
             addAttribute(ID);
         }
         addAttribute(R);
-        ID.set_name("ID");
-        R.set_name("position");
     }
 
     template <class PLayout, typename... IP>
@@ -175,9 +173,9 @@ namespace ippl {
     }
 
     template <class PLayout, typename... IP>
-    template <typename... Properties>
-    void ParticleBase<PLayout, IP...>::internalDestroy(
-        const Kokkos::View<bool*, Properties...>& invalid, const size_type destroyNum) {
+    template <typename memory_space, typename execution_space, typename F, typename... Properties>
+    void ParticleBase<PLayout, IP...>::internalDestroy(const F& invalid_functor,
+                                                       const size_type destroyNum) {
         PAssert(destroyNum <= localNum_m);
 
         // If there aren't any particles to delete, do nothing
@@ -193,19 +191,18 @@ namespace ippl {
             return;
         }
 
-        using view_type       = Kokkos::View<bool*, Properties...>;
-        using memory_space    = typename view_type::memory_space;
-        using execution_space = typename view_type::execution_space;
-        using policy_type     = Kokkos::RangePolicy<execution_space>;
-        auto& locDeleteIndex  = deleteIndex_m.get<memory_space>();
-        auto& locKeepIndex    = keepIndex_m.get<memory_space>();
+        using policy_type    = Kokkos::RangePolicy<execution_space>;
+        auto& locDeleteIndex = deleteIndex_m.get<memory_space>();
+        auto& locKeepIndex   = keepIndex_m.get<memory_space>();
 
-        // Resize buffers, if necessary
+        // Resize buffers per-memory-space. The previous code mistakenly used
+        // the outer `memory_space` template parameter inside the per-space
+        // lambda, leaving every other space's del/keep buffers undersized.
         detail::runForAllSpaces([&]<typename MemorySpace>() {
-            if (attributes_m.template get<memory_space>().size() > 0) {
+            if (attributes_m.template get<MemorySpace>().size() > 0) {
                 int overalloc = Comm->getDefaultOverallocation();
-                auto& del     = deleteIndex_m.get<memory_space>();
-                auto& keep    = keepIndex_m.get<memory_space>();
+                auto& del     = deleteIndex_m.template get<MemorySpace>();
+                auto& keep    = keepIndex_m.template get<MemorySpace>();
                 if (del.size() < destroyNum) {
                     Kokkos::realloc(del, destroyNum * overalloc);
                     Kokkos::realloc(keep, destroyNum * overalloc);
@@ -220,10 +217,10 @@ namespace ippl {
         Kokkos::parallel_scan(
             "Scan in ParticleBase::destroy()", policy_type(0, localNum_m - destroyNum),
             KOKKOS_LAMBDA(const size_t i, int& idx, const bool final) {
-                if (final && invalid(i)) {
+                if (final && invalid_functor(i)) {
                     locDeleteIndex(idx) = i;
                 }
-                if (invalid(i)) {
+                if (invalid_functor(i)) {
                     idx += 1;
                 }
             });
@@ -245,10 +242,10 @@ namespace ippl {
             "Second scan in ParticleBase::destroy()",
             Kokkos::RangePolicy<size_type, execution_space>(localNum_m - destroyNum, localNum_m),
             KOKKOS_LAMBDA(const size_t i, int& idx, const bool final) {
-                if (final && !invalid(i)) {
+                if (final && !invalid_functor(i)) {
                     locKeepIndex(idx) = i;
                 }
-                if (!invalid(i)) {
+                if (!invalid_functor(i)) {
                     idx += 1;
                 }
             });
@@ -280,27 +277,39 @@ namespace ippl {
 
     template <class PLayout, typename... IP>
     template <typename HashType>
-    void ParticleBase<PLayout, IP...>::sendToRank(int rank, int tag,
-                                                  std::vector<MPI_Request>& requests,
-                                                  const HashType& hash) {
-        size_type nSends = hash.size();
-        requests.resize(requests.size() + 1);
+    MPI_Request ParticleBase<PLayout, IP...>::sendToRank(int rank, int tag, const HashType& hash) {
+        size_type nSends    = hash.size();
+        MPI_Request request = MPI_REQUEST_NULL;
 
         auto hashes = hash_container_type(hash, [&]<typename MemorySpace>() {
             return attributes_m.template get<MemorySpace>().size() > 0;
         });
-        pack(hashes);
         detail::runForAllSpaces([&]<typename MemorySpace>() {
             size_type bufSize = packedSize<MemorySpace>(nSends);
             if (bufSize == 0) {
                 return;
             }
-
             auto buf = Comm->getBuffer<MemorySpace>(bufSize);
 
-            Comm->isend(rank, tag++, *this, *buf, requests.back(), nSends);
+            forAllAttributes<MemorySpace>([&]<typename Attribute>(Attribute& att) {
+                att->serialize(*buf, hashes.template get<MemorySpace>(), nSends);
+            });
+
+            Comm->isend(rank, tag++, *buf, request);
             buf->resetWritePos();
         });
+        return request;
+    }
+
+    template <class PLayout, typename... IP>
+    template <typename HashType>
+    void ParticleBase<PLayout, IP...>::sendToRank(int rank, int tag,
+                                                  std::vector<MPI_Request>& requests,
+                                                  const HashType& hash) {
+        auto hashes = hash_container_type(hash, [&]<typename MemorySpace>() {
+            return attributes_m.template get<MemorySpace>().size() > 0;
+        });
+        requests.push_back(sendToRank(rank, tag, hash));
     }
 
     template <class PLayout, typename... IP>
@@ -313,10 +322,46 @@ namespace ippl {
 
             auto buf = Comm->getBuffer<MemorySpace>(bufSize);
 
-            Comm->recv(rank, tag++, *this, *buf, bufSize, nRecvs);
+            Comm->recv(rank, tag++, *buf, bufSize);
+            forAllAttributes<MemorySpace>([&]<typename Attribute>(Attribute& att) {
+                att->deserialize(*buf, localNum_m, nRecvs);
+            });
+
             buf->resetReadPos();
         });
-        unpack(nRecvs);
+        localNum_m += nRecvs;
+    }
+
+    template <class PLayout, typename... IP>
+    std::pair<MPI_Request, std::function<void(size_t)>>
+    ParticleBase<PLayout, IP...>::postRecvFromRank(int rank, int tag, size_type nRecvs) {
+        MPI_Request request = MPI_REQUEST_NULL;
+
+        // Collect (buf, nRecvs) per memory space for deferred deserialization
+        auto deferred = std::make_shared<std::vector<std::function<void(size_type)>>>();
+
+        detail::runForAllSpaces([&]<typename MemorySpace>() {
+            size_type bufSize = packedSize<MemorySpace>(nRecvs);
+            if (bufSize == 0)
+                return;
+
+            auto buf = Comm->getBuffer<MemorySpace>(bufSize);
+            Comm->irecv(rank, tag++, *buf, request, bufSize);
+
+            deferred->push_back([this, buf, nRecvs](size_type offset) {
+                forAllAttributes<MemorySpace>([&]<typename Attribute>(Attribute& att) {
+                    att->deserialize(*buf, offset, nRecvs);
+                });
+                buf->resetReadPos();
+            });
+        });
+
+        // Finalize takes the offset (localNum_m after destruction) and advances it
+        return {request, [this, deferred, nRecvs](size_type offset) {
+                    for (auto& fn : *deferred)
+                        fn(offset);
+                    localNum_m += nRecvs;
+                }};
     }
 
     template <class PLayout, typename... IP>
diff --git a/src/Particle/ParticleSort.h b/src/Particle/ParticleSort.h
new file mode 100644
index 000000000..f5eaa3336
--- /dev/null
+++ b/src/Particle/ParticleSort.h
@@ -0,0 +1,331 @@
+#ifndef IPPL_PARTICLE_SORT_H
+#define IPPL_PARTICLE_SORT_H
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_Sort.hpp>
+#include <algorithm>
+#include <cstdint>
+#include <stdexcept>
+#include <vector>
+
+#include "Particle/SortBuffer.h"
+
+#ifdef KOKKOS_ENABLE_CUDA
+#include <cub/device/device_radix_sort.cuh>
+#endif
+
+#ifdef KOKKOS_ENABLE_HIP
+#include <rocprim/rocprim.hpp>
+#endif
+
+namespace ippl {
+    namespace detail {
+
+        /**
+         * @brief Validate that all permutation indices are in bounds
+         */
+        template <typename ExecSpace, typename IndexView>
+        void validatePermutation(const IndexView& permute, size_t n,
+                                 const char* label = "sortParticles") {
+            using size_type = size_t;
+
+            size_type num_invalid = 0;
+            Kokkos::parallel_reduce(
+                label, Kokkos::RangePolicy<ExecSpace>(0, n),
+                KOKKOS_LAMBDA(size_type i, size_type & count) {
+                    if (permute(i) >= n)
+                        count++;
+                },
+                num_invalid);
+            Kokkos::fence();
+
+            if (num_invalid > 0) {
+                size_type first_invalid_idx = n;
+                Kokkos::parallel_reduce(
+                    "find_first_invalid", Kokkos::RangePolicy<ExecSpace>(0, n),
+                    KOKKOS_LAMBDA(size_type i, size_type & first_idx) {
+                        if (permute(i) >= n && i < first_idx)
+                            first_idx = i;
+                    },
+                    Kokkos::Min<size_type>(first_invalid_idx));
+                Kokkos::fence();
+
+                auto permute_host =
+                    Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, permute);
+
+                throw std::runtime_error(std::string(label) + ": Found "
+                                         + std::to_string(num_invalid)
+                                         + " invalid permutation indices. First invalid: permute["
+                                         + std::to_string(first_invalid_idx)
+                                         + "] = " + std::to_string(permute_host(first_invalid_idx))
+                                         + " (n = " + std::to_string(n) + ")");
+            }
+        }
+
+        /**
+         * @brief Compute Morton code (Z-order curve) for spatial sorting
+         */
+        template <unsigned Dim, typename T, typename IndexType = size_t>
+        KOKKOS_INLINE_FUNCTION uint64_t computeMortonCode(const Vector<T, Dim>& position,
+                                                          const Vector<T, Dim>& origin,
+                                                          const Vector<T, Dim>& invdx,
+                                                          const Vector<IndexType, Dim>& ngrid) {
+            uint64_t morton            = 0;
+            constexpr int bits_per_dim = 21;  // 21*3 = 63 bits, safe for 3D
+
+            uint32_t gridIndices[Dim];
+            for (unsigned d = 0; d < Dim; ++d) {
+                T sx = (position[d] - origin[d]) * invdx[d];
+                sx -= ngrid[d] * Kokkos::floor(sx / ngrid[d]);  // wrap to [0, ngrid)
+                gridIndices[d] = static_cast<uint32_t>(sx);
+            }
+
+            for (int bit = 0; bit < bits_per_dim; ++bit) {
+                for (unsigned d = 0; d < Dim; ++d) {
+                    if (gridIndices[d] & (1u << bit)) {
+                        morton |= (uint64_t(1) << (bit * Dim + d));
+                    }
+                }
+            }
+            return morton;
+        }
+
+        /**
+         * @brief Functor to compute Morton codes for all particles
+         */
+        template <unsigned Dim, typename PositionView, typename T, typename KeyView>
+        struct ComputeMortonCodesFunctor {
+            using size_type = size_t;
+
+            PositionView positions;
+            KeyView keys;
+            Vector<T, Dim> origin;
+            Vector<T, Dim> invdx;
+            Vector<size_type, Dim> ngrid;
+
+            KOKKOS_INLINE_FUNCTION void operator()(size_type i) const {
+                keys(i) = computeMortonCode<Dim, T, size_type>(positions(i), origin, invdx, ngrid);
+            }
+        };
+
+        // -------------------------------------------------------------------
+        // Platform-specific sort implementations
+        // -------------------------------------------------------------------
+
+        /**
+         * @brief Sort particles on host using std::sort on (morton, index) pairs
+         *
+         * @return Subview of the buffered permute array, valid until next ensureCapacity
+         */
+        template <unsigned Dim, typename T>
+        Kokkos::View<size_t*, Kokkos::HostSpace> sortParticlesHost(
+            Kokkos::View<Vector<T, Dim>*, Kokkos::HostSpace> positions,
+            const Vector<T, Dim>& origin, const Vector<T, Dim>& invdx,
+            const Vector<size_t, Dim>& ngrid, size_t n) {
+            auto& bufs = ippl::detail::getDefaultBinSortBuffers<Kokkos::HostSpace>();
+            bufs.ensureCapacity(n, /*n_bins_p1=*/1);
+
+            auto& permute = bufs.permute();
+
+            // Build (key, index) pairs, sort, then extract permutation
+            std::vector<std::pair<uint64_t, size_t>> pairs(n);
+            for (size_t i = 0; i < n; ++i) {
+                pairs[i] = {computeMortonCode<Dim, T, size_t>(positions(i), origin, invdx, ngrid),
+                            i};
+            }
+            std::sort(pairs.begin(), pairs.end());  // pair has lexicographic < by .first
+
+            for (size_t i = 0; i < n; ++i) {
+                permute(i) = pairs[i].second;
+            }
+
+            return permute;
+        }
+
+#ifdef KOKKOS_ENABLE_CUDA
+        /**
+         * @brief Sort particles on CUDA using CUB DeviceRadixSort with buffered temporaries
+         *
+         * @return Subview of the buffered permOut array containing sorted particle indices
+         */
+        template <unsigned Dim, typename T>
+        Kokkos::View<size_t*, Kokkos::CudaSpace> sortParticlesCuda(
+            Kokkos::View<Vector<T, Dim>*, Kokkos::CudaSpace> positions,
+            const Vector<T, Dim>& origin, const Vector<T, Dim>& invdx,
+            const Vector<size_t, Dim>& ngrid, size_t n) {
+            using size_type = size_t;
+            auto& bufs      = ippl::detail::getDefaultBinSortBuffers<Kokkos::CudaSpace>();
+            bufs.ensureCapacity(n, /*n_bins_p1=*/1);
+
+            // Step 1: compute Morton codes into binKeys, init permute to identity
+            auto keys    = bufs.binKeys();
+            auto permute = bufs.permute();
+
+            Kokkos::parallel_for(
+                "MortonSort::ComputeKeys", Kokkos::RangePolicy<Kokkos::Cuda>(0, n),
+                ComputeMortonCodesFunctor<Dim, decltype(positions), T, decltype(keys)>{
+                    positions, keys, origin, invdx, ngrid});
+
+            Kokkos::parallel_for(
+                "MortonSort::InitPermute", Kokkos::RangePolicy<Kokkos::Cuda>(0, n),
+                KOKKOS_LAMBDA(size_type i) { permute(i) = i; });
+            Kokkos::fence();
+
+            // Step 2: query temp storage, grow buffer if needed, then sort
+            auto keys_out = bufs.keysOut();
+            auto perm_out = bufs.permOut();
+
+            size_t temp_bytes = 0;
+            cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, keys.data(), keys_out.data(),
+                                            permute.data(), perm_out.data(), static_cast<int>(n));
+
+            bufs.ensureTempStorage(temp_bytes);
+
+            cub::DeviceRadixSort::SortPairs(bufs.tempStorage().data(), temp_bytes, keys.data(),
+                                            keys_out.data(), permute.data(), perm_out.data(),
+                                            static_cast<int>(n));
+            Kokkos::fence();
+
+            return bufs.permOut();
+        }
+#endif
+
+#ifdef KOKKOS_ENABLE_HIP
+        /**
+         * @brief Sort particles on HIP using rocPRIM radix_sort_pairs with buffered temporaries
+         *
+         * @return Subview of the buffered permOut array containing sorted particle indices
+         */
+        template <unsigned Dim, typename T>
+        Kokkos::View<size_t*, Kokkos::HIPSpace> sortParticlesHip(
+            Kokkos::View<Vector<T, Dim>*, Kokkos::HIPSpace> positions, const Vector<T, Dim>& origin,
+            const Vector<T, Dim>& invdx, const Vector<size_t, Dim>& ngrid, size_t n) {
+            using size_type = size_t;
+            auto& bufs      = ippl::detail::getDefaultBinSortBuffers<Kokkos::HIPSpace>();
+            bufs.ensureCapacity(n, /*n_bins_p1=*/1);
+
+            auto keys    = bufs.binKeys();
+            auto permute = bufs.permute();
+
+            Kokkos::parallel_for(
+                "MortonSort::ComputeKeys", Kokkos::RangePolicy<Kokkos::HIP>(0, n),
+                ComputeMortonCodesFunctor<Dim, decltype(positions), T, decltype(keys)>{
+                    positions, keys, origin, invdx, ngrid});
+
+            Kokkos::parallel_for(
+                "MortonSort::InitPermute", Kokkos::RangePolicy<Kokkos::HIP>(0, n),
+                KOKKOS_LAMBDA(size_type i) { permute(i) = i; });
+            Kokkos::fence();
+
+            auto keys_out = bufs.keysOut();
+            auto perm_out = bufs.permOut();
+
+            size_t temp_bytes = 0;
+            std::ignore = rocprim::radix_sort_pairs(nullptr, temp_bytes, keys.data(), keys_out.data(),
+                                      permute.data(), perm_out.data(), n, 0, sizeof(uint64_t) * 8,
+                                      Kokkos::HIP().hip_stream());
+
+            bufs.ensureTempStorage(temp_bytes);
+
+            std::ignore = rocprim::radix_sort_pairs(bufs.tempStorage().data(), temp_bytes, keys.data(),
+                                      keys_out.data(), permute.data(), perm_out.data(), n, 0,
+                                      sizeof(uint64_t) * 8, Kokkos::HIP().hip_stream());
+            Kokkos::fence();
+
+            return bufs.permOut();
+        }
+#endif
+
+        /**
+         * @brief Generic sort dispatcher — selects CUDA/HIP/host implementation
+         */
+        template <unsigned Dim, typename ExecSpace, typename T>
+        Kokkos::View<size_t*, typename ExecSpace::memory_space> sortParticles(
+            Kokkos::View<Vector<T, Dim>*, typename ExecSpace::memory_space> positions,
+            const Vector<T, Dim>& origin, const Vector<T, Dim>& invdx,
+            const Vector<size_t, Dim>& ngrid, size_t n) {
+            using memory_space = typename ExecSpace::memory_space;
+
+#ifdef KOKKOS_ENABLE_CUDA
+            if constexpr (std::is_same_v<ExecSpace, Kokkos::Cuda>) {
+                return sortParticlesCuda<Dim, T>(positions, origin, invdx, ngrid, n);
+            }
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+            if constexpr (std::is_same_v<ExecSpace, Kokkos::HIP>) {
+                return sortParticlesHip<Dim, T>(positions, origin, invdx, ngrid, n);
+            }
+#endif
+            if constexpr (std::is_same_v<memory_space, Kokkos::HostSpace>) {
+                return sortParticlesHost<Dim, T>(positions, origin, invdx, ngrid, n);
+            } else {
+                // Generic fallback: sort on host, copy result to device buffer
+                auto positions_host =
+                    Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, positions);
+
+                Vector<size_t, Dim> ngrid_host;
+                for (unsigned d = 0; d < Dim; ++d)
+                    ngrid_host[d] = ngrid[d];
+
+                auto permute_host =
+                    sortParticlesHost<Dim, T>(positions_host, origin, invdx, ngrid_host, n);
+
+                auto& bufs = ippl::detail::getDefaultBinSortBuffers<memory_space>();
+                bufs.ensureCapacity(n, /*n_bins_p1=*/1);
+
+                Kokkos::deep_copy(Kokkos::subview(bufs.permute(), std::make_pair(size_t(0), n)),
+                                  Kokkos::subview(permute_host, std::make_pair(size_t(0), n)));
+
+                return bufs.permute();
+            }
+        }
+
+        // -------------------------------------------------------------------
+        // Permutation application helpers
+        // -------------------------------------------------------------------
+
+        template <typename SrcView, typename DstView, typename IndexView>
+        struct ApplyPermutationFunctor {
+            SrcView src;
+            DstView dst;
+            IndexView permute;
+            KOKKOS_INLINE_FUNCTION void operator()(size_t i) const { dst(i) = src(permute(i)); }
+        };
+
+        template <typename SrcView, typename DstView, typename IndexView>
+        struct ApplyInversePermutationFunctor {
+            SrcView src;
+            DstView dst;
+            IndexView permute;
+            KOKKOS_INLINE_FUNCTION void operator()(size_t i) const { dst(permute(i)) = src(i); }
+        };
+
+        /**
+         * @brief Reorder data in-place according to permutation
+         */
+        template <typename ExecSpace, typename DataView, typename IndexView>
+        void applyPermutation(DataView& data, const IndexView& permute, size_t n) {
+            DataView temp("temp", n);
+            Kokkos::parallel_for(
+                "apply_permutation", Kokkos::RangePolicy<ExecSpace>(0, n),
+                ApplyPermutationFunctor<DataView, DataView, IndexView>{data, temp, permute});
+            Kokkos::deep_copy(Kokkos::subview(data, Kokkos::make_pair(size_t(0), n)), temp);
+        }
+
+        /**
+         * @brief Scatter src → dst using inverse permutation (dst[permute[i]] = src[i])
+         */
+        template <typename ExecSpace, typename DataView, typename IndexView>
+        void applyInversePermutation(const DataView& src, DataView& dst, const IndexView& permute,
+                                     size_t n) {
+            Kokkos::parallel_for(
+                "apply_inverse_permutation", Kokkos::RangePolicy<ExecSpace>(0, n),
+                ApplyInversePermutationFunctor<DataView, DataView, IndexView>{src, dst, permute});
+            Kokkos::fence();
+        }
+
+    }  // namespace detail
+}  // namespace ippl
+
+#endif  // IPPL_PARTICLE_SORT_H
\ No newline at end of file
diff --git a/src/Particle/ParticleSpatialLayout.h b/src/Particle/ParticleSpatialLayout.h
index 568b30753..ed596e180 100644
--- a/src/Particle/ParticleSpatialLayout.h
+++ b/src/Particle/ParticleSpatialLayout.h
@@ -35,6 +35,19 @@
 
 namespace ippl {
 
+    // Controls how the send-count exchange is performed before the particle
+    // data transfer. All paths assume GPU-aware MPI (device pointers passed
+    // directly to Isend/Irecv/Alltoall).
+    enum class CountExchange {
+        // One-sided RMA: each sender writes its count directly into the
+        // receiver's window.
+        RMA,
+        // Two-sided point-to-point: per-rank Isend/Irecv over device pointers.
+        P2P,
+        // Collective Alltoall over device pointers.
+        Alltoall
+    };
+
     /*!
      * ParticleSpatialLayout class definition.
      * @tparam T value type
@@ -59,9 +72,9 @@ namespace ippl {
 
         using size_type = detail::size_type;
 
-    public:
         // constructor: this one also takes a Mesh
-        ParticleSpatialLayout(FieldLayout<Dim>&, Mesh&, bool fem = false);
+        ParticleSpatialLayout(FieldLayout<Dim>&, Mesh&, bool fem = false,
+                              CountExchange mode = CountExchange::RMA);
 
         ParticleSpatialLayout()
             : detail::ParticleLayout<T, Dim, PositionProperties...>() {}
@@ -73,21 +86,33 @@ namespace ippl {
         template <class ParticleContainer>
         void update(ParticleContainer& pc);
 
-        const RegionLayout_t& getRegionLayout() const { return rlayout_m; }
+        const RegionLayout_t& getRegionLayout() const { return *rlayout_m; }
 
     protected:
         //! The RegionLayout which determines where our particles go.
-        RegionLayout_t rlayout_m;
+        std::shared_ptr<RegionLayout_t> rlayout_m;
 
         //! The FieldLayout containing information on nearest neighbors
         FieldLayout_t& flayout_m;
 
+        //! How counts are exchanged
+        CountExchange countExchangeMode_;
+
+        //
+        // RMA Path
+        //
+
         // Vector keeping track of the recieves from all ranks
         std::vector<size_type> nRecvs_m;
 
         // MPI RMA window for one-sided communication
         mpi::rma::Window<mpi::rma::Active> window_m;
 
+        //
+        // P2P GPU Path
+        //
+        locate_type recvCounts_d_;  // [nranks]
+
         //! Type of the Kokkos view containing the local regions.
         using region_view_type = typename RegionLayout_t::view_type;
         //! Type of a single Region object.
@@ -99,6 +124,10 @@ namespace ippl {
         KOKKOS_INLINE_FUNCTION constexpr static bool positionInRegion(
             const std::index_sequence<Idx...>&, const vector_type& pos, const region_type& region);
 
+        template <size_t... Idx>
+        KOKKOS_INLINE_FUNCTION constexpr static bool positionInRegionInclusive(
+            const std::index_sequence<Idx...>&, const vector_type& pos, const region_type& region);
+
         /*!
          * Evaluates the total number of MPI ranks sharing the spatial nearest neighbors.
          * @param neighbors structure containing, for every spatial direction, a list of
@@ -109,35 +138,59 @@ namespace ippl {
 
     public:
         /*!
-         * For each particle in the bunch, determine the rank on which it should
-         * be stored based on its location
-         * @tparam ParticleContainer the particle container type
-         * @param pc the particle container
-         * @param ranks the integer view in which to store the destination ranks
-         * @param invalid the boolean view in which to store whether each particle
-         * is invalidated, i.e. needs to be sent to another rank
-         * @param nSends_dview the view storing per-rank send counts
-         * @param sends_dview the view storing per-rank send offsets
-         * @return The total number of invalidated particles
+         * For each local particle, determine its destination rank, write the
+         * indices of leaving particles into `sendIds_d_`, populate the
+         * per-rank send-count and offset buffers, build a compacted list of
+         * destination ranks, and mark each particle's "is leaving" status in
+         * `leaving_d_` so the destroy pass doesn't have to recompute it.
+         * @return number of leaving particles on this rank
          */
         template <typename ParticleContainer>
-        std::pair<size_type, size_type> locateParticles(const ParticleContainer& pc,
-                                                        locate_type& ranks, bool_type& invalid,
-                                                        locate_type& nSends_dview,
-                                                        locate_type& sends_dview) const;
-
-        /*!
-         * @param rank we sent to
-         * @param ranks a container specifying where a particle at the i-th index should go.
-         * @param hash a mapping to fill the send buffer contiguously
-         */
-        void fillHash(int rank, const locate_type& ranks, hash_type& hash);
-
-        /*!
-         * @param rank we sent to
-         * @param ranks a container specifying where a particle at the i-th index should go.
-         */
-        size_t numberOfSends(int rank, const locate_type& ranks);
+        size_t locateParticlesPacked(const ParticleContainer& pc);
+
+    private:
+        // Fixed-size scratch
+        locate_type rankSendCount_d_;  // [nRanks]
+        locate_type sendOffsets_d_;    // [nRanks+1]
+        hash_type sendIds_d_;          // [capacity >= max nInvalid seen]
+        locate_type cursor_d_;         // [nRanks] per-rank insertion cursor
+        locate_type destRanks_d_;      // [nRanks] (compacted list)
+        bool_type leaving_d_;          // [capacity >= max nLocal seen] mask
+
+        // Single scalar on device to count destinations
+        Kokkos::View<size_type, position_memory_space> nDest_d_;
+
+        // Neigbour cache
+        locate_type neighbors_d_;          // [neighborSize] cached device neighbors list
+        std::vector<int> neighbors_host_;  // flat host copy
+        bool neighbors_dirty_      = true;
+        size_t neighbors_capacity_ = 0;
+        size_type neighbors_used_  = 0;
+
+        // Host mirror buffers
+        using host_mem_space   = Kokkos::HostSpace;
+        using locate_host_type = typename detail::ViewType<int, 1, host_mem_space>::view_type;
+
+        locate_host_type rankSendCount_h_;  // [nRanks] (mirror)
+        locate_host_type sendOffsets_h_;    // [nRanks+1] (mirror)
+        locate_host_type destRanks_h_;      // [nRanks] (mirror)
+
+        // Host-side destination list
+        std::vector<int> destinationRanks_host_;
+
+        // capacities
+        size_t sendIds_capacity_ = 0;
+        size_t leaving_capacity_ = 0;
+        int nRanks_              = 0;
+
+        void initScratch(int nRanks);
+        void ensureSendCapacity(size_t nInvalid);
+        void ensureLeavingCapacity(size_t nLocal);
+        void ensureNeighborsCached();
+
+        void countExchangeRMA();
+        void countExchangeP2P();
+        void countExchangeAlltoall();
     };
 }  // namespace ippl
 
diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp
index b47529aae..c2bc0093c 100644
--- a/src/Particle/ParticleSpatialLayout.hpp
+++ b/src/Particle/ParticleSpatialLayout.hpp
@@ -24,55 +24,89 @@
 #include <vector>
 
 #include "Utility/IpplTimings.h"
+#include "Utility/ParallelDispatch.h"
 
 #include "Communicate/Window.h"
 
 namespace ippl {
 
-    /*!
-     * We need this struct since Kokkos parallel_scan only accepts
-     * one variable of type ReturnType where to perform the reduction operation.
-     * For more details, see
-     * https://kokkos.github.io/kokkos-core-wiki/API/core/parallel-dispatch/parallel_scan.html.
-     */
-    struct increment_type {
-        size_t count[2];
+    template <typename T, unsigned Dim, class Mesh, typename... Properties>
+    ParticleSpatialLayout<T, Dim, Mesh, Properties...>::ParticleSpatialLayout(FieldLayout<Dim>& fl,
+                                                                              Mesh& mesh, bool fem,
+                                                                              CountExchange mode)
+        : rlayout_m(std::make_shared<RegionLayout_t>(fl, mesh, fem))
+        , flayout_m(fl)
+        , countExchangeMode_(mode) {
+        const int nRanks = Comm->size();
+        nRanks_          = nRanks;
+        initScratch(nRanks);
 
-        KOKKOS_FUNCTION void init() {
-            count[0] = 0;
-            count[1] = 0;
+        if (mode == CountExchange::RMA && nRanks > 1) {
+            nRecvs_m.resize(Comm->size());
+            window_m.create(*Comm, nRecvs_m.begin(), nRecvs_m.end());
         }
+    }
 
-        KOKKOS_INLINE_FUNCTION increment_type& operator+=(bool* values) {
-            count[0] += values[0];
-            count[1] += values[1];
-            return *this;
-        }
+    template <typename T, unsigned Dim, class Mesh, typename... Properties>
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::updateLayout(FieldLayout<Dim>& fl,
+                                                                          Mesh& mesh) {
+        rlayout_m->changeDomain(fl, mesh);
+        neighbors_dirty_ = true;
+    }
+
+    template <typename T, unsigned Dim, class Mesh, typename... Properties>
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::countExchangeRMA() {
+        std::fill(nRecvs_m.begin(), nRecvs_m.end(), 0);
+        window_m.fence(0);
 
-        KOKKOS_INLINE_FUNCTION increment_type& operator+=(increment_type values) {
-            count[0] += values.count[0];
-            count[1] += values.count[1];
-            return *this;
+        for (int rank : destinationRanks_host_) {
+            if (rank == Comm->rank())
+                continue;
+            const int* src = &rankSendCount_h_(rank);
+            window_m.put<int>(src, rank, Comm->rank());
         }
-    };
+
+        window_m.fence(0);
+    }
 
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
-    ParticleSpatialLayout<T, Dim, Mesh, Properties...>::ParticleSpatialLayout(FieldLayout<Dim>& fl,
-                                                                              Mesh& mesh, bool fem)
-        : rlayout_m(fl, mesh, fem)
-        , flayout_m(fl)
-    {   
-        nRecvs_m.resize(Comm->size());
-        if (Comm->size() > 1) {
-            window_m.create(*Comm, nRecvs_m.begin(), nRecvs_m.end());
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::countExchangeP2P() {
+        const int myRank = Comm->rank();
+        const int tag    = Comm->next_tag(mpi::tag::P_SPATIAL_LAYOUT, mpi::tag::P_LAYOUT_CYCLE);
+
+        // Zero the receive-count buffer on the device
+        Kokkos::deep_copy(position_execution_space{}, recvCounts_d_, 0);
+        Kokkos::fence();
+
+        // Device pointer to send-count array
+        int* d_sendCounts = rankSendCount_d_.data();
+        int* d_recvCounts = recvCounts_d_.data();
+
+        std::vector<MPI_Request> reqs;
+        reqs.reserve(2 * std::max(0, nRanks_ - 1));
+
+        // Post receives from all ranks (except self)
+        for (int r = 0; r < nRanks_; ++r) {
+            if (r == myRank)
+                continue;
+            MPI_Irecv(d_recvCounts + r, 1, MPI_INT, r, tag, Comm->getCommunicator(),
+                      &reqs.emplace_back());
         }
+
+        for (int r = 0; r < nRanks_; ++r) {
+            if (r == myRank)
+                continue;
+            MPI_Isend(d_sendCounts + r, 1, MPI_INT, r, tag, Comm->getCommunicator(),
+                      &reqs.emplace_back());
+        }
+
+        MPI_Waitall(static_cast<int>(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE);
     }
 
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
-    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::updateLayout(FieldLayout<Dim>& fl,
-                                                                          Mesh& mesh) {
-        // flayout_m = fl;
-        rlayout_m.changeDomain(fl, mesh);
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::countExchangeAlltoall() {
+        MPI_Alltoall(rankSendCount_d_.data(), 1, MPI_INT, recvCounts_d_.data(), 1, MPI_INT,
+                     Comm->getCommunicator());
     }
 
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
@@ -81,18 +115,20 @@ namespace ippl {
         /* Apply Boundary Conditions */
         static IpplTimings::TimerRef ParticleBCTimer = IpplTimings::getTimer("particleBC");
         IpplTimings::startTimer(ParticleBCTimer);
-        this->applyBC(pc.R, rlayout_m.getDomain());
+        this->applyBC(pc.R, rlayout_m->getDomain());
         IpplTimings::stopTimer(ParticleBCTimer);
 
         /* Update Timer for the rest of the function */
         static IpplTimings::TimerRef ParticleUpdateTimer = IpplTimings::getTimer("updateParticle");
         IpplTimings::startTimer(ParticleUpdateTimer);
 
-        int nRanks = Comm->size();
-        if (nRanks < 2) {
+        if (nRanks_ < 2) {
+            IpplTimings::stopTimer(ParticleUpdateTimer);
             return;
         }
 
+        ensureNeighborsCached();
+
         /* particle MPI exchange:
          *   1. figure out which particles need to go where -> locateParticles(...)
          *   2. fill send buffer and send particles
@@ -105,69 +141,45 @@ namespace ippl {
         static IpplTimings::TimerRef locateTimer = IpplTimings::getTimer("locateParticles");
         IpplTimings::startTimer(locateTimer);
 
-        /* Rank-local number of particles */
-        size_type localnum = pc.getLocalNum();
-
-        /* The indices correspond to the indices of the local particles,
-         * the values correspond to the ranks to which the particles need to be sent
-         */
-        locate_type particleRanks("particles' MPI ranks", localnum);
-
-        /* The indices are the indices of the particles,
-         * the boolean values describe whether the particle has left the current rank
-         * 0 --> particle valid (inside current rank)
-         * 1 --> particle invalid (left rank)
-         */
-        bool_type invalidParticles("validity of particles", localnum);
-
-        /* The indices are the MPI ranks,
-         * the values are the number of particles are sent to that rank from myrank
-         */
-        locate_type rankSendCount_dview("rankSendCount Device", nRanks);
+        const size_type nInvalid = locateParticlesPacked(pc);
 
-        /* The indices have no particluar meaning,
-         * the values are the MPI ranks to which we need to send
-         */
-        locate_type destinationRanks_dview("destinationRanks Device", nRanks);
+        // Copy metadata to host
+        size_type nDest = 0;
+        Kokkos::deep_copy(position_execution_space{}, nDest, nDest_d_);
 
-        /* nInvalid is the number of invalid particles
-         * nDestinationRanks is the number of MPI ranks we need to send to
-         */
-        auto [nInvalid, nDestinationRanks] = locateParticles(
-            pc, particleRanks, invalidParticles, rankSendCount_dview, destinationRanks_dview);
+        // destRanks prefix
+        if (nDest > 0) {
+            Kokkos::deep_copy(
+                position_execution_space{},
+                Kokkos::subview(destRanks_h_, std::make_pair(size_t(0), size_t(nDest))),
+                Kokkos::subview(destRanks_d_, std::make_pair(size_t(0), size_t(nDest))));
+        }
 
-        /* Host space copy of rankSendCount_dview */
-        auto rankSendCount_hview =
-            Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rankSendCount_dview);
+        // counts + offsets
+        Kokkos::deep_copy(position_execution_space{}, rankSendCount_h_, rankSendCount_d_);
+        Kokkos::deep_copy(position_execution_space{}, sendOffsets_h_, sendOffsets_d_);
 
-        /* Host Space copy of destinationRanks_dview */
-        auto destinationRanks_hview =
-            Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), destinationRanks_dview);
+        // Build host destination list without allocation
+        destinationRanks_host_.clear();
+        for (size_t i = 0; i < (size_t)nDest; ++i)
+            destinationRanks_host_.push_back(destRanks_h_(i));
 
         IpplTimings::stopTimer(locateTimer);
 
         // 2. fill send buffer and send particles =============================================== //
 
-        // 2.1 Remote Memory Access window for one-sided communication
+        // 2.1 Count Exchange
 
         static IpplTimings::TimerRef preprocTimer = IpplTimings::getTimer("sendPreprocess");
         IpplTimings::startTimer(preprocTimer);
 
-        std::fill(nRecvs_m.begin(), nRecvs_m.end(), 0);
-
-        window_m.fence(0);
-
-        // Prepare RMA window for the ranks we need to send to
-        for (size_t ridx = 0; ridx < nDestinationRanks; ridx++) {
-            int rank = destinationRanks_hview[ridx];
-            if (rank == Comm->rank()) {
-                // we do not need to send to ourselves
-                continue;
-            }
-            const int* src_ptr = &rankSendCount_hview(rank);
-            window_m.put<int>(src_ptr, rank, Comm->rank());
+        if (countExchangeMode_ == CountExchange::RMA) {
+            countExchangeRMA();
+        } else if (countExchangeMode_ == CountExchange::P2P) {
+            countExchangeP2P();
+        } else {
+            countExchangeAlltoall();
         }
-        window_m.fence(0);
 
         IpplTimings::stopTimer(preprocTimer);
 
@@ -176,55 +188,96 @@ namespace ippl {
         static IpplTimings::TimerRef sendTimer = IpplTimings::getTimer("particleSend");
         IpplTimings::startTimer(sendTimer);
 
-        std::vector<MPI_Request> requests(0);
+        std::vector<MPI_Request> requests;
+        requests.reserve(destinationRanks_host_.size());
 
         int tag = Comm->next_tag(mpi::tag::P_SPATIAL_LAYOUT, mpi::tag::P_LAYOUT_CYCLE);
 
-        for (size_t ridx = 0; ridx < nDestinationRanks; ridx++) {
-            int rank = destinationRanks_hview[ridx];
-            if (rank == Comm->rank()) {
+        for (int rank : destinationRanks_host_) {
+            if (rank == Comm->rank())
                 continue;
-            }
-            hash_type hash("hash", rankSendCount_hview(rank));
-            fillHash(rank, particleRanks, hash);
-            pc.sendToRank(rank, tag, requests, hash);
+            const size_type count = static_cast<size_type>(rankSendCount_h_(rank));
+            if (count == 0)
+                continue;
+            const size_type begin = static_cast<size_type>(sendOffsets_h_(rank));
+            auto ids_sub =
+                Kokkos::subview(sendIds_d_, std::make_pair((size_t)begin, (size_t)(begin + count)));
+            requests.push_back(pc.sendToRank(rank, tag, ids_sub));
         }
 
         IpplTimings::stopTimer(sendTimer);
 
-        // 3. Internal destruction of invalid particles ======================================= //
-
-        static IpplTimings::TimerRef destroyTimer = IpplTimings::getTimer("particleDestroy");
-        IpplTimings::startTimer(destroyTimer);
-
-        pc.internalDestroy(invalidParticles, nInvalid);
-        Kokkos::fence();
-
-        IpplTimings::stopTimer(destroyTimer);
-
-        // 4. Receive Particles ================================================================ //
+        // 2.3 Post receives
 
         static IpplTimings::TimerRef recvTimer = IpplTimings::getTimer("particleRecv");
         IpplTimings::startTimer(recvTimer);
 
-        for (int rank = 0; rank < nRanks; ++rank) {
-            if (nRecvs_m[rank] > 0) {
-                pc.recvFromRank(rank, tag, nRecvs_m[rank]);
+        std::vector<std::pair<int, size_type>> recvList;
+
+        if (countExchangeMode_ == CountExchange::RMA) {
+            for (int rank = 0; rank < nRanks_; ++rank) {
+                if (nRecvs_m[rank] > 0) {
+                    recvList.push_back({rank, nRecvs_m[rank]});
+                }
             }
+        } else {
+            auto recvCounts_h =
+                Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), recvCounts_d_);
+            for (int rank = 0; rank < nRanks_; ++rank) {
+                if (recvCounts_h(rank) > 0) {
+                    recvList.push_back({rank, recvCounts_h(rank)});
+                }
+            }
+        }
+
+        std::vector<MPI_Request> recvRequests(recvList.size(), MPI_REQUEST_NULL);
+        std::vector<std::function<void(size_type)>> finalizers(recvList.size());
+
+        for (size_t i = 0; i < recvList.size(); ++i) {
+            auto [rank, count] = recvList[i];
+            auto [req, fin]    = pc.postRecvFromRank(rank, tag, count);
+            recvRequests[i]    = req;
+            finalizers[i]      = std::move(fin);
         }
+
         IpplTimings::stopTimer(recvTimer);
 
-        IpplTimings::startTimer(sendTimer);
+        // 3. Internal destruction of invalid particles ======================================= //
 
-        if (requests.size() > 0) {
-            MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);
+        static IpplTimings::TimerRef destroyTimer = IpplTimings::getTimer("particleDestroy");
+        IpplTimings::startTimer(destroyTimer);
+
+        // locateParticlesPacked already wrote the per-particle "is leaving"
+        // mask into leaving_d_. Reuse it instead of re-running the full
+        // region search inside the destroy predicate.
+        auto leaving = leaving_d_;
+        pc.template internalDestroy<position_memory_space, position_execution_space>(
+            KOKKOS_LAMBDA(size_t i) { return leaving(i); }, nInvalid);
+        Kokkos::fence();
+
+        IpplTimings::stopTimer(destroyTimer);
+
+        requests.insert(requests.end(), recvRequests.begin(), recvRequests.end());
+        if (!requests.empty()) {
+            MPI_Waitall(static_cast<int>(requests.size()), requests.data(), MPI_STATUSES_IGNORE);
         }
-        IpplTimings::stopTimer(sendTimer);
         Comm->freeAllBuffers();
 
+        // 5. Deserialize
+        for (auto& finalize : finalizers)
+            finalize(pc.getLocalNum());
+
         IpplTimings::stopTimer(ParticleUpdateTimer);
     }
 
+    template <typename T, unsigned Dim, class Mesh, typename... Properties>
+    template <size_t... Idx>
+    KOKKOS_INLINE_FUNCTION constexpr bool
+    ParticleSpatialLayout<T, Dim, Mesh, Properties...>::positionInRegionInclusive(
+        const std::index_sequence<Idx...>&, const vector_type& pos, const region_type& region) {
+        return ((pos[Idx] >= region[Idx].min()) && ...) && ((pos[Idx] <= region[Idx].max()) && ...);
+    };
+
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
     template <size_t... Idx>
     KOKKOS_INLINE_FUNCTION constexpr bool
@@ -248,220 +301,204 @@ namespace ippl {
         return totalSize;
     }
 
-    /**
-     * @brief This function determines to which rank particles need to be sent after the iteration
-     * step. It starts by first scanning direct rank neighbors, and only does a global scan if there
-     * are still unfound particles. It then calculates how many particles need to be sent to each
-     * rank and how many ranks are sent to in total.
-     *
-     * @param pc           Particle Container
-     * @param ranks        A vector the length of the number of particles on the current rank, where
-     * each value refers to the new rank of the particle
-     * @param invalid      A vector marking the particles that need to be sent away, and thus
-     * locally deleted
-     * @param nSends_dview Device view the length of number of ranks, where each value determines
-     * the number of particles sent to that rank from the current rank
-     * @param sends_dview  Device view for the number of ranks that are sent to from current rank
-     *
-     * @return tuple with the number of particles sent away and the number of ranks sent to
-     */
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
     template <typename ParticleContainer>
-    std::pair<detail::size_type, detail::size_type>
-    ParticleSpatialLayout<T, Dim, Mesh, Properties...>::locateParticles(
-        const ParticleContainer& pc, locate_type& ranks, bool_type& invalid,
-        locate_type& nSends_dview, locate_type& sends_dview) const {
-        auto positions           = pc.R.getView();
-        region_view_type Regions = rlayout_m.getdLocalRegions();
-
-        using policy_type  = Kokkos::RangePolicy<position_execution_space>;
-        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<2>, position_execution_space>;
+    size_t ParticleSpatialLayout<T, Dim, Mesh, Properties...>::locateParticlesPacked(
+        const ParticleContainer& pc) {
+        const int nRanks       = Comm->size();
+        const size_type myRank = Comm->rank();
 
-        size_type myRank = Comm->rank();
+        ensureNeighborsCached();
 
-        const auto is = std::make_index_sequence<Dim>{};
-
-        const neighbor_list& neighbors = flayout_m.getNeighbors();
-
-        /// outsideIds: Container of particle IDs that travelled outside of the neighborhood.
-        locate_type outsideIds("Particles outside of neighborhood", size_type(pc.getLocalNum()));
+        auto positions           = pc.R.getView();
+        region_view_type Regions = rlayout_m->getdLocalRegions();
+        const auto is            = std::make_index_sequence<Dim>{};
 
-        /// outsideCount: Tracks the number of particles that travelled outside of the neighborhood.
-        size_type outsideCount = 0;
-        /// invalidCount: Tracks the number of particles that need to be sent to other ranks.
-        size_type invalidCount = 0;
+        using exec_space  = position_execution_space;
+        using policy_type = Kokkos::RangePolicy<size_t, exec_space>;
 
-        /// neighborSize: Size of a neighborhood in D dimentions.
-        const size_type neighborSize = getNeighborSize(neighbors);
+        // Reset small device buffers
+        Kokkos::deep_copy(rankSendCount_d_, size_type(0));
+        Kokkos::deep_copy(cursor_d_, size_type(0));
+        Kokkos::deep_copy(nDest_d_, size_type(0));
 
-        /// neighbors_view: Kokkos view with the IDs of the neighboring MPI ranks.
-        locate_type neighbors_view("Nearest neighbors IDs", neighborSize);
+        const size_type neighbors_used = neighbors_used_;
+        auto& neighbours_d             = neighbors_d_;
 
-        /* red_val: Used to reduce both the number of invalid particles and the number of particles
-         * outside of the neighborhood (Kokkos::parallel_scan doesn't allow multiple reduction
-         * values, so we use the helper class increment_type). First element updates InvalidCount,
-         * second one updates outsideCount.
-         */
-        increment_type red_val;
-        red_val.init();
+        // Destination rank computation (no per-particle storage)
+        auto destRankOf = KOKKOS_LAMBDA(const size_t i)->size_type {
+            if (positionInRegion(is, positions(i), Regions(myRank)))
+                return myRank;
 
-        auto neighbors_mirror =
-            Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), neighbors_view);
-
-        size_t k = 0;
+            for (int j = 0; j < static_cast<int>(neighbors_used); ++j) {
+                const int r = neighbours_d(j);
+                if (positionInRegion(is, positions(i), Regions(r)))
+                    return r;
+            }
 
-        for (const auto& componentNeighbors : neighbors) {
-            for (size_t j = 0; j < componentNeighbors.size(); ++j) {
-                neighbors_mirror(k) = componentNeighbors[j];
-                // std::cout << "Neighbor: " << neighbors_mirror(k) << std::endl;
-                k++;
+            // slow-path: global scan
+            for (int r = 0; r < static_cast<int>(Regions.extent(0)); ++r) {
+                if (positionInRegion(is, positions(i), Regions(r)))
+                    return r;
             }
-        }
 
-        Kokkos::deep_copy(neighbors_view, neighbors_mirror);
+            // Inclusive fallback: catches particles sitting exactly on a region
+            // lower boundary that the strict > check above missed (e.g. (0,0,0))
+            for (int r = 0; r < static_cast<int>(Regions.extent(0)); ++r) {
+                if (positionInRegionInclusive(is, positions(i), Regions(r)))
+                    return r;
+            }
+            return myRank;  // truly outside all regions - applyBC should have prevented this
+        };
 
-        /*! Begin Kokkos loop:
-         * Step 1: search in current rank
-         * Step 2: search in neighbors
-         * Step 3: save information on whether the particle was located
-         * Step 4: run additional loop on non-located particles
-         */
-        static IpplTimings::TimerRef neighborSearch = IpplTimings::getTimer("neighborSearch");
-        IpplTimings::startTimer(neighborSearch);
+        // Make sure the leaving-mask buffer is large enough; it's reused across
+        // updates to avoid reallocation.
+        ensureLeavingCapacity(pc.getLocalNum());
+        auto& leaving_d = leaving_d_;
 
-        Kokkos::parallel_scan(
-            "ParticleSpatialLayout::locateParticles()",
-            policy_type(0, ranks.extent(0)),
-            KOKKOS_LAMBDA(const size_type i, increment_type& val, const bool final) {
-                /* Step 1
-                 * inCurr: True if the particle hasn't left the current MPI rank.
-                 * inNeighbor: True if the particle is found in a neighboring rank.
-                 * found: True either if inCurr = True or inNeighbor = True.
-                 * increment: Helper variable to update red_val.
-                 */
-                bool inCurr     = false;
-                bool inNeighbor = false;
-                bool found      = false;
-                bool increment[2];
-
-                inCurr = positionInRegion(is, positions(i), Regions(myRank));
-
-                ranks(i)   = inCurr * myRank;
-                invalid(i) = !inCurr;
-                found      = inCurr || found;
-
-                /// Step 2
-                for (size_t j = 0; j < neighbors_view.extent(0); ++j) {
-                    size_type rank = neighbors_view(j);
-
-                    inNeighbor = positionInRegion(is, positions(i), Regions(rank));
-
-                    ranks(i) = !(inNeighbor)*ranks(i) + inNeighbor * rank;
-                    found    = inNeighbor || found;
-                }
-                /// Step 3
-                /* isOut: When the last thread has finished the search, checks whether the particle
-                 * has been found either in the current rank or in a neighboring one. Used to avoid
-                 * race conditions when updating outsideIds.
-                 */
-                if (final && !found) {
-                    outsideIds(val.count[1]) = i;
-                }
-                // outsideIds(val.count[1]) = i * isOut;
-                increment[0] = invalid(i);
-                increment[1] = !found;
-                val += increment;
+        // Pass 1: compute send counts + nInvalid + per-particle "is leaving"
+        size_type nInvalid    = 0;
+        auto& rankSendCount_d = rankSendCount_d_;
+        Kokkos::parallel_reduce(
+            "PSL::packed_count", policy_type(0, pc.getLocalNum()),
+            KOKKOS_LAMBDA(const size_t i, size_type& inval) {
+                const size_type dest = destRankOf(i);
+                const bool leaves    = (dest != myRank);
+                leaving_d(i)         = leaves;
+                inval += leaves;
+                if (leaves)
+                    Kokkos::atomic_fetch_add(&rankSendCount_d(dest), size_type(1));
             },
-            red_val);
-
+            nInvalid);
         Kokkos::fence();
 
-        invalidCount = red_val.count[0];
-        outsideCount = red_val.count[1];
-
-        IpplTimings::stopTimer(neighborSearch);
-
-        /// Step 4
-        static IpplTimings::TimerRef nonNeighboringParticles =
-            IpplTimings::getTimer("nonNeighboringParticles");
-        IpplTimings::startTimer(nonNeighboringParticles);
-        if (outsideCount > 0) {
-            Kokkos::parallel_for(
-                "ParticleSpatialLayout::leftParticles()",
-                mdrange_type({0, 0}, {outsideCount, Regions.extent(0)}),
-                KOKKOS_LAMBDA(const size_t i, const size_type j) {
-                    /// pID: (local) ID of the particle that is currently being searched.
-                    size_type pId = outsideIds(i);
-
-                    /// inRegion: Checks whether particle pID is inside region j.
-                    bool inRegion = positionInRegion(is, positions(pId), Regions(j));
-                    if (inRegion) {
-                        ranks(pId) = j;
-                    }
-                });
-            Kokkos::fence();
-        }
-        IpplTimings::stopTimer(nonNeighboringParticles);
+        // Ensure sendIds capacity after we know nInvalid
+        ensureSendCapacity(nInvalid);
 
-        Kokkos::parallel_for(
-            "Calculate nSends", policy_type(0, ranks.extent(0)),
-            KOKKOS_LAMBDA(const size_t i) {
-                size_type rank = ranks(i);
-                Kokkos::atomic_fetch_add(&nSends_dview(rank), 1);
+        // Exclusive scan to offsets (length nRanks+1)
+        auto& sendOffsets_d = sendOffsets_d_;
+        Kokkos::parallel_scan(
+            "PSL::packed_offsets", policy_type(0, (size_t)nRanks + 1),
+            KOKKOS_LAMBDA(const size_t r, size_type& upd, const bool final) {
+                if (final)
+                    sendOffsets_d(r) = upd;
+                if (r < (size_t)nRanks)
+                    upd += rankSendCount_d(r);
             });
+        Kokkos::fence();
 
-        // Number of Ranks we need to send to
-        Kokkos::View<size_type, position_memory_space> rankSends(
-            "Number of Ranks we need to send to");
+        // Pass 2: fill packed send IDs into sendIds_d_ (prefix [0, nInvalid))
+        auto& cursor_d  = cursor_d_;
+        auto& sendIds_d = sendIds_d_;
+        Kokkos::parallel_for(
+            "PSL::packed_fill", policy_type(0, pc.getLocalNum()), KOKKOS_LAMBDA(const size_t i) {
+                const size_type dest = destRankOf(i);
+                if (dest == myRank)
+                    return;
+
+                const size_type pos  = Kokkos::atomic_fetch_add(&cursor_d(dest), size_type(1));
+                const size_type base = sendOffsets_d(dest);
+
+                sendIds_d(base + pos) = static_cast<typename hash_type::non_const_value_type>(i);
+            });
+        Kokkos::fence();
 
+        // Build destination rank list on device (compact), store length in nDest_d_
+        auto& destRanks_d = destRanks_d_;
+        auto& nDest_d     = nDest_d_;
         Kokkos::parallel_for(
-            "Calculate sends", policy_type(0, nSends_dview.extent(0)),
-            KOKKOS_LAMBDA(const size_t rank) {
-                if (nSends_dview(rank) != 0) {
-                    size_type index    = Kokkos::atomic_fetch_add(&rankSends(), 1);
-                    sends_dview(index) = rank;
+            "PSL::packed_destRanks", policy_type(0, (size_t)nRanks), KOKKOS_LAMBDA(const size_t r) {
+                if ((size_type)r == myRank)
+                    return;
+                if (rankSendCount_d(r) > 0) {
+                    const size_type idx = Kokkos::atomic_fetch_add(&nDest_d(), size_type(1));
+                    destRanks_d(idx)    = static_cast<int>(r);
                 }
             });
-        size_type temp;
-        Kokkos::deep_copy(temp, rankSends);
+        Kokkos::fence();
 
-        return {invalidCount, temp};
+        return nInvalid;
     }
 
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
-    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::fillHash(int rank,
-                                                                      const locate_type& ranks,
-                                                                      hash_type& hash) {
-        /* Compute the prefix sum and fill the hash
-         */
-        using policy_type = Kokkos::RangePolicy<position_execution_space>;
-        Kokkos::parallel_scan(
-            "ParticleSpatialLayout::fillHash()", policy_type(0, ranks.extent(0)),
-            KOKKOS_LAMBDA(const size_t i, int& idx, const bool final) {
-                if (final) {
-                    if (rank == ranks(i)) {
-                        hash(idx) = i;
-                    }
-                }
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::initScratch(int nRanks) {
+        Kokkos::realloc(rankSendCount_d_, nRanks);
+        Kokkos::realloc(sendOffsets_d_, nRanks + 1);
+        Kokkos::realloc(cursor_d_, nRanks);
+        Kokkos::realloc(destRanks_d_, nRanks);
+        Kokkos::realloc(recvCounts_d_, nRanks);
+
+        // scalar counter
+        nDest_d_ = Kokkos::View<size_type, position_memory_space>("nDest_d");
+
+        // Host mirrors
+        Kokkos::realloc(rankSendCount_h_, nRanks);
+        Kokkos::realloc(sendOffsets_h_, nRanks + 1);
+        Kokkos::realloc(destRanks_h_, nRanks);
+
+        destinationRanks_host_.clear();
+        destinationRanks_host_.reserve(nRanks);
+    }
 
-                if (rank == ranks(i)) {
-                    idx += 1;
-                }
-            });
-        Kokkos::fence();
+    template <typename T, unsigned Dim, class Mesh, typename... Properties>
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::ensureSendCapacity(size_t nInvalid) {
+        if (nInvalid <= sendIds_capacity_)
+            return;
+
+        // grow geometrically
+        size_t newCap = sendIds_capacity_ ? sendIds_capacity_ : size_t(1024);
+        while (newCap < nInvalid)
+            newCap *= 2;
+
+        sendIds_capacity_ = newCap;
+        Kokkos::realloc(sendIds_d_, newCap);
     }
 
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
-    size_t ParticleSpatialLayout<T, Dim, Mesh, Properties...>::numberOfSends(
-        int rank, const locate_type& ranks) {
-        size_t nSends     = 0;
-        using policy_type = Kokkos::RangePolicy<position_execution_space>;
-        Kokkos::parallel_reduce(
-            "ParticleSpatialLayout::numberOfSends()", policy_type(0, ranks.extent(0)),
-            KOKKOS_LAMBDA(const size_t i, size_t& num) { num += size_t(rank == ranks(i)); },
-            nSends);
-        Kokkos::fence();
-        return nSends;
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::ensureLeavingCapacity(size_t nLocal) {
+        if (nLocal <= leaving_capacity_)
+            return;
+
+        size_t newCap = leaving_capacity_ ? leaving_capacity_ : size_t(1024);
+        while (newCap < nLocal)
+            newCap *= 2;
+
+        leaving_capacity_ = newCap;
+        Kokkos::realloc(leaving_d_, newCap);
+    }
+
+    template <typename T, unsigned Dim, class Mesh, typename... Properties>
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::ensureNeighborsCached() {
+        if (!neighbors_dirty_)
+            return;
+
+        const neighbor_list& neighbors = flayout_m.getNeighbors();
+        const size_type neighborSize   = getNeighborSize(neighbors);
+        neighbors_used_                = neighborSize;
+
+        if (neighborSize > neighbors_capacity_) {
+            neighbors_capacity_ = neighborSize;
+            Kokkos::realloc(neighbors_d_, neighbors_capacity_);
+        }
+
+        neighbors_host_.clear();
+        neighbors_host_.reserve(neighborSize);
+
+        auto neighbors_h = Kokkos::create_mirror_view(neighbors_d_);
+        size_t k         = 0;
+        for (const auto& comp : neighbors) {
+            for (size_t j = 0; j < comp.size(); ++j) {
+                neighbors_h(k) = comp[j];
+                neighbors_host_.push_back(comp[j]);
+                ++k;
+            }
+        }
+
+        Kokkos::deep_copy(
+            Kokkos::subview(neighbors_d_, std::make_pair(size_t(0), size_t(neighborSize))),
+            Kokkos::subview(neighbors_h, std::make_pair(size_t(0), size_t(neighborSize))));
+
+        neighbors_dirty_ = false;
     }
 
 }  // namespace ippl
diff --git a/src/Particle/SortBuffer.h b/src/Particle/SortBuffer.h
new file mode 100644
index 000000000..977db9f59
--- /dev/null
+++ b/src/Particle/SortBuffer.h
@@ -0,0 +1,185 @@
+#ifndef IPPL_SORT_BUFFER_H
+#define IPPL_SORT_BUFFER_H
+
+#include <Kokkos_Core.hpp>
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+
+namespace ippl {
+    namespace detail {
+
+        /**
+         * @brief Persistent scratch buffers for bin-sort-based particle binning.
+         *
+         * Grows on demand, never shrinks. Call finalizeBinSortBuffers() before
+         * Kokkos::finalize() to release device memory safely.
+         *
+         * Buffer layout
+         *   bin_keys      uint64_t[n_particles]   sort keys (bin index per particle)
+         *   permute       size_t  [n_particles]   identity → sorted particle order
+         *   bin_offsets   size_t  [n_bins + 1]    start of each bin in permute
+         *   keys_out      uint64_t[n_particles]   CUB radix-sort output (CUDA only)
+         *   perm_out      size_t  [n_particles]   CUB radix-sort output (CUDA only)
+         *   temp_storage  char    [temp_bytes]    CUB device-temp (CUDA only)
+         */
+        template <typename MemorySpace>
+        class BinSortBuffers {
+        public:
+            using memory_space = MemorySpace;
+            using size_type    = size_t;
+
+            BinSortBuffers()                                 = default;
+            BinSortBuffers(const BinSortBuffers&)            = delete;
+            BinSortBuffers& operator=(const BinSortBuffers&) = delete;
+            BinSortBuffers(BinSortBuffers&&)                 = default;
+            BinSortBuffers& operator=(BinSortBuffers&&)      = default;
+
+            /**
+             * @brief Ensure particle-sized buffers are large enough.
+             *
+             * @param n_particles  Number of particles.
+             * @param n_bins_p1    Number of bins + 1 (size needed for bin_offsets).
+             * @param growth       Over-allocation factor (default 1.2).
+             */
+            void ensureCapacity(size_type n_particles, size_type n_bins_p1, double growth = 1.2) {
+                if (n_particles > particle_capacity_) {
+                    const size_type cap =
+                        std::max(static_cast<size_type>(n_particles * growth), n_particles);
+
+                    bin_keys_ = Kokkos::View<uint64_t*, memory_space>("bin_keys", cap);
+                    permute_  = Kokkos::View<size_type*, memory_space>("permute", cap);
+                    keys_out_ = Kokkos::View<uint64_t*, memory_space>("keys_out", cap);
+                    perm_out_ = Kokkos::View<size_type*, memory_space>("perm_out", cap);
+
+                    particle_capacity_ = cap;
+                }
+
+                if (n_bins_p1 > bin_offset_capacity_) {
+                    const size_type cap =
+                        std::max(static_cast<size_type>(n_bins_p1 * growth), n_bins_p1);
+
+                    bin_offsets_ = Kokkos::View<size_type*, memory_space>("bin_offsets", cap);
+                    bin_offset_capacity_ = cap;
+                }
+            }
+
+            /**
+             * @brief Ensure CUB/rocPRIM temp-storage buffer is large enough.
+             * @param bytes  Required bytes.
+             */
+            void ensureTempStorage(size_t bytes) {
+                if (bytes <= temp_capacity_)
+                    return;
+                const size_t cap = static_cast<size_t>(bytes * 1.1);
+                temp_storage_    = Kokkos::View<char*, memory_space>("temp_storage", cap);
+                temp_capacity_   = cap;
+            }
+
+            // --- accessors -------------------------------------------------------
+            Kokkos::View<uint64_t*, memory_space>& binKeys() { return bin_keys_; }
+            Kokkos::View<size_type*, memory_space>& permute() { return permute_; }
+            Kokkos::View<size_type*, memory_space>& binOffsets() { return bin_offsets_; }
+            Kokkos::View<uint64_t*, memory_space>& keysOut() { return keys_out_; }
+            Kokkos::View<size_type*, memory_space>& permOut() { return perm_out_; }
+            Kokkos::View<char*, memory_space>& tempStorage() { return temp_storage_; }
+
+            size_type particleCapacity() const { return particle_capacity_; }
+            size_type binOffsetCapacity() const { return bin_offset_capacity_; }
+            size_t tempCapacity() const { return temp_capacity_; }
+
+            size_t memoryUsage() const {
+                // Sum the bytes actually held by each view rather than
+                // assuming all of them are populated — keys_out_ / perm_out_
+                // are only allocated on the CUDA fast path.
+                return bin_keys_.span() * sizeof(uint64_t)
+                       + permute_.span() * sizeof(size_type)
+                       + bin_offsets_.span() * sizeof(size_type)
+                       + keys_out_.span() * sizeof(uint64_t)
+                       + perm_out_.span() * sizeof(size_type)
+                       + temp_storage_.span();
+            }
+
+            void clear() {
+                bin_keys_            = {};
+                permute_             = {};
+                bin_offsets_         = {};
+                keys_out_            = {};
+                perm_out_            = {};
+                temp_storage_        = {};
+                particle_capacity_   = 0;
+                bin_offset_capacity_ = 0;
+                temp_capacity_       = 0;
+            }
+
+        private:
+            Kokkos::View<uint64_t*, memory_space> bin_keys_;
+            Kokkos::View<size_type*, memory_space> permute_;
+            Kokkos::View<size_type*, memory_space> bin_offsets_;
+            Kokkos::View<uint64_t*, memory_space> keys_out_;
+            Kokkos::View<size_type*, memory_space> perm_out_;
+            Kokkos::View<char*, memory_space> temp_storage_;
+
+            size_type particle_capacity_   = 0;
+            size_type bin_offset_capacity_ = 0;
+            size_t temp_capacity_          = 0;
+        };
+
+        // ---------------------------------------------------------------------------
+        // Singleton access
+        // ---------------------------------------------------------------------------
+
+        template <typename MemorySpace>
+        class BinSortBuffersHolder {
+        public:
+            static BinSortBuffersHolder& instance() {
+                static BinSortBuffersHolder h;
+                return h;
+            }
+
+            BinSortBuffers<MemorySpace>& get() {
+                if (!buffers_)
+                    buffers_ = std::make_unique<BinSortBuffers<MemorySpace>>();
+                return *buffers_;
+            }
+
+            void finalize() {
+                if (buffers_) {
+                    buffers_->clear();
+                    buffers_.reset();
+                }
+            }
+
+        private:
+            BinSortBuffersHolder()                                       = default;
+            ~BinSortBuffersHolder()                                      = default;
+            BinSortBuffersHolder(const BinSortBuffersHolder&)            = delete;
+            BinSortBuffersHolder& operator=(const BinSortBuffersHolder&) = delete;
+
+            std::unique_ptr<BinSortBuffers<MemorySpace>> buffers_;
+        };
+
+        template <typename MemorySpace>
+        BinSortBuffers<MemorySpace>& getDefaultBinSortBuffers() {
+            return BinSortBuffersHolder<MemorySpace>::instance().get();
+        }
+
+        /**
+         * @brief Release all bin-sort buffers for known memory spaces.
+         *        Call this before Kokkos::finalize().
+         */
+        inline void finalizeBinSortBuffers() {
+#ifdef KOKKOS_ENABLE_CUDA
+            BinSortBuffersHolder<Kokkos::CudaSpace>::instance().finalize();
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+            BinSortBuffersHolder<Kokkos::HIPSpace>::instance().finalize();
+#endif
+            BinSortBuffersHolder<Kokkos::HostSpace>::instance().finalize();
+        }
+
+    }  // namespace detail
+}  // namespace ippl
+
+#endif  // IPPL_SORT_BUFFER_H
\ No newline at end of file
diff --git a/src/Utility/BufferView.h b/src/Utility/BufferView.h
new file mode 100644
index 000000000..732bf8375
--- /dev/null
+++ b/src/Utility/BufferView.h
@@ -0,0 +1,229 @@
+#ifndef IPPL_BUFFERVIEW_H
+#define IPPL_BUFFERVIEW_H
+
+#include <Kokkos_Core.hpp>
+#include <cstddef>
+#include <stdexcept>
+#include <string>
+
+namespace ippl {
+
+    /**
+     * @brief Helper to compute aligned offset
+     */
+    [[nodiscard]] constexpr size_t alignUp(size_t offset, size_t alignment) {
+        return (offset + alignment - 1) & ~(alignment - 1);
+    }
+
+    /**
+     *
+     * @tparam T Element type
+     * @tparam MemorySpace Kokkos memory space
+     */
+    template <typename T, typename MemorySpace = Kokkos::DefaultExecutionSpace::memory_space>
+    class BufferView {
+    public:
+        using view_type = Kokkos::View<T*, MemorySpace>;
+
+        explicit BufferView(size_t count)
+            : view_m("BufferView", count) {}
+
+        ~BufferView() = default;
+
+        // Non-copyable
+        BufferView(const BufferView&)            = delete;
+        BufferView& operator=(const BufferView&) = delete;
+
+        // Movable
+        BufferView(BufferView&&) = default;
+        BufferView& operator=(BufferView&&) = default;
+
+        view_type& getView() { return view_m; }
+        const view_type& getView() const { return view_m; }
+
+        T* data() { return view_m.data(); }
+        const T* data() const { return view_m.data(); }
+        size_t size() const { return view_m.extent(0); }
+
+    private:
+        view_type view_m;
+    };
+
+    /**
+     * @brief Allocates a single buffer and provides multiple aligned views into it.
+     *
+     * This class manages its own memory via a Kokkos View, completely independent
+     * from MPI buffer management. Use this for temporary computation buffers
+     * that don't need MPI communication.
+     *
+     * @tparam MemorySpace The Kokkos memory space for the buffer
+     */
+    template <typename MemorySpace = Kokkos::DefaultExecutionSpace::memory_space>
+    class MultiViewBuffer {
+    public:
+        using buffer_view_type = Kokkos::View<char*, MemorySpace>;
+
+        static constexpr size_t DEFAULT_ALIGNMENT = 256;
+
+        /**
+         * @brief Construct without allocation. Call allocate() before getting views.
+         */
+        MultiViewBuffer()
+            : currentOffset_m(0) {}
+
+        /**
+         * @brief Construct and allocate buffer of specified size.
+         * @param totalBytes Total size in bytes
+         * @param label Optional Kokkos label for debugging
+         */
+        explicit MultiViewBuffer(size_t totalBytes, const std::string& label = "MultiViewBuffer")
+            : buffer_m(label, totalBytes)
+            , currentOffset_m(0) {}
+
+        ~MultiViewBuffer() = default;
+
+        // Non-copyable
+        MultiViewBuffer(const MultiViewBuffer&)            = delete;
+        MultiViewBuffer& operator=(const MultiViewBuffer&) = delete;
+
+        // Movable
+        MultiViewBuffer(MultiViewBuffer&& other) noexcept
+            : buffer_m(std::move(other.buffer_m))
+            , currentOffset_m(other.currentOffset_m) {
+            other.currentOffset_m = 0;
+        }
+
+        MultiViewBuffer& operator=(MultiViewBuffer&& other) noexcept {
+            if (this != &other) {
+                buffer_m              = std::move(other.buffer_m);
+                currentOffset_m       = other.currentOffset_m;
+                other.currentOffset_m = 0;
+            }
+            return *this;
+        }
+
+        /**
+         * @brief Allocate or reallocate the underlying buffer.
+         * @param totalBytes Total size in bytes
+         * @param label Optional Kokkos label for debugging
+         */
+        void allocate(size_t totalBytes, const std::string& label = "MultiViewBuffer") {
+            buffer_m        = buffer_view_type(label, totalBytes);
+            currentOffset_m = 0;
+        }
+
+        /**
+         * @brief Get an aligned view of count elements of type T from the buffer.
+         *
+         * Views are allocated sequentially from the buffer. Each view starts at
+         * an address aligned to DEFAULT_ALIGNMENT (256 bytes).
+         *
+         * @tparam T Element type for the view
+         * @param count Number of elements
+         * @return Unmanaged Kokkos::View pointing into the buffer
+         */
+        template <typename T>
+        Kokkos::View<T*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+        getView(size_t count) {
+            using view_type = Kokkos::View<T*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+
+            size_t alignedOffset = alignUp(currentOffset_m, DEFAULT_ALIGNMENT);
+            size_t requiredSize  = alignedOffset + count * sizeof(T);
+
+            if (requiredSize > buffer_m.size()) {
+                throw std::runtime_error(
+                    "MultiViewBuffer: insufficient space. Required: "
+                    + std::to_string(requiredSize)
+                    + ", Available: " + std::to_string(buffer_m.size()));
+            }
+
+            T* viewPtr = reinterpret_cast<T*>(buffer_m.data() + alignedOffset);
+            currentOffset_m = alignedOffset + count * sizeof(T);
+
+            return view_type(viewPtr, count);
+        }
+
+        /**
+         * @brief Reset the allocation offset to reuse the buffer for new views.
+         *
+         * Previously obtained views become invalid after this call.
+         */
+        void reset() { currentOffset_m = 0; }
+
+        /**
+         * @brief Check if buffer has been allocated.
+         */
+        bool isAllocated() const { return buffer_m.size() > 0; }
+
+        /**
+         * @brief Get the current used size in bytes.
+         */
+        size_t getUsedSize() const { return currentOffset_m; }
+
+        /**
+         * @brief Get the total available size in bytes.
+         */
+        size_t getTotalSize() const { return buffer_m.size(); }
+
+        /**
+         * @brief Get remaining available bytes.
+         */
+        size_t getRemainingSize() const { return getTotalSize() - currentOffset_m; }
+
+        /**
+         * @brief Get raw pointer to buffer start.
+         */
+        char* data() { return buffer_m.data(); }
+        const char* data() const { return buffer_m.data(); }
+
+    private:
+        buffer_view_type buffer_m;
+        size_t currentOffset_m = 0;
+    };
+
+    /**
+     * @brief Helper struct for recursive buffer size computation
+     */
+    template <typename... Ts>
+    struct BufferSizeComputer;
+
+    // Base case: single type
+    template <typename T>
+    struct BufferSizeComputer<T> {
+        static constexpr size_t compute(size_t offset, size_t count) {
+            return alignUp(offset, MultiViewBuffer<>::DEFAULT_ALIGNMENT) + count * sizeof(T);
+        }
+    };
+
+    // Recursive case: multiple types
+    template <typename T, typename... Rest>
+    struct BufferSizeComputer<T, Rest...> {
+        template <typename... Counts>
+        static constexpr size_t compute(size_t offset, size_t count, Counts... counts) {
+            static_assert(sizeof...(Rest) == sizeof...(Counts),
+                          "Number of remaining types must match remaining counts");
+            size_t alignedOffset = alignUp(offset, MultiViewBuffer<>::DEFAULT_ALIGNMENT);
+            size_t newOffset     = alignedOffset + count * sizeof(T);
+            return BufferSizeComputer<Rest...>::compute(newOffset, counts...);
+        }
+    };
+
+    /**
+     * @brief Compute total buffer size needed for multiple views with alignment.
+     *
+     * Usage: auto size = computeBufferSize<double, int, float>(100, 200, 50);
+     *
+     * @tparam Ts Element types for each view
+     * @param counts Number of elements for each view
+     * @return Total required buffer size in bytes
+     */
+    template <typename... Ts, typename... Counts>
+    constexpr size_t computeBufferSize(Counts... counts) {
+        static_assert(sizeof...(Ts) == sizeof...(Counts),
+                      "Number of types must match number of counts");
+        return BufferSizeComputer<Ts...>::compute(0, static_cast<size_t>(counts)...);
+    }
+
+}  // namespace ippl
+
+#endif  // IPPL_BUFFERVIEW_H
\ No newline at end of file
diff --git a/src/Utility/IpplTimings.cpp b/src/Utility/IpplTimings.cpp
index e0f65f36f..93fd9447b 100644
--- a/src/Utility/IpplTimings.cpp
+++ b/src/Utility/IpplTimings.cpp
@@ -28,7 +28,10 @@
 
 #include <algorithm>
 #include <fstream>
+#include <iomanip>
 #include <iostream>
+#include <limits>
+#include <sstream>
 
 #include "Utility/Inform.h"
 #include "Utility/IpplInfo.h"
@@ -53,8 +56,10 @@ const int num_colors    = sizeof(colors) / sizeof(uint32_t);
     }
 #endif
 
+// Static member initialization
 Timing* IpplTimings::instance = new Timing();
 std::stack<Timing*> IpplTimings::stashedInstance;
+const std::vector<double> Timing::emptyMeasurements;
 
 Timing::Timing()
     : TimerList()
@@ -113,6 +118,111 @@ void Timing::clearTimer(TimerRef t) {
     TimerList[t]->clear();
 }
 
+// Reset all timers - useful for warmup
+void Timing::resetAllTimers() {
+    for (unsigned int i = 0; i < TimerList.size(); ++i) {
+        TimerList[i]->clearAll();
+    }
+}
+// Get measurements for a specific timer by reference
+const std::vector<double>& Timing::getMeasurements(TimerRef t) const {
+    if (t >= TimerList.size())
+        return emptyMeasurements;
+    return TimerList[t]->measurements;
+}
+
+// Get measurements for a specific timer by name
+const std::vector<double>& Timing::getMeasurements(const std::string& name) const {
+    TimerMap_t::const_iterator loc = TimerMap.find(name);
+    if (loc == TimerMap.end())
+        return emptyMeasurements;
+    return loc->second->measurements;
+}
+
+// Get measurement count for a timer
+size_t Timing::getMeasurementCount(TimerRef t) const {
+    if (t >= TimerList.size())
+        return 0;
+    return TimerList[t]->measurements.size();
+}
+
+// Get all timer names
+std::vector<std::string> Timing::getTimerNames() const {
+    std::vector<std::string> names;
+    names.reserve(TimerList.size());
+    for (unsigned int i = 0; i < TimerList.size(); ++i) {
+        names.push_back(TimerList[i]->name);
+    }
+    return names;
+}
+
+// Dump all measurements to CSV (default format)
+void Timing::dumpToCSV(const std::string& filename) {
+    dumpToCSV(filename, ",", true);
+}
+
+// Dump all measurements to CSV with custom options
+void Timing::dumpToCSV(const std::string& filename, const std::string& delimiter,
+                       bool includeHeader) {
+    const int rank     = ippl::Comm->rank();
+    const int numRanks = ippl::Comm->size();
+
+    // Each rank serialises its measurements into a local buffer.
+    std::ostringstream localData;
+    for (unsigned int i = 0; i < TimerList.size(); ++i) {
+        TimerInfo* tptr                         = TimerList[i].get();
+        const std::string& timerName            = tptr->name;
+        const std::vector<double>& measurements = tptr->measurements;
+        for (size_t j = 0; j < measurements.size(); ++j) {
+            localData << timerName << delimiter << rank << delimiter << j << delimiter
+                      << std::setprecision(12) << measurements[j] << "\n";
+        }
+    }
+    const std::string localStr = localData.str();
+
+    // Gather sizes from every rank using a single collective rather than
+    // O(P) point-to-point pairs.
+    MPI_Aint localSize = static_cast<MPI_Aint>(localStr.size());
+    std::vector<MPI_Aint> sizes(numRanks, 0);
+    MPI_Gather(&localSize, 1, MPI_AINT, sizes.data(), 1, MPI_AINT, 0,
+               ippl::Comm->getCommunicator());
+
+    // Compute displacements and total size on rank 0; all ranks supply their
+    // bytes via MPI_Gatherv.
+    std::vector<int> intSizes(numRanks, 0);
+    std::vector<int> displs(numRanks, 0);
+    std::vector<char> all;
+    if (rank == 0) {
+        long long total = 0;
+        for (int r = 0; r < numRanks; ++r) {
+            // The MPI standard limits Gatherv counts to int. Real CSV blobs
+            // never exceed INT_MAX in practice; refuse rather than silently
+            // truncate if they do.
+            if (sizes[r] > static_cast<MPI_Aint>(std::numeric_limits<int>::max())) {
+                std::cerr << "Timing::dumpToCSV: rank " << r << " has " << sizes[r]
+                          << " bytes (> INT_MAX); truncating\n";
+            }
+            intSizes[r] = static_cast<int>(std::min<MPI_Aint>(
+                sizes[r], static_cast<MPI_Aint>(std::numeric_limits<int>::max())));
+            displs[r]   = static_cast<int>(total);
+            total += intSizes[r];
+        }
+        all.resize(static_cast<size_t>(total));
+    }
+
+    MPI_Gatherv(localStr.data(), static_cast<int>(localStr.size()), MPI_CHAR, all.data(),
+                intSizes.data(), displs.data(), MPI_CHAR, 0, ippl::Comm->getCommunicator());
+
+    if (rank == 0) {
+        std::ofstream outFile(filename);
+        if (includeHeader) {
+            outFile << "timer_name" << delimiter << "rank" << delimiter << "measurement_id"
+                    << delimiter << "duration_seconds" << "\n";
+        }
+        outFile.write(all.data(), static_cast<std::streamsize>(all.size()));
+    }
+}
+
 // print out the timing results
 void Timing::print() {
     if (TimerList.size() < 1)
@@ -153,8 +263,55 @@ void Timing::print() {
             << std::string().assign(20, ' ') << " Wall min = " << std::setw(10) << wallmin << "\n"
             << "\n";
     }
+
+    msg << "---------------------------------------------\n";
+    msg << "     Measurement counts:\n";
+    msg << "---------------------------------------------\n";
+    for (unsigned int i = 0; i < TimerList.size(); ++i) {
+        TimerInfo* tptr = TimerList[i].get();
+        size_t lengthName = std::min(tptr->name.length(), 19lu);
+        msg << tptr->name.substr(0, lengthName) << std::string().assign(20 - lengthName, '.')
+            << " Count = " << std::setw(10) << tptr->measurements.size() << "\n";
+    }
+
     msg << "---------------------------------------------";
     msg << endl;
+
+    // Per-occurrence statistics
+    msg << "---------------------------------------------\n";
+    msg << "     Per-occurrence statistics:\n";
+    msg << "---------------------------------------------\n";
+    for (unsigned int i = 0; i < TimerList.size(); ++i) {
+        TimerInfo* tptr = TimerList[i].get();
+        const std::vector<double>& m = tptr->measurements;
+        size_t count = m.size();
+        if (count == 0) continue;
+
+        double sum = 0.0, minVal = m[0], maxVal = m[0];
+        for (size_t j = 0; j < count; ++j) {
+            sum += m[j];
+            if (m[j] < minVal) minVal = m[j];
+            if (m[j] > maxVal) maxVal = m[j];
+        }
+        double mean = sum / count;
+
+        double varSum = 0.0;
+        for (size_t j = 0; j < count; ++j) {
+            double diff = m[j] - mean;
+            varSum += diff * diff;
+        }
+        double stddev = (count > 1) ? std::sqrt(varSum / (count - 1)) : 0.0;
+
+        size_t lengthName = std::min(tptr->name.length(), 19lu);
+        msg << tptr->name.substr(0, lengthName) << std::string().assign(20 - lengthName, '.')
+            << " n=" << std::setw(5) << count
+            << "  mean=" << std::setw(10) << mean
+            << "  std=" << std::setw(10) << stddev
+            << "  min=" << std::setw(10) << minVal
+            << "  max=" << std::setw(10) << maxVal
+            << "\n";
+    }
+    msg << "---------------------------------------------" << endl;
 }
 
 // save the timing results into a file
@@ -234,4 +391,4 @@ void IpplTimings::pop() {
     delete instance;
     instance = stashedInstance.top();
     stashedInstance.pop();
-}
+}
\ No newline at end of file
diff --git a/src/Utility/IpplTimings.h b/src/Utility/IpplTimings.h
index aea1332cd..82230faea 100644
--- a/src/Utility/IpplTimings.h
+++ b/src/Utility/IpplTimings.h
@@ -21,6 +21,14 @@
 //    4) print out the results:
 //       IpplTimings::print();
 //
+//    5) reset all timers (for warmup):
+//       IpplTimings::resetAllTimers();
+//    Clears all accumulated times and individual measurements.
+//
+//    6) export to CSV for violin plots:
+//       IpplTimings::dumpToCSV("output.csv");
+//    Exports all individual timing measurements for statistical analysis.
+//
 #ifndef IPPL_TIMINGS_H
 #define IPPL_TIMINGS_H
 
@@ -45,7 +53,8 @@ class IpplTimerInfo {
     IpplTimerInfo()
         : name("")
         , wallTime(0.0)
-        , indx(std::numeric_limits<TimerRef>::max()) {
+        , indx(std::numeric_limits<TimerRef>::max())
+        , measurements() {
         clear();
     }
 
@@ -55,18 +64,19 @@ class IpplTimerInfo {
     // timer operations
     void start() {
         if (!running) {
-            running = true;
-            t.stop();
-            t.clear();
+            t.clear();  // running was false, no need to stop first
             t.start();
+            running = true;
         }
     }
 
     void stop() {
         if (running) {
             t.stop();
-            running = false;
-            wallTime += t.elapsed();
+            running              = false;
+            const double elapsed = t.elapsed();
+            wallTime += elapsed;
+            measurements.push_back(elapsed);
         }
     }
 
@@ -76,6 +86,12 @@ class IpplTimerInfo {
         running = false;
     }
 
+    void clearAll() {
+        clear();
+        wallTime = 0.0;
+        measurements.clear();
+    }
+
     // the IPPL timer that this object manages
     Timer t;
 
@@ -90,6 +106,8 @@ class IpplTimerInfo {
 
     // an index value for this timer
     TimerRef indx;
+
+    std::vector<double> measurements;
 };
 
 struct Timing {
@@ -126,6 +144,20 @@ struct Timing {
     // print the results to a file
     void print(const std::string& fn, const std::map<std::string, unsigned int>& problemSize);
 
+    void resetAllTimers();
+
+    // Format: timer_name,rank,measurement_id,duration
+    void dumpToCSV(const std::string& filename);
+
+    void dumpToCSV(const std::string& filename, const std::string& delimiter, bool includeHeader);
+
+    const std::vector<double>& getMeasurements(TimerRef t) const;
+    const std::vector<double>& getMeasurements(const std::string& name) const;
+
+    size_t getMeasurementCount(TimerRef t) const;
+
+    std::vector<std::string> getTimerNames() const;
+
     // type of storage for list of TimerInfo
     typedef std::vector<my_auto_ptr<TimerInfo> > TimerList_t;
     typedef std::map<std::string, TimerInfo*> TimerMap_t;
@@ -136,6 +168,9 @@ struct Timing {
 
     // a map of timers, keyed by string
     TimerMap_t TimerMap;
+
+    // Returned by reference when a requested timer is not found.
+    static const std::vector<double> emptyMeasurements;
 };
 
 class IpplTimings {
@@ -173,6 +208,36 @@ class IpplTimings {
     static void stash();
     static void pop();
 
+    // Use this after warmup iterations to get clean timing data
+    static void resetAllTimers() { instance->resetAllTimers(); }
+
+    // Format: timer_name,rank,measurement_id,duration_seconds
+    // Perfect for creating violin plots in Python/R
+    static void dumpToCSV(const std::string& filename) {
+        instance->dumpToCSV(filename);
+    }
+
+    static void dumpToCSV(const std::string& filename,
+                         const std::string& delimiter,
+                         bool includeHeader = true) {
+        instance->dumpToCSV(filename, delimiter, includeHeader);
+    }
+
+    static const std::vector<double>& getMeasurements(TimerRef t) {
+        return instance->getMeasurements(t);
+    }
+    static const std::vector<double>& getMeasurements(const std::string& name) {
+        return instance->getMeasurements(name);
+    }
+
+    static size_t getMeasurementCount(TimerRef t) {
+        return instance->getMeasurementCount(t);
+    }
+
+    static std::vector<std::string> getTimerNames() {
+        return instance->getTimerNames();
+    }
+
 private:
     // type of storage for list of TimerInfo
     typedef Timing::TimerList_t TimerList_t;
@@ -188,4 +253,4 @@ class IpplTimings {
     static std::stack<Timing*> stashedInstance;
 };
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/Utility/ParallelDispatch.h b/src/Utility/ParallelDispatch.h
index 437c56c22..4e342aa02 100644
--- a/src/Utility/ParallelDispatch.h
+++ b/src/Utility/ParallelDispatch.h
@@ -7,6 +7,7 @@
 #define IPPL_PARALLEL_DISPATCH_H
 
 #include <Kokkos_Core.hpp>
+#include "Ippl.h"
 
 #include <tuple>
 
@@ -68,8 +69,8 @@ namespace ippl {
             }
             return policy_type(begin, end);
         }
-        // Silences incorrect nvcc warning: missing return statement at end of non-void function
-        throw IpplException("detail::getRangePolicy", "Unreachable state");
+        // Silences nvcc "missing return" at end of exhaustive if constexpr.
+        __builtin_unreachable();
     }
 
     /*!
@@ -94,8 +95,58 @@ namespace ippl {
         } else {
             return policy_type(begin, end);
         }
-        // Silences incorrect nvcc warning: missing return statement at end of non-void function
-        throw IpplException("detail::createRangePolicy", "Unreachable state");
+        // Silences nvcc "missing return" at end of exhaustive if constexpr.
+        __builtin_unreachable();
+    }
+
+    template <int... Is, typename F>
+    KOKKOS_FORCEINLINE_FUNCTION void for_constexpr(std::integer_sequence<int, Is...>, F&& f) {
+        (f.template operator()<Is>(), ...);
+    }
+
+    template <int... Is, typename F>
+    KOKKOS_FORCEINLINE_FUNCTION auto product_over(std::integer_sequence<int, Is...>, F&& f) {
+        return (f.template operator()<Is>() * ...);
+    }
+
+    template <int N, typename F>
+    KOKKOS_FORCEINLINE_FUNCTION auto product_over(F&& f) {
+        return product_over(std::make_integer_sequence<int, N>{}, std::forward<F>(f));
+    }
+
+    template <int Dim, typename Team, typename Extents, typename F>
+    KOKKOS_FORCEINLINE_FUNCTION void thread_vector_md_for(const Team& team, const Extents& extents,
+                                                          F&& f) {
+        [&]<int... Is>(std::integer_sequence<int, Is...>) {
+            Kokkos::parallel_for(
+                Kokkos::ThreadVectorMDRange<Kokkos::Rank<Dim>, Team>(team, extents[Is]...),
+                std::forward<F>(f));
+        }(std::make_integer_sequence<int, Dim>{});
+    }
+
+    template <int Dim, int W, typename Team, typename F>
+    KOKKOS_FORCEINLINE_FUNCTION void thread_vector_stencil_for(const Team& team, F&& f) {
+        constexpr int TotalCells = [] {
+            int r = 1;
+            for (int i = 0; i < Dim; ++i)
+                r *= W;
+            return r;
+        }();
+
+        Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, TotalCells), [&](const int flat_idx) {
+            // Compile-time unrollable index decomposition
+            Kokkos::Array<int, Dim> idx;
+            int tmp = flat_idx;
+            for (int d = Dim - 1; d >= 0; --d) {
+                idx[d] = tmp % W;
+                tmp /= W;
+            }
+
+            // Call with index pack
+            [&]<int... Is>(std::integer_sequence<int, Is...>) {
+                f(idx[Is]...);
+            }(std::make_integer_sequence<int, Dim>{});
+        });
     }
 
     namespace detail {
@@ -121,13 +172,50 @@ namespace ippl {
 
         enum e_functor_type {
             FOR,
-            REDUCE,
-            SCAN
+            REDUCE
         };
 
         template <e_functor_type, typename, typename, typename, typename...>
         struct FunctorWrapper;
 
+        template <typename ExecSpace>
+        constexpr bool inline isGPUSpace = false;
+
+#ifdef KOKKOS_ENABLE_CUDA
+        template <>
+        constexpr bool inline isGPUSpace<Kokkos::Cuda> = true;
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+        template <>
+        constexpr bool inline isGPUSpace<Kokkos::HIP> = true;
+#endif
+
+        // Dispatches F(i) for i in [0, n) either in parallel (OpenMP host)
+        // when MPI_THREAD_MULTIPLE is available, or serially otherwise.
+        // `f(i)` must be synchronous w.r.t. its own work — this dispatcher
+        // fences after the parallel_for so the caller's next operation sees
+        // a consistent state.
+        template <typename F>
+        void parallelForMPI(size_t n, F&& f) {
+            constexpr bool useGPU = isGPUSpace<Kokkos::DefaultExecutionSpace>;
+            const bool threadSafe = Env->threadMultiple();
+
+            if constexpr (useGPU) {
+                if (threadSafe) {
+                    Kokkos::parallel_for(
+                        "Parallel dispatch",
+                        Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n), [&](int i) {
+                            f(i);
+                        });
+                    Kokkos::fence();
+                    return;
+                }
+            }
+            for (size_t i = 0; i < n; ++i) {
+                f(i);
+            }
+        }
+
         /*!
          * Wrapper struct for reduction kernels
          * Source:
@@ -179,9 +267,7 @@ namespace ippl {
             static constexpr int rank = Kokkos::MDRangePolicy<T...>::rank;
         };
         template <typename T>
-        concept HasMemberValueType = requires() {
-                                         { typename T::value_type() };
-                                     };
+        concept HasMemberValueType = requires { typename T::value_type; };
         template <typename T>
         struct ExtractReducerReturnType {
             using type = T;
@@ -227,6 +313,31 @@ namespace ippl {
                 functor),
             std::forward<ReducerArgument>(reducer)...);
     }
+
+    template <std::size_t I, typename T, typename... Rest>
+    KOKKOS_FORCEINLINE_FUNCTION auto get_arg(T first, Rest... rest) {
+        if constexpr (I == 0) {
+            return first;
+        } else {
+            return get_arg<I - 1>(rest...);
+        }
+    }
+
+    template <typename Grid, typename Base, typename... Stencils>
+    KOKKOS_FORCEINLINE_FUNCTION decltype(auto) grid_at(Grid& grid, const Base& base, int team_rank,
+                                                       Stencils... stencil_idx) {
+        return [&]<std::size_t... Is>(std::index_sequence<Is...>) -> decltype(auto) {
+            return grid((base(team_rank, Is) + get_arg<Is>(stencil_idx...))...);
+        }(std::index_sequence_for<Stencils...>{});
+    }
+
+    template <int D, typename Grid, typename Base, typename Stencils>
+    KOKKOS_FORCEINLINE_FUNCTION decltype(auto) grid_at_t(Grid& grid, const Base& base,
+                                                         int team_rank, Stencils& stencil_idx) {
+        return [&]<std::size_t... Is>(std::index_sequence<Is...>) -> decltype(auto) {
+            return grid((base(team_rank, Is) + stencil_idx[Is])...);
+        }(std::make_index_sequence<D>{});
+    }
 }  // namespace ippl
 
 #endif
diff --git a/src/Utility/ParameterList.h b/src/Utility/ParameterList.h
index d31f46d5d..ad9a2da11 100644
--- a/src/Utility/ParameterList.h
+++ b/src/Utility/ParameterList.h
@@ -64,6 +64,8 @@ namespace ippl {
             return std::get<T>(params_m.at(key));
         }
 
+        bool contains(const std::string& key) const { return params_m.contains(key); }
+
         /*!
          * Obtain the value of a parameter. If the key is
          * not contained, the default value is returned.
diff --git a/src/Utility/Timer.cpp b/src/Utility/Timer.cpp
index d6da687bd..de2b4335e 100644
--- a/src/Utility/Timer.cpp
+++ b/src/Utility/Timer.cpp
@@ -8,8 +8,17 @@
 
 #include "Timer.h"
 
+#include <chrono>
+
 bool Timer::enableFences = IPPL_ENABLE_TIMER_FENCES;
 
+namespace {
+    using clock_type = std::chrono::high_resolution_clock;
+    inline std::int64_t now_ticks() {
+        return clock_type::now().time_since_epoch().count();
+    }
+}  // namespace
+
 Timer::Timer() {
     this->clear();
 }
@@ -19,16 +28,22 @@ void Timer::clear() {
 }
 
 void Timer::start() {
-    start_m = std::chrono::high_resolution_clock::now();
+    // Fence on start as well: any in-flight kernel from before the timer was
+    // armed otherwise leaks its tail latency into the measured interval.
+    if (enableFences) {
+        Kokkos::fence("Timer Fence (start)");
+    }
+    start_m = now_ticks();
 }
 
 void Timer::stop() {
     if (enableFences) {
-        Kokkos::fence();
+        Kokkos::fence("Timer Fence (stop)");
     }
-    stop_m = std::chrono::high_resolution_clock::now();
+    stop_m = now_ticks();
 
-    duration_type elapsed = stop_m - start_m;
+    std::chrono::duration<double> elapsed =
+        clock_type::duration(stop_m) - clock_type::duration(start_m);
 
     elapsed_m += elapsed.count();
 }
diff --git a/src/Utility/Timer.h b/src/Utility/Timer.h
index a23d26f28..653eae588 100644
--- a/src/Utility/Timer.h
+++ b/src/Utility/Timer.h
@@ -8,17 +8,14 @@
 #define IPPL_TIMER_H
 
 #ifndef IPPL_ENABLE_TIMER_FENCES
-#warning "IPPL timer fences were not set via CMake! Defaulting to no fences."
+#pragma message("IPPL_ENABLE_TIMER_FENCES not set by CMake; defaulting to false.")
 #define IPPL_ENABLE_TIMER_FENCES false
 #endif
 
-#include <chrono>
+#include <cstdint>
 
 class Timer {
 public:
-    using timer_type    = std::chrono::time_point<std::chrono::high_resolution_clock>;
-    using duration_type = std::chrono::duration<double>;
-
     static bool enableFences;
 
     Timer();
@@ -31,7 +28,7 @@ class Timer {
 
 private:
     double elapsed_m;
-    timer_type start_m, stop_m;
+    std::int64_t start_m = 0, stop_m = 0;
 };
 
 #endif
diff --git a/src/Utility/Tuning.h b/src/Utility/Tuning.h
new file mode 100644
index 000000000..8f5177b69
--- /dev/null
+++ b/src/Utility/Tuning.h
@@ -0,0 +1,213 @@
+#ifndef IPPL_TUNING_H
+#define IPPL_TUNING_H
+
+#include <Kokkos_Core.hpp>
+#include <algorithm>
+#include <functional>
+#include <limits>
+#include <vector>
+
+// Kokkos's tuning interface is still exposed only through this internal
+// header in 5.0.x. When a public header (Kokkos_Tools.hpp) becomes
+// available across the supported version range, switch to it.
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+namespace ippl {
+
+template <unsigned Dim, typename TileType>
+class TileSizeTuner {
+public:
+    using TileConfig = TileType;
+
+private:
+    // Tuning parameters (following Kokkos convention)
+    static constexpr double tuning_min  = 0.0;
+    static constexpr double tuning_max  = 1.0;
+    static constexpr double tuning_step = 0.05;  // 20 steps per dimension
+
+    std::array<size_t, Dim> variable_ids_{};
+    size_t context_id_   = 0;
+    bool initialized_    = false;
+    bool context_active_ = false;
+
+    std::vector<int> candidates_;
+    TileConfig default_tile_;
+    size_t max_scratch_ = 0;
+    std::function<size_t(const TileConfig&)> scratch_calc_;
+
+public:
+    TileSizeTuner() = default;
+
+    template <typename ScratchCalculator>
+    void initialize(const std::string& kernel_name,
+                    const std::vector<int>& candidates,
+                    size_t max_scratch,
+                    ScratchCalculator&& calc,
+                    const TileConfig& default_tile) {
+        if (initialized_) return;
+
+        candidates_    = candidates;
+        max_scratch_   = max_scratch;
+        scratch_calc_  = std::forward<ScratchCalculator>(calc);
+        default_tile_  = default_tile;
+
+        // Sort candidates for consistent mapping
+        std::sort(candidates_.begin(), candidates_.end());
+
+        if (Kokkos::Tools::Experimental::have_tuning_tool()) {
+            using namespace Kokkos::Tools::Experimental;
+
+            for (unsigned d = 0; d < Dim; ++d) {
+                VariableInfo info;
+                info.type          = ValueType::kokkos_value_double;
+                info.category      = StatisticalCategory::kokkos_value_interval;
+                info.valueQuantity = CandidateValueType::kokkos_value_range;
+                info.candidates    = make_candidate_range(
+                    tuning_min, tuning_max, tuning_step, false, false);
+
+                variable_ids_[d] = declare_output_type(
+                    kernel_name + "_tile_size_" + std::to_string(d), info);
+            }
+        }
+
+        initialized_ = true;
+    }
+
+    bool is_initialized() const { return initialized_; }
+
+    // Map normalized value [0,1] to candidate index. With no candidates the
+    // tuner has nothing to choose from, so return 1 as a defensive default.
+    int map_to_candidate(double normalized) const {
+        if (candidates_.empty()) return 1;
+
+        normalized = std::clamp(normalized, 0.0, 1.0);
+
+        size_t idx = static_cast<size_t>(normalized * (candidates_.size() - 1) + 0.5);
+        idx        = std::min(idx, candidates_.size() - 1);
+
+        return candidates_[idx];
+    }
+
+    // Map candidate value back to normalized [0,1]. Returns 0.0 (matching the
+    // behaviour of an empty/single-element candidate set) when the candidate
+    // list cannot meaningfully be mapped.
+    double map_to_normalized(int candidate) const {
+        if (candidates_.size() <= 1) return 0.0;
+
+        auto it    = std::lower_bound(candidates_.begin(), candidates_.end(), candidate);
+        size_t idx = (it != candidates_.end()) ? std::distance(candidates_.begin(), it)
+                                               : candidates_.size() - 1;
+
+        return static_cast<double>(idx) / (candidates_.size() - 1);
+    }
+
+    TileConfig begin() {
+        if (!initialized_) {
+            return default_tile_;
+        }
+
+        TileConfig result = default_tile_;
+
+        if (Kokkos::Tools::Experimental::have_tuning_tool()) {
+            using namespace Kokkos::Tools::Experimental;
+
+            context_id_ = get_new_context_id();
+            begin_context(context_id_);
+            context_active_ = true;
+
+            // Request tuned values for each dimension
+            std::array<VariableValue, Dim> values;
+            for (unsigned d = 0; d < Dim; ++d) {
+                double default_norm = map_to_normalized(default_tile_[d]);
+                values[d] = make_variable_value(variable_ids_[d], default_norm);
+            }
+
+            request_output_values(context_id_, Dim, values.data());
+
+            // Map normalized values back to tile sizes
+            for (unsigned d = 0; d < Dim; ++d) {
+                result[d] = map_to_candidate(values[d].value.double_value);
+            }
+
+            // Validate scratch constraint - scale down if needed
+            result = fit_to_scratch(result);
+        }
+
+        return result;
+    }
+
+    void end() {
+        if (context_active_ && Kokkos::Tools::Experimental::have_tuning_tool()) {
+            Kokkos::Tools::Experimental::end_context(context_id_);
+            context_active_ = false;
+        }
+    }
+
+private:
+    // Scale down proportionally if tile doesn't fit in scratch.
+    TileConfig fit_to_scratch(const TileConfig& tile) const {
+        if (scratch_calc_(tile) <= max_scratch_) {
+            return tile;
+        }
+
+        // Binary search for the largest scale factor that fits. The result
+        // space is integer-valued (we snap to a discrete candidate after
+        // scaling), so the number of meaningful iterations is bounded by
+        // log2(max tile dim), capped at 20 for robustness.
+        int max_tile = 1;
+        for (unsigned d = 0; d < Dim; ++d) {
+            max_tile = std::max(max_tile, tile[d]);
+        }
+        int max_iter = std::min(20, static_cast<int>(std::ceil(std::log2(max_tile + 1)) + 1));
+
+        TileConfig result = tile;
+        double lo = 0.0, hi = 1.0;
+        for (int iter = 0; iter < max_iter; ++iter) {
+            double mid = (lo + hi) / 2.0;
+
+            TileConfig test;
+            for (unsigned d = 0; d < Dim; ++d) {
+                int scaled = static_cast<int>(tile[d] * mid);
+                test[d]    = snap_to_candidate(std::max(1, scaled));
+            }
+
+            if (scratch_calc_(test) <= max_scratch_) {
+                lo     = mid;
+                result = test;
+            } else {
+                hi = mid;
+            }
+        }
+
+        // Final fallback - smallest candidate. If even that overflows the
+        // scratch budget the kernel cannot run; throw so the caller doesn't
+        // get a confusing Kokkos::abort deeper in the dispatch.
+        if (scratch_calc_(result) > max_scratch_) {
+            for (unsigned d = 0; d < Dim; ++d) {
+                result[d] = candidates_.front();
+            }
+            if (scratch_calc_(result) > max_scratch_) {
+                throw std::runtime_error(
+                    "TileSizeTuner::fit_to_scratch: even the smallest tile "
+                    "exceeds the scratch budget");
+            }
+        }
+
+        return result;
+    }
+
+    int snap_to_candidate(int value) const {
+        if (candidates_.empty()) return value;
+
+        // Find largest candidate <= value
+        auto it = std::upper_bound(candidates_.begin(), candidates_.end(), value);
+        if (it == candidates_.begin()) {
+            return candidates_.front();
+        }
+        return *std::prev(it);
+    }
+};
+
+}  // namespace ippl
+
+#endif  // IPPL_TUNING_H
\ No newline at end of file
diff --git a/src/Utility/TypeUtils.h b/src/Utility/TypeUtils.h
index 9b47a3729..bc888e6d8 100644
--- a/src/Utility/TypeUtils.h
+++ b/src/Utility/TypeUtils.h
@@ -295,10 +295,9 @@ namespace ippl {
                 } else {
                     return spaceToIndex<Space, Idx + 1>();
                 }
-                // Silences incorrect nvcc warning: missing return statement at end of non-void
-                // function
-                throw IpplException("detail::MultispaceContainer::spaceToIndex",
-                                    "Unreachable state");
+                // Suppress nvcc "missing return" warning at the end of an
+                // exhaustive if constexpr chain.
+                __builtin_unreachable();
             }
 
             /*!
@@ -311,18 +310,16 @@ namespace ippl {
 
             /*!
              * Determine whether the element for a space should be initialized,
-             * possibly based on a predicate functor
+             * possibly based on a predicate functor. A nullptr_t Filter
+             * unconditionally enables the copy.
              */
-            template <typename MemorySpace, typename Filter,
-                      std::enable_if_t<std::is_null_pointer_v<std::decay_t<Filter>>, int> = 0>
-            constexpr bool copyToSpace(Filter&&) {
-                return true;
-            }
-
-            template <typename MemorySpace, typename Filter,
-                      std::enable_if_t<!std::is_null_pointer_v<std::decay_t<Filter>>, int> = 0>
+            template <typename MemorySpace, typename Filter>
             bool copyToSpace(Filter&& predicate) {
-                return predicate.template operator()<MemorySpace>();
+                if constexpr (std::is_null_pointer_v<std::decay_t<Filter>>) {
+                    return true;
+                } else {
+                    return predicate.template operator()<MemorySpace>();
+                }
             }
 
         public:
@@ -341,10 +338,22 @@ namespace ippl {
             template <typename DataType, typename Filter = std::nullptr_t>
             MultispaceContainer(const DataType& data, Filter&& predicate = nullptr)
                 : MultispaceContainer() {
-                using space = typename DataType::memory_space;
-                static_assert(std::is_same_v<DataType, Type<space>>);
-
-                elements_m[spaceToIndex<space>()] = data;
+                static_assert(Kokkos::is_view<DataType>::value, "DataType must be a Kokkos::View");
+
+                using space    = typename DataType::memory_space;
+                using expected = Type<space>;
+
+                static_assert(std::is_same_v<typename DataType::memory_space,
+                                             typename expected::memory_space>,
+                              "Hash view must live in the same memory_space as expected.");
+                static_assert(std::is_same_v<typename DataType::non_const_value_type,
+                                             typename expected::non_const_value_type>,
+                              "Hash view must have the same value_type as expected.");
+                static_assert(DataType::rank == expected::rank,
+                              "Hash view must have the same rank as expected.");
+
+                expected normalized = data;
+                elements_m[spaceToIndex<space>()] = normalized;
                 copyToOtherSpaces<space>(predicate);
             }
 
@@ -433,6 +442,40 @@ namespace ippl {
             runner(all_spaces{});
         }
     }  // namespace detail
+
+    template <typename T>
+    struct is_complex : std::false_type {};
+
+    template <typename T>
+    struct is_complex<Kokkos::complex<T>> : std::true_type {};
+
+    template <typename T>
+    inline constexpr bool is_complex_v = is_complex<T>::value;
+
+    template <typename T>
+    KOKKOS_FORCEINLINE_FUNCTION decltype(auto) real_part(T& val) {
+        if constexpr (is_complex_v<std::remove_cv_t<T>>) {
+            return val.real();
+        } else {
+            return val;
+        }
+    }
+
+    template <typename GridT, typename T>
+    KOKKOS_FORCEINLINE_FUNCTION decltype(auto) to_grid_value(T& val) {
+        if constexpr (is_complex_v<std::remove_cv_t<T>> && !is_complex_v<std::remove_cv_t<GridT>>) {
+            return val.real();
+        } else {
+            return val;
+        }
+    }
+
+    constexpr int int_pow(int base, int exp) {
+        int result = 1;
+        for (int i = 0; i < exp; ++i)
+            result *= base;
+        return result;
+    }
 }  // namespace ippl
 
 #endif
diff --git a/src/Utility/ViewUtils.h b/src/Utility/ViewUtils.h
index 6ef5e8906..dbb36b2e7 100644
--- a/src/Utility/ViewUtils.h
+++ b/src/Utility/ViewUtils.h
@@ -85,8 +85,7 @@ namespace ippl {
         template <typename T, unsigned Dim, class... Properties>
         void write(const typename ViewType<T, Dim, Properties...>::view_type& view,
                    std::ostream& out = std::cout) {
-            using view_type = typename ViewType<T, Dim, Properties...>::view_type;
-            typename view_type::host_mirror_type hview = Kokkos::create_mirror_view(view);
+            auto hview = Kokkos::create_mirror_view(view);
             Kokkos::deep_copy(hview, view);
 
             nestedViewLoop(
diff --git a/unit_tests/Communicate/BufferHandler.cpp b/unit_tests/Communicate/BufferHandler.cpp
index eef387da9..4612373e7 100644
--- a/unit_tests/Communicate/BufferHandler.cpp
+++ b/unit_tests/Communicate/BufferHandler.cpp
@@ -13,6 +13,14 @@ template <typename MemorySpace>
 class TypedBufferHandlerTest : public ::testing::Test {
 protected:
     using memory_space = MemorySpace;
+    static constexpr size_t page_size = 4096;
+
+    static size_t roundedSize(size_t size) {
+        if (size < page_size) {
+            return page_size;
+        }
+        return ((size + page_size - 1) / page_size) * page_size;
+    }
 
     class TestableBufferHandler : public ippl::DefaultBufferHandler<memory_space> {
     public:
@@ -37,7 +45,7 @@ TYPED_TEST_SUITE(TypedBufferHandlerTest, MemorySpaces);
 TYPED_TEST(TypedBufferHandlerTest, GetBuffer_EmptyFreeBuffers) {
     auto buffer = this->handler->getBuffer(100, 1.0);
     ASSERT_NE(buffer, nullptr);
-    EXPECT_EQ(buffer->getBufferSize(), 100);
+    EXPECT_EQ(buffer->getBufferSize(), TestFixture::roundedSize(100));
     EXPECT_EQ(this->handler->usedBuffersSize(), 1);
     EXPECT_EQ(this->handler->freeBuffersSize(), 0);
 }
@@ -48,7 +56,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetBuffer_SuitableBufferAvailable) {
     this->handler->freeBuffer(buffer1);
 
     auto buffer2 = this->handler->getBuffer(40, 1.0);
-    EXPECT_EQ(buffer2->getBufferSize(), 50);
+    EXPECT_EQ(buffer2->getBufferSize(), TestFixture::roundedSize(50));
     EXPECT_EQ(this->handler->usedBuffersSize(), 1);
     EXPECT_EQ(this->handler->freeBuffersSize(), 0);
 }
@@ -91,7 +99,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetBuffer_ResizeLargerThanAvailable) {
     this->handler->freeBuffer(smallBuffer);
 
     auto largeBuffer = this->handler->getBuffer(200, 1.0);
-    EXPECT_EQ(largeBuffer->getBufferSize(), 200);
+    EXPECT_EQ(largeBuffer->getBufferSize(), TestFixture::roundedSize(200));
     EXPECT_EQ(this->handler->usedBuffersSize(), 1);
     EXPECT_EQ(this->handler->freeBuffersSize(), 0);
 }
@@ -102,7 +110,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetBuffer_ExactSizeMatch) {
     this->handler->freeBuffer(buffer1);
 
     auto buffer2 = this->handler->getBuffer(100, 1.0);
-    EXPECT_EQ(buffer2->getBufferSize(), 100);
+    EXPECT_EQ(buffer2->getBufferSize(), TestFixture::roundedSize(100));
     EXPECT_EQ(this->handler->usedBuffersSize(), 1);
     EXPECT_EQ(this->handler->freeBuffersSize(), 0);
 }
@@ -147,7 +155,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_EmptyHandler) {
 // Test: Allocating increases used buffer size correctly
 TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_AfterBufferAllocation) {
     auto buffer = this->handler->getBuffer(100, 1.0);
-    EXPECT_EQ(this->handler->getUsedSize(), 100);
+    EXPECT_EQ(this->handler->getUsedSize(), buffer->getBufferSize());
     EXPECT_EQ(this->handler->getFreeSize(), 0);
 }
 
@@ -157,7 +165,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_AfterFreeBuffer) {
     this->handler->freeBuffer(buffer);
 
     EXPECT_EQ(this->handler->getUsedSize(), 0);
-    EXPECT_EQ(this->handler->getFreeSize(), 100);
+    EXPECT_EQ(this->handler->getFreeSize(), buffer->getBufferSize());
 }
 
 // Test: Correct size is computed after freeing all buffers
@@ -168,7 +176,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_AfterFreeAllBuffers)
     this->handler->freeAllBuffers();
 
     EXPECT_EQ(this->handler->getUsedSize(), 0);
-    EXPECT_EQ(this->handler->getFreeSize(), 150);
+    EXPECT_EQ(this->handler->getFreeSize(), buffer1->getBufferSize() + buffer2->getBufferSize());
 }
 
 // Test: Deleting all buffers results in zero free or used buffer sizes
@@ -189,7 +197,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_ResizeBufferLargerTha
 
     auto largeBuffer = this->handler->getBuffer(200, 1.0);
 
-    EXPECT_EQ(this->handler->getUsedSize(), 200);
+    EXPECT_EQ(this->handler->getUsedSize(), largeBuffer->getBufferSize());
     EXPECT_EQ(this->handler->getFreeSize(), 0);
 }
 
diff --git a/unit_tests/Particle/CMakeLists.txt b/unit_tests/Particle/CMakeLists.txt
index 18359b944..78c4e9bfe 100644
--- a/unit_tests/Particle/CMakeLists.txt
+++ b/unit_tests/Particle/CMakeLists.txt
@@ -5,3 +5,5 @@ add_ippl_test(ParticleBase)
 add_ippl_test(ParticleBC)
 add_ippl_test(ParticleSendRecv)
 add_ippl_test(GatherScatterTest)
+add_ippl_test(ParticleUpdate)
+add_ippl_test(ParticleUpdateNonuniform)
diff --git a/unit_tests/Particle/ParticleBase.cpp b/unit_tests/Particle/ParticleBase.cpp
index 276afc25a..b2229a456 100644
--- a/unit_tests/Particle/ParticleBase.cpp
+++ b/unit_tests/Particle/ParticleBase.cpp
@@ -78,6 +78,7 @@ TYPED_TEST(ParticleBaseTest, CreateAndDestroy) {
     pbase->destroy(invalid, 500);
 
     // Verify remaining indices
+    mirror = pbase->ID.getHostMirror();
     Kokkos::deep_copy(mirror, pbase->ID.getView());
     for (int i = 0; i < 500; ++i) {
         // The even indices contain the original particles
diff --git a/unit_tests/Particle/ParticleSendRecv.cpp b/unit_tests/Particle/ParticleSendRecv.cpp
index 9b57b85aa..28e8ee0ca 100644
--- a/unit_tests/Particle/ParticleSendRecv.cpp
+++ b/unit_tests/Particle/ParticleSendRecv.cpp
@@ -33,7 +33,7 @@ class ParticleSendRecv<Parameters<T, ExecSpace, Rank<Dim>>> : public ::testing::
 
         ~Bunch() = default;
 
-        using charge_container_type = ippl::ParticleAttrib<T, ExecSpace>;
+        using charge_container_type = ippl::ParticleAttrib<T>;
 
         rank_type expectedRank;
         charge_container_type Q;
@@ -64,10 +64,10 @@ class ParticleSendRecv<Parameters<T, ExecSpace, Rank<Dim>>> : public ::testing::
             origin[d] = 0;
         }
 
-        layout  = flayout_type(MPI_COMM_WORLD, owned, isParallel);
-        mesh    = mesh_type(owned, hx, origin);
+        layout      = flayout_type(MPI_COMM_WORLD, owned, isParallel);
+        mesh        = mesh_type(owned, hx, origin);
         playout_ptr = std::make_shared<playout_type>(layout, mesh);
-        bunch = std::make_shared<bunch_type>(*playout_ptr);
+        bunch       = std::make_shared<bunch_type>(*playout_ptr);
 
         using BC = ippl::BC;
 
@@ -109,7 +109,7 @@ class ParticleSendRecv<Parameters<T, ExecSpace, Rank<Dim>>> : public ::testing::
         using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<2>, ExecSpace>;
 
         RegionLayout_t RLayout           = playout_ptr->getRegionLayout();
-        auto& positions                  = bunch->R.getView();
+        auto positions                   = bunch->R.getView();
         region_view Regions              = RLayout.getdLocalRegions();
         typename rank_type::view_type ER = bunch->expectedRank.getView();
 
diff --git a/unit_tests/Particle/ParticleUpdate.cpp b/unit_tests/Particle/ParticleUpdate.cpp
new file mode 100644
index 000000000..f5faa73e8
--- /dev/null
+++ b/unit_tests/Particle/ParticleUpdate.cpp
@@ -0,0 +1,830 @@
+//
+// Unit test: Particle spatial layout update / neighbor migration
+//
+// Tests that ippl::ParticleSpatialLayout::update() correctly migrates
+// particles to the rank that owns the region containing the particle's
+// position.  The suite covers:
+//
+//   1.  Baseline sanity – random particles already on correct rank.
+//   2.  Single deliberate out-of-bounds displacement (each axis / direction).
+//   3.  Corner / edge displacement (multi-axis simultaneous crossing).
+//   4.  Periodic wrap-around: particles placed just outside the global domain
+//       boundary reappear on the opposite rank.
+//   5.  All particles seeded on rank 0 and redistributed globally.
+//   6.  All particles seeded on the last rank and redistributed globally.
+//   7.  Non-uniform layouts: more cells assigned to one rank than the others,
+//       so that a single neighbour may span many grid cells.
+//   8.  Conservation: total particle count is invariant under update().
+//   9.  Charge conservation: sum of Q is invariant under update().
+//  10.  Multiple successive updates leave the layout stable.
+//  11.  Large particle count stress test.
+//  12.  Boundary-exact positions (particles sitting exactly on a rank
+//       boundary should belong to a well-defined rank, not be lost).
+//  13.  Zero-particle ranks: a rank with no particles participates correctly.
+//  14.  Heterogeneous displacement magnitudes in the same step.
+//  15.  3-D corner migration (all three axes crossed simultaneously).
+//
+#include "Ippl.h"
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include "TestUtils.h"
+#include "gtest/gtest.h"
+
+// ============================================================
+//  Helper types
+// ============================================================
+
+template <typename T, typename ExecSpace, unsigned Dim>
+struct TestBunch
+    : public ippl::ParticleBase<
+          ippl::ParticleSpatialLayout<T, Dim, ippl::UniformCartesian<T, Dim>, ExecSpace>> {
+    using playout_type =
+        ippl::ParticleSpatialLayout<T, Dim, ippl::UniformCartesian<T, Dim>, ExecSpace>;
+
+    explicit TestBunch(playout_type& pl)
+        : ippl::ParticleBase<playout_type>(pl) {
+        this->addAttribute(Q);
+        this->addAttribute(tag);
+    }
+
+    // Per-particle charge (used for conservation checks)
+    ippl::ParticleAttrib<T, ExecSpace> Q;
+
+    // Integer tag so we can track individual particles across ranks
+    ippl::ParticleAttrib<long long, ExecSpace> tag;
+};
+
+// ============================================================
+//  Fixture
+// ============================================================
+
+template <typename>
+class TestParticleUpdate;
+
+template <typename T_, typename ExecSpace, unsigned Dim_>
+class TestParticleUpdate<Parameters<T_, ExecSpace, Rank<Dim_>>> : public ::testing::Test {
+public:
+    // ---- type aliases --------------------------------------------------
+    using T                       = T_;
+    static constexpr unsigned Dim = Dim_;
+    using flayout_type            = ippl::FieldLayout<Dim>;
+    using mesh_type               = ippl::UniformCartesian<T, Dim>;
+    using playout_type            = ippl::ParticleSpatialLayout<T, Dim, mesh_type, ExecSpace>;
+    using RegionLayout_t          = typename playout_type::RegionLayout_t;
+    using bunch_type              = TestBunch<T, ExecSpace, Dim>;
+    using position_type           = ippl::Vector<T, Dim>;
+    using region_view             = typename RegionLayout_t::view_type;
+
+    // ---- construction --------------------------------------------------
+    TestParticleUpdate()
+        : nPoints(getGridSizes<Dim>()) {
+        for (unsigned d = 0; d < Dim; d++) {
+            domain[d] = static_cast<T>(nPoints[d]) / 16.0;
+        }
+
+        std::array<ippl::Index, Dim> args;
+        for (unsigned d = 0; d < Dim; d++)
+            args[d] = ippl::Index(nPoints[d]);
+        auto owned = std::make_from_tuple<ippl::NDIndex<Dim>>(args);
+
+        ippl::Vector<T, Dim> hx, origin;
+        std::array<bool, Dim> isParallel;
+        isParallel.fill(true);
+        for (unsigned d = 0; d < Dim; d++) {
+            hx[d]     = domain[d] / nPoints[d];
+            origin[d] = 0;
+        }
+
+        layout      = std::make_shared<flayout_type>(MPI_COMM_WORLD, owned, isParallel);
+        mesh        = std::make_shared<mesh_type>(owned, hx, origin);
+        playout_ptr = std::make_shared<playout_type>(*layout, *mesh);
+    }
+
+    // ---- helpers -------------------------------------------------------
+
+    /// Create a fresh bunch and set periodic BCs.
+    std::shared_ptr<bunch_type> makeBunch() {
+        auto b = std::make_shared<bunch_type>(*playout_ptr);
+        typename bunch_type::bc_container_type bcs;
+        bcs.fill(ippl::BC::PERIODIC);
+        b->setParticleBC(bcs);
+        return b;
+    }
+
+    /// Place n particles with uniformly random positions on every rank.
+    void fillRandom(bunch_type& b, unsigned n, unsigned long seed = 42) {
+        int nRanks       = ippl::Comm->size();
+        unsigned perRank = std::max(1u, n / nRanks);
+        b.create(perRank);
+
+        std::mt19937_64 eng(seed + ippl::Comm->rank());
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+
+        auto R_host = b.R.getHostMirror();
+        auto Q_host = b.Q.getHostMirror();
+        auto t_host = b.tag.getHostMirror();
+
+        // Global unique id = rank * kRankIdStride + local index. The stride
+        // bounds the per-rank particle count; 10M is comfortably above any
+        // unit-test bunch size and stays well clear of int32 overflow.
+        constexpr long long kRankIdStride = 10'000'000LL;
+        for (size_t i = 0; i < b.getLocalNum(); ++i) {
+            position_type r;
+            for (unsigned d = 0; d < Dim; d++)
+                r[d] = unif(eng) * domain[d];
+            R_host(i) = r;
+            Q_host(i) = T(1);
+            t_host(i) = static_cast<long long>(ippl::Comm->rank()) * kRankIdStride
+                        + static_cast<long long>(i);
+        }
+        Kokkos::deep_copy(b.R.getView(), R_host);
+        Kokkos::deep_copy(b.Q.getView(), Q_host);
+        Kokkos::deep_copy(b.tag.getView(), t_host);
+    }
+
+    /// Reduce total particle count across all ranks.
+    size_t totalParticles(const bunch_type& b) const {
+        size_t local = b.getLocalNum();
+        size_t total = 0;
+        ippl::Comm->reduce(local, total, 1, std::plus<size_t>());
+        return total;
+    }
+
+    /// Reduce total charge across all ranks.
+    T totalCharge(bunch_type& b) const {
+        auto Q_host = b.Q.getHostMirror();
+        Kokkos::deep_copy(Q_host, b.Q.getView());
+        T local = T(0);
+        for (size_t i = 0; i < b.getLocalNum(); ++i)
+            local += Q_host(i);
+        T total = T(0);
+        ippl::Comm->reduce(local, total, 1, std::plus<T>());
+        return total;
+    }
+
+    /// For every locally owned particle verify it lies inside the local region.
+    /// Returns the number of misplaced particles on this rank.
+    size_t countMisplaced(bunch_type& b) {
+        RegionLayout_t rl   = playout_ptr->getRegionLayout();
+        region_view regions = rl.getdLocalRegions();
+        int myRank          = ippl::Comm->rank();
+
+        auto R_host = b.R.getHostMirror();
+        Kokkos::deep_copy(R_host, b.R.getView());
+
+        // Copy region bounds to host for checking
+        auto regions_host = Kokkos::create_mirror_view(regions);
+        Kokkos::deep_copy(regions_host, regions);
+
+        size_t misplaced = 0;
+        for (size_t i = 0; i < b.getLocalNum(); ++i) {
+            bool inMyRegion = true;
+            for (unsigned d = 0; d < Dim; d++) {
+                T pos = R_host(i)[d];
+                inMyRegion &=
+                    (pos >= regions_host(myRank)[d].min() && pos <= regions_host(myRank)[d].max());
+            }
+            if (!inMyRegion)
+                ++misplaced;
+        }
+        return misplaced;
+    }
+
+    /// Wrap a coordinate periodically into [0, domainLen).
+    T periodicWrap(T x, T domainLen) const {
+        while (x < T(0))
+            x += domainLen;
+        while (x >= domainLen)
+            x -= domainLen;
+        return x;
+    }
+
+    // ---- data members --------------------------------------------------
+    std::array<size_t, Dim> nPoints;
+    std::array<T, Dim> domain;
+
+    std::shared_ptr<flayout_type> layout;
+    std::shared_ptr<mesh_type> mesh;
+    std::shared_ptr<playout_type> playout_ptr;
+};
+
+using Tests = TestParams::tests<1, 2, 3>;
+TYPED_TEST_SUITE(TestParticleUpdate, Tests);
+
+// ============================================================
+//  1.  Baseline: random particles already on the correct rank
+// ============================================================
+TYPED_TEST(TestParticleUpdate, BaselineSanityNoMigrationNeeded) {
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    // update() should not lose or duplicate particles
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+    const size_t after = this->totalParticles(*bunch);
+
+    EXPECT_EQ(before, after);
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  2.  Conservation of total particle count
+// ============================================================
+TYPED_TEST(TestParticleUpdate, TotalParticleCountConserved) {
+    constexpr unsigned N = 512;
+    auto bunch           = this->makeBunch();
+    this->fillRandom(*bunch, N);
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+//  3.  Conservation of total charge
+// ============================================================
+TYPED_TEST(TestParticleUpdate, TotalChargeConserved) {
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    // Give every particle a distinct charge so any loss/duplication shows up
+    {
+        auto Q_host = bunch->Q.getHostMirror();
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            Q_host(i) = static_cast<typename TestFixture::T>(i + 1);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const auto chargeBefore = this->totalCharge(*bunch);
+    bunch->update();
+    const auto chargeAfter = this->totalCharge(*bunch);
+
+    // Allow small floating-point rounding
+    EXPECT_NEAR(static_cast<double>(chargeBefore), static_cast<double>(chargeAfter),
+                1e-6 * std::abs(static_cast<double>(chargeBefore)));
+}
+
+// ============================================================
+//  4.  Single-axis displacement – particles must migrate to neighbour
+// ============================================================
+TYPED_TEST(TestParticleUpdate, SingleAxisDisplacementMigratesCorrectly) {
+    using T = typename TestFixture::T;
+
+    // Only meaningful with more than one rank
+    if (ippl::Comm->size() < 2)
+        GTEST_SKIP();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+
+    // Move every local particle half the domain length along axis 0
+    // (wrapping with period).  After update, everyone must be on the
+    // owning rank.
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        const T shift = this->domain[0] / T(2);
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            R_host(i)[0] = this->periodicWrap(R_host(i)[0] + shift, this->domain[0]);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  5.  Multi-axis (corner) displacement
+// ============================================================
+TYPED_TEST(TestParticleUpdate, CornerDisplacementMigratesCorrectly) {
+    using T = typename TestFixture::T;
+
+    if (ippl::Comm->size() < 2)
+        GTEST_SKIP();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+
+    // Shift along every axis simultaneously
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                const T shift = this->domain[d] * T(0.3);
+                R_host(i)[d]  = this->periodicWrap(R_host(i)[d] + shift, this->domain[d]);
+            }
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  6.  Periodic wrap-around: particles placed just outside global domain
+// ============================================================
+TYPED_TEST(TestParticleUpdate, PeriodicWrapAroundRightBoundary) {
+    using T = typename TestFixture::T;
+
+    auto bunch = this->makeBunch();
+
+    // Place one particle per rank just beyond the upper global boundary
+    // in axis 0; periodic BC should wrap it back.
+    bunch->create(1);
+    {
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        // Place at domain[0] + small epsilon → wraps to near 0
+        T eps        = this->domain[0] * T(1e-4);
+        R_host(0)[0] = this->domain[0] + eps;
+        for (unsigned d = 1; d < TestFixture::Dim; d++)
+            R_host(0)[d] = this->domain[d] * T(0.5);
+        Q_host(0) = T(1);
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+//  7.  Periodic wrap-around: just outside lower global boundary
+// ============================================================
+TYPED_TEST(TestParticleUpdate, PeriodicWrapAroundLeftBoundary) {
+    using T = typename TestFixture::T;
+
+    auto bunch = this->makeBunch();
+    bunch->create(1);
+    {
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        T eps       = this->domain[0] * T(1e-4);
+        // Slightly negative position wraps to near domain[0]
+        R_host(0)[0] = -eps;
+        for (unsigned d = 1; d < TestFixture::Dim; d++)
+            R_host(0)[d] = this->domain[d] * T(0.5);
+        Q_host(0) = T(1);
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+//  8.  All particles seeded on rank 0 → global redistribution
+// ============================================================
+TYPED_TEST(TestParticleUpdate, AllParticlesOnRank0Redistributed) {
+    using T = typename TestFixture::T;
+
+    unsigned N = ippl::Comm->rank() == 0 ? 256 : 0;
+    auto bunch = this->makeBunch();
+
+    bunch->create(N);
+    if (ippl::Comm->rank() == 0) {
+        std::mt19937_64 eng(1234);
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (unsigned i = 0; i < N; ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = unif(eng) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    ASSERT_EQ(before, static_cast<size_t>(N));
+
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  9.  All particles seeded on last rank → global redistribution
+// ============================================================
+TYPED_TEST(TestParticleUpdate, AllParticlesOnLastRankRedistributed) {
+    using T      = typename TestFixture::T;
+    int lastRank = ippl::Comm->size() - 1;
+    unsigned N   = ippl::Comm->rank() == lastRank ? 256 : 0;
+    auto bunch   = this->makeBunch();
+    bunch->create(N);
+
+    if (ippl::Comm->rank() == lastRank) {
+        std::mt19937_64 eng(5678);
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (unsigned i = 0; i < N; ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = unif(eng) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  10. Multiple successive updates are idempotent
+// ============================================================
+TYPED_TEST(TestParticleUpdate, MultipleSuccessiveUpdatesStable) {
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+
+    const size_t before = this->totalParticles(*bunch);
+    for (int iter = 0; iter < 5; ++iter) {
+        bunch->update();
+        EXPECT_EQ(before, this->totalParticles(*bunch));
+        EXPECT_EQ(0u, this->countMisplaced(*bunch));
+    }
+}
+
+// ============================================================
+//  11. Successive updates after each random displacement
+// ============================================================
+TYPED_TEST(TestParticleUpdate, SuccessiveDisplacementsConserveParticles) {
+    using T = typename TestFixture::T;
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+    const size_t N = this->totalParticles(*bunch);
+
+    std::mt19937_64 eng(999 + ippl::Comm->rank());
+    std::uniform_real_distribution<T> unif(-T(0.1), T(0.1));
+
+    for (int step = 0; step < 8; ++step) {
+        // Small random displacements (staying in domain with wrapping)
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T newPos     = R_host(i)[d] + unif(eng) * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]);
+            }
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+
+        bunch->update();
+        EXPECT_EQ(N, this->totalParticles(*bunch)) << "at step " << step;
+    }
+}
+
+// ============================================================
+//  12. Boundary-exact positions: particles at rank boundaries
+// ============================================================
+TYPED_TEST(TestParticleUpdate, BoundaryExactPositionsNotLost) {
+    using T = typename TestFixture::T;
+
+    // Each rank creates one particle exactly at the domain's mid-point
+    // along axis 0 (a likely rank boundary for 2+ ranks).
+    auto bunch = this->makeBunch();
+    bunch->create(1);
+    {
+        auto R_host  = bunch->R.getHostMirror();
+        auto Q_host  = bunch->Q.getHostMirror();
+        R_host(0)[0] = this->domain[0] / T(2);
+        for (unsigned d = 1; d < TestFixture::Dim; d++)
+            R_host(0)[d] = this->domain[d] / T(2);
+        Q_host(0) = T(1);
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    // The particle must survive, though ownership may shift
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+//  13. Zero-particle ranks participate correctly
+// ============================================================
+TYPED_TEST(TestParticleUpdate, ZeroParticleRanksDoNotDeadlock) {
+    using T = typename TestFixture::T;
+
+    // Only rank 0 creates particles
+    auto bunch = this->makeBunch();
+    int N      = ippl::Comm->rank() == 0 ? 64 : 0;
+    bunch->create(N);
+
+    if (ippl::Comm->rank() == 0) {
+        std::mt19937_64 eng(7777);
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (int i = 0; i < N; ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = unif(eng) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    // This must not deadlock even if some ranks hold zero particles
+    ASSERT_NO_THROW(bunch->update());
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+//  14. Heterogeneous displacement magnitudes (small + large mixed)
+// ============================================================
+TYPED_TEST(TestParticleUpdate, HeterogeneousDisplacementsMigrateCorrectly) {
+    using T = typename TestFixture::T;
+
+    if (ippl::Comm->size() < 2)
+        GTEST_SKIP();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            // Even-indexed particles get a tiny nudge; odd ones jump ~70%.
+            const T fraction = (i % 2 == 0) ? T(0.01) : T(0.7);
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T newPos     = R_host(i)[d] + fraction * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]);
+            }
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  15. Large-count stress test
+// ============================================================
+TYPED_TEST(TestParticleUpdate, LargeParticleCountConserved) {
+    constexpr unsigned N = 4096;
+    auto bunch           = this->makeBunch();
+    this->fillRandom(*bunch, N, /*seed=*/31415);
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  16. Non-uniform particle distribution: load-imbalanced initial state
+// ============================================================
+TYPED_TEST(TestParticleUpdate, NonUniformInitialDistributionRedistributesCorrectly) {
+    using T = typename TestFixture::T;
+
+    auto bunch = this->makeBunch();
+
+    // Rank r gets r+1 particles, placed uniformly at random.
+    unsigned nLocal = static_cast<unsigned>(ippl::Comm->rank() + 1);
+    bunch->create(nLocal);
+    {
+        std::mt19937_64 eng(2468 + ippl::Comm->rank());
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (unsigned i = 0; i < nLocal; ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = unif(eng) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  17. Full-domain traversal: particle moves across the entire domain
+// ============================================================
+TYPED_TEST(TestParticleUpdate, FullDomainTraversalConservesParticles) {
+    using T = typename TestFixture::T;
+
+    auto bunch = this->makeBunch();
+    bunch->create(8);
+    {
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            // Place near origin
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = T(1e-5) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();  // initial valid placement
+
+    // Now jump every particle to the diagonally opposite corner
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T newPos     = R_host(i)[d] + this->domain[d] * T(0.999);
+                R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]);
+            }
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    bunch->update();
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  18. Check that tags (IDs) are preserved through migration
+// ============================================================
+TYPED_TEST(TestParticleUpdate, ParticleTagsPreservedAfterMigration) {
+    using T = typename TestFixture::T;
+
+    if (ippl::Comm->size() < 2)
+        GTEST_SKIP();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+
+    // Record all local tags and charges before update
+    std::vector<std::pair<long long, T>> localBefore;
+    {
+        auto t_host = bunch->tag.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        Kokkos::deep_copy(t_host, bunch->tag.getView());
+        Kokkos::deep_copy(Q_host, bunch->Q.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            localBefore.push_back({t_host(i), Q_host(i)});
+    }
+
+    // Displace so migration happens
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T newPos     = R_host(i)[d] + this->domain[d] * T(0.4);
+                R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]);
+            }
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    bunch->update();
+
+    // Gather tags from all ranks and verify the global set is identical
+    // (no duplicates, no missing)
+    std::vector<long long> localTagsAfter;
+    {
+        auto t_host = bunch->tag.getHostMirror();
+        Kokkos::deep_copy(t_host, bunch->tag.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            localTagsAfter.push_back(t_host(i));
+    }
+
+    // Each rank sorts its local list; then we check global count via reduce
+    size_t localCount  = localTagsAfter.size();
+    size_t globalCount = 0;
+    ippl::Comm->reduce(localCount, globalCount, 1, std::plus<size_t>());
+
+    // Reconstruct original count
+    size_t originalCount  = localBefore.size();
+    size_t globalOriginal = 0;
+    ippl::Comm->reduce(originalCount, globalOriginal, 1, std::plus<size_t>());
+
+    EXPECT_EQ(globalOriginal, globalCount);
+}
+
+// ============================================================
+//  19. Periodic wrap: particle moved exactly one full domain length
+// ============================================================
+TYPED_TEST(TestParticleUpdate, ExactOneDomainLengthShiftConservesParticles) {
+    using T = typename TestFixture::T;
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 64);
+    const size_t before = this->totalParticles(*bunch);
+
+    // Shift by exactly one domain length: after wrapping, position unchanged.
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T newPos     = R_host(i)[d] + this->domain[d];
+                R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]);
+            }
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    bunch->update();
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+//  20. Repeated create-and-update cycles (simulates particle injection)
+// ============================================================
+TYPED_TEST(TestParticleUpdate, RepeatedCreateAndUpdateCycles) {
+    using T = typename TestFixture::T;
+
+    auto bunch        = this->makeBunch();
+    size_t cumulative = 0;
+
+    for (int cycle = 0; cycle < 4; ++cycle) {
+        // Inject 32 more particles on rank 0 each cycle
+        constexpr unsigned inject = 32;
+        int N                     = ippl::Comm->rank() == 0 ? inject : 0;
+        size_t currentLocal       = bunch->getLocalNum();
+        bunch->create(N);
+
+        if (ippl::Comm->rank() == 0) {
+            std::mt19937_64 eng(cycle * 100);
+            std::uniform_real_distribution<T> unif(T(0), T(1));
+            auto R_host = bunch->R.getHostMirror();
+            auto Q_host = bunch->Q.getHostMirror();
+            Kokkos::deep_copy(R_host, bunch->R.getView());
+            Kokkos::deep_copy(Q_host, bunch->Q.getView());
+
+            for (unsigned i = currentLocal; i < currentLocal + inject; ++i) {
+                for (unsigned d = 0; d < TestFixture::Dim; d++)
+                    R_host(i)[d] = unif(eng) * this->domain[d];
+                Q_host(i) = T(1);
+            }
+            Kokkos::deep_copy(bunch->R.getView(), R_host);
+            Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+            cumulative += inject;
+        }
+        ippl::Comm->barrier();
+
+        bunch->update();
+
+        size_t total = this->totalParticles(*bunch);
+        if (ippl::Comm->rank() == 0) {
+            EXPECT_EQ(cumulative, total) << "at cycle " << cycle;
+        }
+        EXPECT_EQ(0u, this->countMisplaced(*bunch)) << "at cycle " << cycle;
+    }
+}
+
+// ============================================================
+//  Entry point
+// ============================================================
+int main(int argc, char* argv[]) {
+    int success = 1;
+    ippl::initialize(argc, argv);
+    {
+        ::testing::InitGoogleTest(&argc, argv);
+        success = RUN_ALL_TESTS();
+    }
+    ippl::finalize();
+    return success;
+}
\ No newline at end of file
diff --git a/unit_tests/Particle/ParticleUpdateNonuniform.cpp b/unit_tests/Particle/ParticleUpdateNonuniform.cpp
new file mode 100644
index 000000000..331e577c4
--- /dev/null
+++ b/unit_tests/Particle/ParticleUpdateNonuniform.cpp
@@ -0,0 +1,937 @@
+//
+// Unit test: Particle spatial layout update with non-uniform (ORB) decompositions
+//
+// These tests exercise particle migration after OrthogonalRecursiveBisection
+// has repartitioned the domain.  The ORB algorithm can produce highly asymmetric
+// layouts where a single rank may span many more grid cells than its neighbour,
+// so the usual assumption "every rank has the same width" does not hold.
+//
+// Minimum rank requirement:  all tests that exercise multi-rank migration are
+// guarded by GTEST_SKIP() when fewer than MIN_RANKS MPI processes are available.
+// The recommended launch is  mpirun -n 4  (or 8 for the higher-coverage tests).
+//
+// Test inventory
+// ──────────────
+//  1.  ORB layout created with uniform density  →  baseline count conservation
+//  2.  ORB layout created with a Gaussian density bump  →  asymmetric widths
+//  3.  Particles displaced uniformly after ORB  →  all migrate to correct owner
+//  4.  Particles displaced toward the heavy region  →  burst migration into narrow rank
+//  5.  Repeated ORB repartition between steps  →  counts conserved at each step
+//  6.  All particles on rank 0, ORB applied, then update  →  full redistribution
+//  7.  Charge conservation through ORB + update cycle
+//  8.  Tags (IDs) survive ORB + update
+//  9.  Zero-particle ranks after ORB do not deadlock
+// 10.  Large-count stress test with ORB decomposition
+// 11.  Periodic wrap across ORB boundaries
+// 12.  Multiple ORB repartitions with particle injection between steps
+// 13.  ORB with a step-function density  →  one rank gets 1 cell, another N-1
+// 14.  Successive displacements across ORB boundaries
+// 15.  ORB repartition in 3-D with diagonal displacement
+//
+#include "Ippl.h"
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <memory>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include "Decomposition/OrthogonalRecursiveBisection.h"
+#include "TestUtils.h"
+#include "gtest/gtest.h"
+
+// ============================================================
+//  Minimum ranks required for multi-rank migration tests
+// ============================================================
+static constexpr int MIN_RANKS  = 2;  // absolute minimum
+static constexpr int PREF_RANKS = 4;  // preferred for asymmetry tests
+
+// ============================================================
+//  Particle bunch
+// ============================================================
+template <typename T, typename ExecSpace, unsigned Dim>
+struct OrbBunch
+    : public ippl::ParticleBase<
+          ippl::ParticleSpatialLayout<T, Dim, ippl::UniformCartesian<T, Dim>, ExecSpace>> {
+    using playout_type =
+        ippl::ParticleSpatialLayout<T, Dim, ippl::UniformCartesian<T, Dim>, ExecSpace>;
+
+    explicit OrbBunch(playout_type& pl)
+        : ippl::ParticleBase<playout_type>(pl) {
+        this->addAttribute(Q);
+        this->addAttribute(tag);
+    }
+
+    ippl::ParticleAttrib<T, ExecSpace> Q;            // per-particle charge
+    ippl::ParticleAttrib<long long, ExecSpace> tag;  // unique ID
+};
+
+// ============================================================
+//  Fixture
+// ============================================================
+template <typename>
+class TestParticleUpdateORB;
+
+template <typename T_, typename ExecSpace, unsigned Dim_>
+class TestParticleUpdateORB<Parameters<T_, ExecSpace, Rank<Dim_>>> : public ::testing::Test {
+public:
+    using T = T_;
+    static constexpr unsigned Dim = Dim_;
+    // ---- type aliases --------------------------------------------------
+    using flayout_type   = ippl::FieldLayout<Dim>;
+    using mesh_type      = ippl::UniformCartesian<T, Dim>;
+    using playout_type   = ippl::ParticleSpatialLayout<T, Dim, mesh_type, ExecSpace>;
+    using RegionLayout_t = typename playout_type::RegionLayout_t;
+    using bunch_type     = OrbBunch<T, ExecSpace, Dim>;
+    using position_type  = ippl::Vector<T, Dim>;
+    using Mesh_t = ippl::UniformCartesian<double, Dim>;
+    using Centering_t = Mesh_t::DefaultCentering;
+
+    // Density field type used by ORB
+    using field_type = ippl::Field<T, Dim, mesh_type, Centering_t, ExecSpace>;
+    using orb_type   = ippl::OrthogonalRecursiveBisection<field_type, T>;
+
+    // ---- construction --------------------------------------------------
+    TestParticleUpdateORB()
+        : nPoints(getGridSizes<Dim>()) {
+        for (unsigned d = 0; d < Dim; d++)
+            domain[d] = static_cast<T>(nPoints[d]) / 16.0;
+
+        std::array<ippl::Index, Dim> args;
+        for (unsigned d = 0; d < Dim; d++)
+            args[d] = ippl::Index(nPoints[d]);
+        gDomain = std::make_from_tuple<ippl::NDIndex<Dim>>(args);
+
+        ippl::Vector<T, Dim> hx, origin;
+        std::array<bool, Dim> isParallel;
+        isParallel.fill(true);
+        for (unsigned d = 0; d < Dim; d++) {
+            hx[d]     = domain[d] / nPoints[d];
+            origin[d] = 0;
+        }
+
+        layout      = std::make_shared<flayout_type>(MPI_COMM_WORLD, gDomain, isParallel);
+        mesh        = std::make_shared<mesh_type>(gDomain, hx, origin);
+        playout_ptr = std::make_shared<playout_type>(*layout, *mesh);
+    }
+
+    // ---- ORB helpers ---------------------------------------------------
+
+    /// Build a uniform-density field (all weights = 1) and run ORB.
+    bool orbUniform() {
+        field_type rho(*mesh, *layout);
+        rho = T(1);
+        orb_type orb;
+        orb.initialize(*layout, *mesh, rho);
+        // binaryRepartition needs particle positions; pass empty attrib
+        ippl::ParticleAttrib<position_type, ExecSpace> emptyR;
+        return orb.binaryRepartition(emptyR, *layout, /*isFirstRepartition=*/true);
+    }
+
+    /// Build a Gaussian density bump centred in the domain and run ORB.
+    /// The heavier region forces ORB to cut near the centre, creating
+    /// asymmetric sub-domains along each axis.
+    bool orbGaussian(T sigma = T(0.15)) {
+        field_type rho(*mesh, *layout);
+
+        // Fill the density field on-device: Gaussian centred at 0.5
+        auto view   = rho.getView();
+        auto hx_vec = mesh->getMeshSpacing();
+        auto orig   = mesh->getOrigin();
+
+        // Kokkos::parallel_for("GaussianRho", ippl::createRangePolicy<Dim, ExecSpace>(rho.getOwned()),
+        //                      KOKKOS_LAMBDA(/* index args */){
+        //                          // Generic lambda for any Dim via index_array_type
+        //                      });
+
+        // Simpler: fill via host mirror
+        auto rho_host = rho.getHostMirror();
+        auto lDomH    = layout->getLocalNDIndex();
+        for (int gz = (Dim > 2 ? lDomH[2].first() : 0); gz <= (Dim > 2 ? lDomH[2].last() : 0);
+             ++gz) {
+            for (int gy = (Dim > 1 ? lDomH[1].first() : 0); gy <= (Dim > 1 ? lDomH[1].last() : 0);
+                 ++gy) {
+                for (int gx = lDomH[0].first(); gx <= lDomH[0].last(); ++gx) {
+                    T val = T(1);
+                    // Compute normalised coordinate and Gaussian weight
+                    auto computeCoord = [&](int idx, unsigned d) -> T {
+                        return (static_cast<T>(idx) + T(0.5)) / static_cast<T>(nPoints[d]);
+                    };
+                    T x  = computeCoord(gx, 0) - T(0.5);
+                    T r2 = x * x;
+                    if constexpr (Dim > 1) {
+                        T y = computeCoord(gy, 1) - T(0.5);
+                        r2 += y * y;
+                    }
+                    if constexpr (Dim > 2) {
+                        T z = computeCoord(gz, 2) - T(0.5);
+                        r2 += z * z;
+                    }
+                    val = std::exp(-r2 / (T(2) * sigma * sigma));
+
+                    // Map global index to local+ghost index
+                    int lx = gx - lDomH[0].first() + rho.getNghost();
+                    if constexpr (Dim == 1)
+                        rho_host(lx) = val;
+                    else if constexpr (Dim == 2) {
+                        int ly           = gy - lDomH[1].first() + rho.getNghost();
+                        rho_host(lx, ly) = val;
+                    } else {
+                        int ly               = gy - lDomH[1].first() + rho.getNghost();
+                        int lz               = gz - lDomH[2].first() + rho.getNghost();
+                        rho_host(lx, ly, lz) = val;
+                    }
+                }
+            }
+        }
+        Kokkos::deep_copy(rho.getView(), rho_host);
+
+        orb_type orb;
+        orb.initialize(*layout, *mesh, rho);
+        ippl::ParticleAttrib<position_type, ExecSpace> emptyR;
+        return orb.binaryRepartition(emptyR, *layout, /*isFirstRepartition=*/true);
+    }
+
+    /// Build a step-function density: cells in [0, nPoints[0]/4) get weight 100,
+    /// the rest get weight 1.  This forces one rank to be very narrow (few cells)
+    /// while the others are wide.
+    bool orbStepFunction() {
+        field_type rho(*mesh, *layout);
+        auto rho_host = rho.getHostMirror();
+        auto lDomH    = layout->getLocalNDIndex();
+        int cutGlobal = static_cast<int>(nPoints[0]) / 4;
+
+        for (int gz = (Dim > 2 ? lDomH[2].first() : 0); gz <= (Dim > 2 ? lDomH[2].last() : 0);
+             ++gz) {
+            for (int gy = (Dim > 1 ? lDomH[1].first() : 0); gy <= (Dim > 1 ? lDomH[1].last() : 0);
+                 ++gy) {
+                for (int gx = lDomH[0].first(); gx <= lDomH[0].last(); ++gx) {
+                    T val  = (gx < cutGlobal) ? T(100) : T(1);
+                    int lx = gx - lDomH[0].first() + rho.getNghost();
+                    if constexpr (Dim == 1)
+                        rho_host(lx) = val;
+                    else if constexpr (Dim == 2) {
+                        int ly           = gy - lDomH[1].first() + rho.getNghost();
+                        rho_host(lx, ly) = val;
+                    } else {
+                        int ly               = gy - lDomH[1].first() + rho.getNghost();
+                        int lz               = gz - lDomH[2].first() + rho.getNghost();
+                        rho_host(lx, ly, lz) = val;
+                    }
+                }
+            }
+        }
+        Kokkos::deep_copy(rho.getView(), rho_host);
+
+        orb_type orb;
+        orb.initialize(*layout, *mesh, rho);
+        ippl::ParticleAttrib<position_type, ExecSpace> emptyR;
+        return orb.binaryRepartition(emptyR, *layout, /*isFirstRepartition=*/true);
+    }
+
+    /// Rebuild the playout after the layout has been repartitioned.
+    void rebuildPlayout() { playout_ptr = std::make_shared<playout_type>(*layout, *mesh); }
+
+    // ---- bunch helpers -------------------------------------------------
+
+    std::shared_ptr<bunch_type> makeBunch() {
+        auto b = std::make_shared<bunch_type>(*playout_ptr);
+        typename bunch_type::bc_container_type bcs;
+        bcs.fill(ippl::BC::PERIODIC);
+        b->setParticleBC(bcs);
+        return b;
+    }
+
+    void fillRandom(bunch_type& b, unsigned n, unsigned long seed = 42) {
+        int nRanks   = ippl::Comm->size();
+        unsigned per = std::max(1u, n / nRanks);
+        b.create(per);
+
+        std::mt19937_64 eng(seed + ippl::Comm->rank());
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+
+        auto R_host = b.R.getHostMirror();
+        auto Q_host = b.Q.getHostMirror();
+        auto t_host = b.tag.getHostMirror();
+
+        for (size_t i = 0; i < b.getLocalNum(); ++i) {
+            position_type r;
+            for (unsigned d = 0; d < Dim; d++)
+                r[d] = unif(eng) * domain[d];
+            R_host(i) = r;
+            Q_host(i) = T(1);
+            t_host(i) =
+                static_cast<long long>(ippl::Comm->rank()) * 10000000LL + static_cast<long long>(i);
+        }
+        Kokkos::deep_copy(b.R.getView(), R_host);
+        Kokkos::deep_copy(b.Q.getView(), Q_host);
+        Kokkos::deep_copy(b.tag.getView(), t_host);
+    }
+
+    // ---- assertion helpers ---------------------------------------------
+
+    size_t totalParticles(const bunch_type& b) const {
+        size_t local = b.getLocalNum(), total = 0;
+        ippl::Comm->reduce(local, total, 1, std::plus<size_t>());
+        return total;
+    }
+
+    T totalCharge(bunch_type& b) const {
+        auto Q_host = b.Q.getHostMirror();
+        Kokkos::deep_copy(Q_host, b.Q.getView());
+        T local = T(0);
+        for (size_t i = 0; i < b.getLocalNum(); ++i)
+            local += Q_host(i);
+        T total = T(0);
+        ippl::Comm->reduce(local, total, 1, std::plus<T>());
+        return total;
+    }
+
+    /// Count locally owned particles that lie outside the owning rank's region.
+    size_t countMisplaced(bunch_type& b) {
+        RegionLayout_t rl = playout_ptr->getRegionLayout();
+        auto regions      = rl.getdLocalRegions();
+        auto regions_host = Kokkos::create_mirror_view(regions);
+        Kokkos::deep_copy(regions_host, regions);
+        int myRank = ippl::Comm->rank();
+
+        auto R_host = b.R.getHostMirror();
+        Kokkos::deep_copy(R_host, b.R.getView());
+
+        size_t bad = 0;
+        for (size_t i = 0; i < b.getLocalNum(); ++i) {
+            bool ok = true;
+            for (unsigned d = 0; d < Dim; d++) {
+                T p = R_host(i)[d];
+                ok &= (p >= regions_host(myRank)[d].min() && p <= regions_host(myRank)[d].max());
+            }
+            if (!ok)
+                ++bad;
+        }
+        return bad;
+    }
+
+    T periodicWrap(T x, T L) const {
+        while (x < T(0))
+            x += L;
+        while (x >= L)
+            x -= L;
+        return x;
+    }
+
+    /// Verify that the widths of local domains differ between ranks (i.e. ORB
+    /// actually produced a non-uniform decomposition along at least one axis).
+    /// Returns true if at least one pair of ranks has different widths.
+    bool isNonUniform() const {
+        auto lDom    = layout->getLocalNDIndex();
+        int myWidth  = lDom[0].length();
+        int maxWidth = 0, minWidth = 0;
+        ippl::Comm->allreduce(myWidth, maxWidth, 1, std::greater<int>{});
+        ippl::Comm->allreduce(myWidth, minWidth, 1, std::less<int>{});
+        return (maxWidth != minWidth);
+    }
+
+    // ---- data members --------------------------------------------------
+    std::array<size_t, Dim> nPoints;
+    std::array<T, Dim> domain;
+    ippl::NDIndex<Dim> gDomain;
+
+    std::shared_ptr<flayout_type> layout;
+    std::shared_ptr<mesh_type> mesh;
+    std::shared_ptr<playout_type> playout_ptr;
+};
+
+// Run over 1-D, 2-D, 3-D with default scalar / exec space
+using Tests = TestParams::tests<1, 2, 3>;
+TYPED_TEST_SUITE(TestParticleUpdateORB, Tests);
+
+// ============================================================
+//  Helper macro: skip test if rank count is below threshold
+// ============================================================
+#define REQUIRE_RANKS(n)                                                  \
+    do {                                                                  \
+        if (ippl::Comm->size() < (n)) {                                   \
+            GTEST_SKIP() << "Test requires at least " << (n) << " ranks"; \
+        }                                                                 \
+    } while (false)
+
+// ============================================================
+//  1. ORB with uniform density – baseline conservation
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, UniformOrbBaselineConservation) {
+    REQUIRE_RANKS(MIN_RANKS);
+
+    bool ok = this->orbUniform();
+    if (!ok)
+        GTEST_SKIP() << "ORB returned planar decomposition – skipping";
+
+    this->rebuildPlayout();
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  2. ORB with Gaussian density – verify non-uniform decomposition
+//     and that particles still migrate correctly
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, GaussianOrbCreatesNonUniformLayout) {
+    REQUIRE_RANKS(PREF_RANKS);
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP() << "ORB returned planar decomposition – skipping";
+
+    // With 4+ ranks and a Gaussian bump the domains should be unequal
+    if (ippl::Comm->size() >= PREF_RANKS) {
+        EXPECT_TRUE(this->isNonUniform()) << "Expected ORB to produce a non-uniform decomposition";
+    }
+
+    this->rebuildPlayout();
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  3. Uniform displacement after ORB – all particles cross at least
+//     one ORB boundary and reach the correct owner
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, UniformDisplacementAfterOrb) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP();
+
+    this->rebuildPlayout();
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    // Shift by 30 % of domain along every axis
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T np         = R_host(i)[d] + T(0.3) * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(np, this->domain[d]);
+            }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  4. Burst migration into the narrow rank after step-function ORB
+//     One rank is very narrow; we push all particles toward it.
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, BurstMigrationIntoNarrowRankAfterStepOrb) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbStepFunction();
+    if (!ok)
+        GTEST_SKIP();
+
+    this->rebuildPlayout();
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    // Move every particle into the first quarter of the domain (the narrow rank)
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        std::mt19937_64 eng(77 + ippl::Comm->rank());
+        std::uniform_real_distribution<T> unif(T(0), T(0.25));
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            R_host(i)[0] = unif(eng) * this->domain[0];
+            for (unsigned d = 1; d < TestFixture::Dim; d++)
+                R_host(i)[d] = T(0.5) * this->domain[d];  // safe middle
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  5. Repeated ORB repartition between particle steps
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, RepeatedOrbRepartitionConservesParticles) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    // Perform 3 ORB repartitions, each time with a different density,
+    // interleaved with particle displacements.
+    constexpr unsigned N = 128;
+
+    bool firstOk = this->orbUniform();
+    if (!firstOk)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, N);
+    const size_t total = this->totalParticles(*bunch);
+
+    // Lambda: displace by fraction f and call update
+    auto displace = [&](T f) {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T np         = R_host(i)[d] + f * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(np, this->domain[d]);
+            }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        bunch->update();
+    };
+
+    displace(T(0.15));
+    EXPECT_EQ(total, this->totalParticles(*bunch));
+
+    // Second ORB (Gaussian)
+    bool ok2 = this->orbGaussian();
+    if (ok2) {
+        this->rebuildPlayout();
+        // Re-attach the playout to the existing bunch
+        bunch = this->makeBunch();
+        this->fillRandom(*bunch, N);
+        displace(T(0.25));
+        EXPECT_EQ(total, this->totalParticles(*bunch));
+    }
+
+    // Third ORB (step-function)
+    bool ok3 = this->orbStepFunction();
+    if (ok3) {
+        this->rebuildPlayout();
+        bunch = this->makeBunch();
+        this->fillRandom(*bunch, N);
+        displace(T(0.1));
+        EXPECT_EQ(total, this->totalParticles(*bunch));
+    }
+}
+
+// ============================================================
+//  6. All particles on rank 0, ORB applied, then update
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, AllParticlesOnRank0AfterOrb) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    unsigned N = ippl::Comm->rank() == 0 ? 256 : 0;
+    auto bunch = this->makeBunch();
+    bunch->create(N);
+
+    if (ippl::Comm->rank() == 0) {
+        std::mt19937_64 eng(12);
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (unsigned i = 0; i < N; ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = unif(eng) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    ASSERT_EQ(before, static_cast<size_t>(N));
+
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  7. Charge conservation through ORB + update cycle
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, ChargeConservedAfterOrbAndUpdate) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    // Assign unique charges
+    {
+        auto Q_host = bunch->Q.getHostMirror();
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            Q_host(i) = static_cast<T>(ippl::Comm->rank() * 100000 + i + 1);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const T chargeBefore = this->totalCharge(*bunch);
+
+    // Displace and update
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T np         = R_host(i)[d] + T(0.35) * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(np, this->domain[d]);
+            }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+    bunch->update();
+
+    const T chargeAfter = this->totalCharge(*bunch);
+    EXPECT_NEAR(static_cast<double>(chargeBefore), static_cast<double>(chargeAfter),
+                1e-6 * std::abs(static_cast<double>(chargeBefore)));
+}
+
+// ============================================================
+//  8. Particle tags survive ORB + update
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, TagsPreservedAfterOrbAndUpdate) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbUniform();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+
+    // Record global original count of unique tags
+    size_t origLocal  = bunch->getLocalNum();
+    size_t origGlobal = 0;
+    ippl::Comm->reduce(origLocal, origGlobal, 1, std::plus<size_t>());
+
+    // Displace
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T np         = R_host(i)[d] + T(0.45) * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(np, this->domain[d]);
+            }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+    bunch->update();
+
+    size_t afterLocal  = bunch->getLocalNum();
+    size_t afterGlobal = 0;
+    ippl::Comm->reduce(afterLocal, afterGlobal, 1, std::plus<size_t>());
+
+    EXPECT_EQ(origGlobal, afterGlobal);
+}
+
+// ============================================================
+//  9. Zero-particle ranks after ORB do not deadlock
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, ZeroParticleRanksAfterOrbDoNotDeadlock) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbStepFunction();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    // Only rank 0 creates particles
+    auto bunch = this->makeBunch();
+    unsigned N = ippl::Comm->rank() == 0 ? 64 : 0;
+    bunch->create(N);
+    if (ippl::Comm->rank() == 0) {
+        std::mt19937_64 eng(55);
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (unsigned i = 0; i < N; ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = unif(eng) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    ASSERT_NO_THROW(bunch->update());
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+// 10. Large-count stress test with Gaussian ORB decomposition
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, LargeParticleCountGaussianOrb) {
+    REQUIRE_RANKS(MIN_RANKS);
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 4096, /*seed=*/98765);
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+// 11. Periodic wrap across ORB boundaries
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, PeriodicWrapAcrossOrbBoundaries) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+
+    // Each rank places particles just beyond the global domain's upper edge
+    // along axis 0 so they wrap back to near 0.
+    bunch->create(4);
+    {
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        std::mt19937_64 eng(33 + ippl::Comm->rank());
+        std::uniform_real_distribution<T> tiny(T(1e-5), T(1e-3));
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            R_host(i)[0] = this->domain[0] + tiny(eng) * this->domain[0];
+            for (unsigned d = 1; d < TestFixture::Dim; d++)
+                R_host(i)[d] = T(0.5) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+// 12. Particle injection between ORB repartitions
+//     Simulates a typical PIC workflow: inject → ORB → move → update → repeat
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, ParticleInjectionBetweenOrbRepartitions) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    constexpr unsigned injectPerCycle = 32;
+    constexpr int cycles              = 3;
+
+    for (int c = 0; c < cycles; ++c) {
+        // Repartition with Gaussian for odd cycles, uniform for even
+        bool ok = (c % 2 == 0) ? this->orbUniform() : this->orbGaussian();
+        if (!ok)
+            continue;
+        this->rebuildPlayout();
+
+        auto bunch = this->makeBunch();
+
+        // Inject on rank 0
+        unsigned N = ippl::Comm->rank() == 0 ? injectPerCycle : 0;
+        bunch->create(N);
+        if (ippl::Comm->rank() == 0) {
+            std::mt19937_64 eng(c * 1000);
+            std::uniform_real_distribution<T> unif(T(0), T(1));
+            auto R_host = bunch->R.getHostMirror();
+            auto Q_host = bunch->Q.getHostMirror();
+            for (unsigned i = 0; i < injectPerCycle; ++i) {
+                for (unsigned d = 0; d < TestFixture::Dim; d++)
+                    R_host(i)[d] = unif(eng) * this->domain[d];
+                Q_host(i) = T(1);
+            }
+            Kokkos::deep_copy(bunch->R.getView(), R_host);
+            Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+        }
+
+        bunch->update();
+
+        auto total_particles = this->totalParticles(*bunch);
+        if (ippl::Comm->rank() == 0) {
+            EXPECT_EQ(static_cast<size_t>(injectPerCycle), total_particles) << "at cycle " << c;
+        }
+        EXPECT_EQ(0u, this->countMisplaced(*bunch)) << "at cycle " << c;
+    }
+}
+
+// ============================================================
+// 13. Step-function ORB: narrow rank correctly receives migrating particles
+//     and does not receive more than the domain capacity
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, NarrowRankReceivesCorrectlyAfterStepOrb) {
+    REQUIRE_RANKS(PREF_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbStepFunction();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 512);
+
+    // Move all particles into the heavy region (second three-quarters)
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        std::mt19937_64 eng(808 + ippl::Comm->rank());
+        std::uniform_real_distribution<T> heavyX(T(0.25), T(1.0));
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            R_host(i)[0] = heavyX(eng) * this->domain[0];
+            for (unsigned d = 1; d < TestFixture::Dim; d++)
+                R_host(i)[d] = T(0.5) * this->domain[d];
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+// 14. Successive displacements across ORB boundaries
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, SuccessiveDisplacementsAcrossOrbBoundaries) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+    const size_t N = this->totalParticles(*bunch);
+
+    std::mt19937_64 eng(2025 + ippl::Comm->rank());
+    std::uniform_real_distribution<T> step(-T(0.2), T(0.2));
+
+    for (int s = 0; s < 6; ++s) {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T np         = R_host(i)[d] + step(eng) * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(np, this->domain[d]);
+            }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+
+        bunch->update();
+        EXPECT_EQ(N, this->totalParticles(*bunch)) << "at step " << s;
+        EXPECT_EQ(0u, this->countMisplaced(*bunch)) << "at step " << s;
+    }
+}
+
+// ============================================================
+// 15. 3-D corner migration with ORB decomposition
+//     (only runs when Dim == 3)
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, ThreeDCornerMigrationAfterOrb) {
+    if constexpr (TestFixture::Dim != 3) {
+        GTEST_SKIP() << "3-D specific test";
+    }
+    REQUIRE_RANKS(PREF_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbGaussian(T(0.2));
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+
+    // Seed all particles near the origin
+    bunch->create(64);
+    {
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            for (unsigned d = 0; d < 3; d++)
+                R_host(i)[d] = T(0.02) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();  // settle near origin
+
+    // Jump diagonally to the opposite corner (wraps periodically)
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            for (unsigned d = 0; d < 3; d++) {
+                T np         = R_host(i)[d] + T(0.98) * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(np, this->domain[d]);
+            }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    bunch->update();
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  Entry point
+// ============================================================
+int main(int argc, char* argv[]) {
+    int success = 1;
+    ippl::initialize(argc, argv);
+    {
+        ::testing::InitGoogleTest(&argc, argv);
+        success = RUN_ALL_TESTS();
+    }
+    ippl::finalize();
+    return success;
+}

From 97828eda6a5cd8c29b8c80ad3f11a501ef8a3b05 Mon Sep 17 00:00:00 2001
From: Fischill Paul <paul.fischill@inf.ethz.ch>
Date: Mon, 1 Jun 2026 10:56:04 +0200
Subject: [PATCH 02/14] Fix HIP archive warnings and particle view bindings

Consume CUDA/HIP runtime return values in Archive so HIP nodiscard annotations do not trigger warnings. Update particle benchmark/test callers to store ParticleAttrib::getView() by value now that it returns a live subview instead of a stable lvalue reference.
---
 src/Communicate/Archive.hpp               | 20 ++++++++++----------
 test/particle/PICnd.cpp                   |  2 +-
 test/particle/benchmarkParticleUpdate.cpp |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/Communicate/Archive.hpp b/src/Communicate/Archive.hpp
index 19e7b2423..bfba0815c 100644
--- a/src/Communicate/Archive.hpp
+++ b/src/Communicate/Archive.hpp
@@ -69,9 +69,9 @@ namespace ippl {
 #endif
             void* ptr = nullptr;
 #if defined(KOKKOS_ENABLE_CUDA)
-            cudaMalloc(&ptr, size);
+            (void)cudaMalloc(&ptr, size);
 #else
-            hipMalloc(&ptr, size);
+            (void)hipMalloc(&ptr, size);
 #endif
             buffer_ptr_m  = static_cast<pointer_type>(ptr);
             buffer_size_m = size;
@@ -81,9 +81,9 @@ namespace ippl {
         void Archive<Properties...>::gpuFree() {
             if (buffer_ptr_m) {
 #if defined(KOKKOS_ENABLE_CUDA)
-                cudaFree(buffer_ptr_m);
+                (void)cudaFree(buffer_ptr_m);
 #else
-                hipFree(buffer_ptr_m);
+                (void)hipFree(buffer_ptr_m);
 #endif
                 buffer_ptr_m  = nullptr;
                 buffer_size_m = 0;
@@ -113,19 +113,19 @@ namespace ippl {
             pointer_type new_ptr = nullptr;
             void* vptr           = nullptr;
 #if defined(KOKKOS_ENABLE_CUDA)
-            cudaMalloc(&vptr, size);
+            (void)cudaMalloc(&vptr, size);
 #else
-            hipMalloc(&vptr, size);
+            (void)hipMalloc(&vptr, size);
 #endif
             new_ptr = static_cast<pointer_type>(vptr);
 
             if (buffer_ptr_m && buffer_size_m > 0) {
 #if defined(KOKKOS_ENABLE_CUDA)
-                cudaMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, cudaMemcpyDeviceToDevice);
-                cudaFree(buffer_ptr_m);
+                (void)cudaMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, cudaMemcpyDeviceToDevice);
+                (void)cudaFree(buffer_ptr_m);
 #else
-                hipMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, hipMemcpyDeviceToDevice);
-                hipFree(buffer_ptr_m);
+                (void)hipMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, hipMemcpyDeviceToDevice);
+                (void)hipFree(buffer_ptr_m);
 #endif
             }
 
diff --git a/test/particle/PICnd.cpp b/test/particle/PICnd.cpp
index c3e6e93ed..03a9bc369 100644
--- a/test/particle/PICnd.cpp
+++ b/test/particle/PICnd.cpp
@@ -265,7 +265,7 @@ class ChargedParticles : public ippl::ParticleBase<PLayout> {
     }
 
     void dumpData(int iteration) {
-        ParticleAttrib<Vector_t>::view_type& view = P.getView();
+        auto view = P.getView();
 
         double Energy = 0.0;
 
diff --git a/test/particle/benchmarkParticleUpdate.cpp b/test/particle/benchmarkParticleUpdate.cpp
index 857e637c1..c4d57700d 100644
--- a/test/particle/benchmarkParticleUpdate.cpp
+++ b/test/particle/benchmarkParticleUpdate.cpp
@@ -94,7 +94,7 @@ class ChargedParticles : public ippl::ParticleBase<PLayout> {
     void dumpData(int iteration) {
         double Energy = 0.0;
 
-        ParticleAttrib<Vector_t>::view_type& view = P.getView();
+        auto view = P.getView();
         Inform csvout(NULL, "data/energy.csv", Inform::APPEND);
         csvout.precision(10);
         csvout.setf(std::ios::scientific, std::ios::floatfield);

From 855b2ef49b2eae403106c9d35084c823c1739a78 Mon Sep 17 00:00:00 2001
From: Fischill Paul <paul.fischill@inf.ethz.ch>
Date: Mon, 1 Jun 2026 16:50:38 +0200
Subject: [PATCH 03/14] Fix ALPINE particle view handle lifetimes

---
 alpine/BumponTailInstabilityManager.h | 10 +++++-----
 alpine/LandauDampingManager.h         |  8 ++++----
 alpine/PenningTrapManager.h           |  8 ++++----
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/alpine/BumponTailInstabilityManager.h b/alpine/BumponTailInstabilityManager.h
index 7525f9af1..5e03a0712 100644
--- a/alpine/BumponTailInstabilityManager.h
+++ b/alpine/BumponTailInstabilityManager.h
@@ -274,10 +274,10 @@ class BumponTailInstabilityManager : public AlpineManager<T, Dim> {
 
         this->pcontainer_m->create(nlocal);
 
-        view_type* R = &(this->pcontainer_m->R.getView());
-        samplingR.generate(*R, rand_pool64);
+        view_type R = this->pcontainer_m->R.getView();
+        samplingR.generate(R, rand_pool64);
 
-        view_type* P = &(this->pcontainer_m->P.getView());
+        view_type P = this->pcontainer_m->P.getView();
 
         double mu[Dim];
         double sd[Dim];
@@ -288,12 +288,12 @@ class BumponTailInstabilityManager : public AlpineManager<T, Dim> {
         // sample first nlocBulk with muBulk as mean velocity
         mu[Dim - 1] = muBulk_m;
         Kokkos::parallel_for(Kokkos::RangePolicy<int>(0, nlocBulk),
-                             ippl::random::randn<double, Dim>(*P, rand_pool64, mu, sd));
+                             ippl::random::randn<double, Dim>(P, rand_pool64, mu, sd));
 
         // sample remaining with muBeam as mean velocity
         mu[Dim - 1] = muBeam_m;
         Kokkos::parallel_for(Kokkos::RangePolicy<int>(nlocBulk, nlocal),
-                             ippl::random::randn<double, Dim>(*P, rand_pool64, mu, sd));
+                             ippl::random::randn<double, Dim>(P, rand_pool64, mu, sd));
 
         Kokkos::fence();
         ippl::Comm->barrier();
diff --git a/alpine/LandauDampingManager.h b/alpine/LandauDampingManager.h
index b8491c7a5..23dcc632a 100644
--- a/alpine/LandauDampingManager.h
+++ b/alpine/LandauDampingManager.h
@@ -226,10 +226,10 @@ class LandauDampingManager : public AlpineManager<T, Dim> {
 
         this->pcontainer_m->create(nlocal);
 
-        view_type* R = &(this->pcontainer_m->R.getView());
-        samplingR.generate(*R, rand_pool64);
+        view_type R = this->pcontainer_m->R.getView();
+        samplingR.generate(R, rand_pool64);
 
-        view_type* P = &(this->pcontainer_m->P.getView());
+        view_type P = this->pcontainer_m->P.getView();
 
         double mu[Dim];
         double sd[Dim];
@@ -237,7 +237,7 @@ class LandauDampingManager : public AlpineManager<T, Dim> {
             mu[i] = 0.0;
             sd[i] = 1.0;
         }
-        Kokkos::parallel_for(nlocal, ippl::random::randn<double, Dim>(*P, rand_pool64, mu, sd));
+        Kokkos::parallel_for(nlocal, ippl::random::randn<double, Dim>(P, rand_pool64, mu, sd));
         Kokkos::fence();
         ippl::Comm->barrier();
 
diff --git a/alpine/PenningTrapManager.h b/alpine/PenningTrapManager.h
index b721f3ec8..c7e81de43 100644
--- a/alpine/PenningTrapManager.h
+++ b/alpine/PenningTrapManager.h
@@ -202,14 +202,14 @@ class PenningTrapManager : public AlpineManager<T, Dim> {
 
         this->pcontainer_m->create(nlocal);
 
-        view_type* R = &(this->pcontainer_m->R.getView());
-        samplingR.generate(*R, rand_pool64);
+        view_type R = this->pcontainer_m->R.getView();
+        samplingR.generate(R, rand_pool64);
 
-        view_type* P = &(this->pcontainer_m->P.getView());
+        view_type P = this->pcontainer_m->P.getView();
 
         double muP[Dim] = {0.0, 0.0, 0.0};
         double sdP[Dim] = {1.0, 1.0, 1.0};
-        Kokkos::parallel_for(nlocal, ippl::random::randn<double, Dim>(*P, rand_pool64, muP, sdP));
+        Kokkos::parallel_for(nlocal, ippl::random::randn<double, Dim>(P, rand_pool64, muP, sdP));
 
         Kokkos::fence();
         ippl::Comm->barrier();

From 816e1a53eabb43cf5ec08c3668b34a14ed7475fa Mon Sep 17 00:00:00 2001
From: Andreas Adelmann <andreas.adelmann@psi.ch>
Date: Tue, 2 Jun 2026 10:52:47 +0200
Subject: [PATCH 04/14] Add particle update diagnostic timers

---
 PR501_SPLIT_MAP.md                     | 177 +++++++++----------------
 src/Particle/ParticleSpatialLayout.hpp |  20 ++-
 2 files changed, 85 insertions(+), 112 deletions(-)

diff --git a/PR501_SPLIT_MAP.md b/PR501_SPLIT_MAP.md
index 90dd59c21..939ccec16 100644
--- a/PR501_SPLIT_MAP.md
+++ b/PR501_SPLIT_MAP.md
@@ -280,137 +280,92 @@ Risk:
 
 - PIF examples currently depend on the full stack: particle update, scatter/gather, FFT transform refactor, NUFFT.
 
-## Minimal Split Alternative
+# LUMI Results 
 
-If six PRs is too much process overhead, a practical minimum is:
+| Benchmark | Problem size | Nodes | Ranks | master | pr501-pcg | pr501-com | pr501-fft | pr501-hosg | pr501-nufft |
+| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
+| FEM | `513_10` | 8 | 64 | 28.23 | 27.77 (-2%) | 27.71 (-2%) | 27.88 (-1%) | 28.23 (0%) | 27.95 (-1%) |
+| FFT | `512_10` | 4 | 32 | 4.39 | 4.41 (0%) | 11.54 (+163%) | 11.53 (+162%) | 11.59 (+164%) | 11.53 (+162%) |
+| FFT | `512_10` | 16 | 128 | 1.65 | 1.65 (0%) | 2.88 (+74%) | 2.88 (+75%) | 2.93 (+78%) | 2.93 (+78%) |
+| PCG | `512_10` | 1 | 8 | 72.31 | 70.01 (-3%) |  | 76.99 (+6%) |  | 76.75 (+6%) |
+| PCG | `512_10` | 4 | 32 | 34.60 | 32.93 (-5%) | 43.71 (+26%) | 33.62 (-3%) | 36.32 (+5%) | 33.55 (-3%) |
+| PCG | `512_10` | 64 | 512 | 25.00 | 23.85 (-5%) | 22.89 (-8%) | 22.40 (-10%) | 22.74 (-9%) | 22.58 (-10%) |
 
-1. PCG allocation fix.
-2. Particle communication/update infrastructure.
-3. Scatter/gather + FFT backend + NUFFT core.
-4. Alpine PIF examples and benchmarks.
 
-This is less clean, but still much better than one 23k-line PR.
+## Update Timer Investigation
 
-## Next Mapping Step
+Observed issue:
 
-Prototype extraction order:
+- On LUMI, the `update` / `updateParticle` timer can increase strongly when moving from `pr501-pcg` to `pr501-communication-particle-update` / `pr501-com`.
+- A first code inspection did not find an FFT solver change in this branch step: `pr501-pcg..pr501-communication-particle-update` has no changes under `src/FFT` or `src/PoissonSolvers`.
+- The branch step does change particle migration, communication archives, field layout / halo `nghost` plumbing, and timing infrastructure.
 
-1. Create a branch from `origin/master` for PCG.
-2. Cherry-pick `40438d17` and `be5f794c`.
-3. Build and run solver tests.
-4. If clean, this becomes the first small PR.
+Most likely immediate explanation:
 
-Then try the particle/communication split because it is probably the most valuable non-PIF improvement and has direct tests:
+- In the communication branch, `ParticleSpatialLayout::update()` posts sends and receives, then performs `MPI_Waitall` plus deferred receive finalization/deserialization inside the outer `updateParticle` timer, but outside the child timers `particleSend`, `particleRecv`, and `sendPreprocess`.
+- This can make `updateParticle` look much larger while the child timers look artificially small.
+- On the LUMI timing data this pattern is visible: for `PCG_strong_scaling/512_10/nodes_64`, `locateParticles`, `particleSend`, and `particleRecv` are small in `pr501-com`, while the unexplained part of `updateParticle` is large.
 
-- `unit_tests/Particle/ParticleUpdate.cpp`
-- `unit_tests/Particle/ParticleUpdateNonuniform.cpp`
-- existing `ParticleSendRecv`
-- existing `GatherScatterTest`
+Secondary suspect:
 
-## Progress
+- `Archive` contiguous scalar serialization/deserialization changed from `Kokkos::deep_copy` on unmanaged byte views to custom byte-copy kernels plus fences.
+- On GPU, especially with GPU-aware MPI, that may be slower or introduce extra synchronization compared with the previous optimized device copy path.
+- This would affect particle migration directly, because `ParticleBase::sendToRank()` serializes every active particle attribute before `MPI_Isend`, and receive finalizers deserialize after the wait.
 
-### 2026-05-31: PR 1 Prototype - PCG Allocation / Solver Performance
+Local Mac OpenMP check:
 
-Status: extracted cleanly.
+- Built clean comparison worktrees for:
+  - `pr501-pcg`
+  - `pr501-communication-particle-update`
+- Configuration:
+  - `IPPL_PLATFORMS=OPENMP`
+  - `IPPL_ENABLE_ALPINE=ON`
+  - `IPPL_ENABLE_UNIT_TESTS=ON`
+  - `IPPL_ENABLE_FFT=ON`
+  - Kokkos `5.0.0`
+  - Homebrew LLVM `clang++` and `libomp`
+- AppleClang did not find OpenMP automatically; explicit `libomp` include/library paths were required.
 
-Worktree:
+Mac OpenMP miniapp runs:
 
-```text
-/private/tmp/ippl-pr501-pcg
-```
-
-Branch:
-
-```text
-pr501-pcg-split
-```
-
-Base:
-
-```text
-origin/master @ da43fd66a18848b65dcd82af90001f5b64a798ab
-```
-
-Cherry-picked commits:
-
-- `40438d17` -> `a76097f0` (`PCG/Preconditioner: hoist per-iteration allocations out of solve loop`)
-- `be5f794c` -> `092a04e5` (`PCG: pass Field by reference through OperatorF`)
-
-Resulting diff:
-
-- `src/LinearSolvers/PCG.h`
-- `src/LinearSolvers/Preconditioner.h`
-- `src/PoissonSolvers/PoissonCG.h`
-- 3 files changed, 388 insertions, 250 deletions
-
-Validation:
+```bash
+mpiexec -x OMP_NUM_THREADS=4 -x OMP_PROC_BIND=false -n 2 \
+  ./LandauDamping 32 32 32 20000 5 FFT 0.01 LeapFrog --overallocate 2.0 --info 5
 
-```text
-cmake -S . -B build-pcg-debug -DCMAKE_BUILD_TYPE=Debug -DIPPL_ENABLE_UNIT_TESTS=ON -DIPPL_ENABLE_FFT=ON -DIPPL_DEFAULT_TEST_PROCS=1
-cmake --build build-pcg-debug
-ctest --test-dir build-pcg-debug --output-on-failure -O build-pcg-debug/ctest-rank1.log
+mpiexec -x OMP_NUM_THREADS=2 -x OMP_PROC_BIND=false -n 4 \
+  ./LandauDamping 32 32 32 20000 5 FFT 0.01 LeapFrog --overallocate 2.0 --info 5
 ```
 
-Result:
+Mac OpenMP wall-max timings:
 
-```text
-100% tests passed, 0 tests failed out of 34
-Total Test time (real) = 54.17 sec
-```
-
-Assessment:
-
-- This is a good standalone first PR.
-- It has a narrow file set and clear motivation: remove repeated allocation work from the PCG/preconditioner loop.
-- Recommended next validation before opening: run the relevant solver integration tests or CSCS GPU tests that originally showed PCG allocation/slowdown symptoms.
-
-### 2026-05-31: PR 2 Prototype - Communication And Particle Update Infrastructure
-
-Status: extracted on top of `pr501-pcg`.
-
-Branch:
-
-```text
-pr501-communication-particle-update
-```
+| Ranks | Branch | `update` | `updateParticle` | `locateParticles` | `sendPreprocess` | `particleSend` | `particleRecv` |
+| ---: | --- | ---: | ---: | ---: | ---: | ---: | ---: |
+| 2 | `pr501-pcg` | 0.0911 | 0.1090 | 0.0906 | 0.0187 | 0.00431 | 0.00451 |
+| 2 | `pr501-com` | 0.0250 | 0.0266 | 0.00577 | 0.000863 | 0.00150 | 0.000064 |
+| 4 | `pr501-pcg` | 0.1550 | 0.1923 | 0.1303 | 0.0391 | 0.0180 | 0.00923 |
+| 4 | `pr501-com` | 0.0495 | 0.0534 | 0.00872 | 0.00677 | 0.00291 | 0.000131 |
 
-Included PR501 areas:
+Conclusion so far:
 
-- Communication buffer/archive updates from `9507a796` and `f2820064`
-- Particle layout rewrite and sort buffers from `2878b90b`
-- Particle update regression tests from the particle portion of `f0560f76`
-- Local integration fixes needed to keep this split independent from the later interpolation/FFT/NUFFT PRs
+- The Mac OpenMP run does not reproduce the LUMI slowdown. The communication branch is faster locally for this small CPU/OpenMP test.
+- That points away from a generic algorithmic slowdown in the particle update rewrite.
+- The remaining likely causes are GPU/MPI-specific behavior on LUMI: GPU-aware MPI wait behavior, archive device-buffer serialization/deserialization, or timer attribution around deferred receive finalization.
 
-Deliberately excluded:
+Recommended next diagnostic patch:
 
-- Higher-order interpolation/gather/scatter APIs
-- FFT backend and NUFFT transform integration
-- PIF example/application changes
-
-Validation:
-
-```text
-cmake -S . -B build-pr501-communication-debug -DCMAKE_BUILD_TYPE=Debug -DIPPL_ENABLE_UNIT_TESTS=ON -DIPPL_ENABLE_FFT=ON -DIPPL_DEFAULT_TEST_PROCS=1
-cmake --build build-pr501-communication-debug --parallel 8
-ctest --test-dir build-pr501-communication-debug --output-on-failure
-/opt/homebrew/Cellar/open-mpi/5.0.8/bin/mpiexec -n 2 build-pr501-communication-debug/unit_tests/Particle/ParticleSendRecv
-/opt/homebrew/Cellar/open-mpi/5.0.8/bin/mpiexec -n 2 build-pr501-communication-debug/unit_tests/Particle/ParticleUpdate
-/opt/homebrew/Cellar/open-mpi/5.0.8/bin/mpiexec -n 2 build-pr501-communication-debug/unit_tests/Particle/ParticleUpdateNonuniform
-```
-
-Result:
-
-```text
-100% tests passed, 0 tests failed out of 36
-Total Test time (real) = 38.64 sec
-
-2-rank ParticleSendRecv: passed
-2-rank ParticleUpdate: passed
-2-rank ParticleUpdateNonuniform: passed
-```
+- Add or move timers around:
+  - `MPI_Waitall` as `particleWait`, or charge it back into `particleSend`.
+  - deferred receive finalizers as `particleDeserialize`, or charge them into `particleRecv`.
+  - optionally `Archive::serialize(hash)` and `Archive::deserialize(offset)` if the wait/deserialization split is still unclear.
+- Re-run the LUMI cases before changing algorithms, so the slowdown is attributed to wait time, packing/unpacking, or actual communication.
 
-Assessment:
+Diagnostic patch applied locally:
 
-- This is a coherent second PR after `pr501-pcg`.
-- It is still larger than the PCG split, but it avoids the major algorithmic dependencies from interpolation, FFT, NUFFT, and PIF.
-- Recommended next validation before opening: run the same particle tests on CUDA/HIP builds and larger MPI rank counts, especially the original `ParticleSendRecv` failure configurations.
+- `ParticleSpatialLayout::update()` now splits the previously hidden tail of `updateParticle` into:
+  - `particleWait`: combined send/receive `MPI_Waitall`
+  - `particleFreeBuffers`: communicator buffer release
+  - `particleDeserialize`: deferred receive finalizers / unpacking into particle attributes
+- Expected interpretation on LUMI:
+  - If `particleWait` dominates, the issue is likely GPU-aware MPI progress / synchronization or a load imbalance exposed by the nonblocking exchange.
+  - If `particleDeserialize` dominates, inspect `Archive::deserialize(offset)` and the custom byte-copy kernels.
+  - If neither dominates but `updateParticle` remains high, there is still another untimed section in the particle update path.
diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp
index c2bc0093c..8b927846e 100644
--- a/src/Particle/ParticleSpatialLayout.hpp
+++ b/src/Particle/ParticleSpatialLayout.hpp
@@ -257,15 +257,33 @@ namespace ippl {
 
         IpplTimings::stopTimer(destroyTimer);
 
+        static IpplTimings::TimerRef waitTimer = IpplTimings::getTimer("particleWait");
+        IpplTimings::startTimer(waitTimer);
+
         requests.insert(requests.end(), recvRequests.begin(), recvRequests.end());
         if (!requests.empty()) {
             MPI_Waitall(static_cast<int>(requests.size()), requests.data(), MPI_STATUSES_IGNORE);
         }
+
+        IpplTimings::stopTimer(waitTimer);
+
+        static IpplTimings::TimerRef freeBufferTimer = IpplTimings::getTimer("particleFreeBuffers");
+        IpplTimings::startTimer(freeBufferTimer);
+
         Comm->freeAllBuffers();
 
+        IpplTimings::stopTimer(freeBufferTimer);
+
         // 5. Deserialize
-        for (auto& finalize : finalizers)
+        static IpplTimings::TimerRef deserializeTimer =
+            IpplTimings::getTimer("particleDeserialize");
+        IpplTimings::startTimer(deserializeTimer);
+
+        for (auto& finalize : finalizers) {
             finalize(pc.getLocalNum());
+        }
+
+        IpplTimings::stopTimer(deserializeTimer);
 
         IpplTimings::stopTimer(ParticleUpdateTimer);
     }

From 6741b4ef3f3eb825bf1f3a22b4109d5b6df840fa Mon Sep 17 00:00:00 2001
From: Andreas Adelmann <andreas.adelmann@psi.ch>
Date: Tue, 2 Jun 2026 11:08:07 +0200
Subject: [PATCH 05/14] Add particle update load diagnostics

---
 src/Particle/ParticleSpatialLayout.hpp | 55 ++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp
index 8b927846e..d9441af25 100644
--- a/src/Particle/ParticleSpatialLayout.hpp
+++ b/src/Particle/ParticleSpatialLayout.hpp
@@ -23,6 +23,8 @@
 #include <numeric>
 #include <vector>
 
+#include "Ippl.h"
+
 #include "Utility/IpplTimings.h"
 #include "Utility/ParallelDispatch.h"
 
@@ -128,6 +130,7 @@ namespace ippl {
         }
 
         ensureNeighborsCached();
+        const size_type localBefore = pc.getLocalNum();
 
         /* particle MPI exchange:
          *   1. figure out which particles need to go where -> locateParticles(...)
@@ -213,11 +216,15 @@ namespace ippl {
         IpplTimings::startTimer(recvTimer);
 
         std::vector<std::pair<int, size_type>> recvList;
+        size_type totalRecvs = 0;
+        size_type maxRecv    = 0;
 
         if (countExchangeMode_ == CountExchange::RMA) {
             for (int rank = 0; rank < nRanks_; ++rank) {
                 if (nRecvs_m[rank] > 0) {
                     recvList.push_back({rank, nRecvs_m[rank]});
+                    totalRecvs += nRecvs_m[rank];
+                    maxRecv = std::max(maxRecv, nRecvs_m[rank]);
                 }
             }
         } else {
@@ -226,6 +233,8 @@ namespace ippl {
             for (int rank = 0; rank < nRanks_; ++rank) {
                 if (recvCounts_h(rank) > 0) {
                     recvList.push_back({rank, recvCounts_h(rank)});
+                    totalRecvs += static_cast<size_type>(recvCounts_h(rank));
+                    maxRecv = std::max(maxRecv, static_cast<size_type>(recvCounts_h(rank)));
                 }
             }
         }
@@ -254,6 +263,7 @@ namespace ippl {
         pc.template internalDestroy<position_memory_space, position_execution_space>(
             KOKKOS_LAMBDA(size_t i) { return leaving(i); }, nInvalid);
         Kokkos::fence();
+        const size_type localAfterDestroy = pc.getLocalNum();
 
         IpplTimings::stopTimer(destroyTimer);
 
@@ -275,6 +285,51 @@ namespace ippl {
         IpplTimings::stopTimer(freeBufferTimer);
 
         // 5. Deserialize
+        if (Info && Info->getOutputLevel() >= 5) {
+            const size_type recvSources = static_cast<size_type>(recvList.size());
+            size_type minRecvs          = totalRecvs;
+            size_type maxRecvs          = totalRecvs;
+            size_type sumRecvs          = totalRecvs;
+            size_type minSources        = recvSources;
+            size_type maxSources        = recvSources;
+            size_type sumSources        = recvSources;
+            size_type minMaxRecv        = maxRecv;
+            size_type maxMaxRecv        = maxRecv;
+            size_type minInvalid        = nInvalid;
+            size_type maxInvalid        = nInvalid;
+            size_type sumInvalid        = nInvalid;
+
+            Comm->allreduce(minRecvs, 1, std::less<size_type>());
+            Comm->allreduce(maxRecvs, 1, std::greater<size_type>());
+            Comm->allreduce(sumRecvs, 1, std::plus<size_type>());
+            Comm->allreduce(minSources, 1, std::less<size_type>());
+            Comm->allreduce(maxSources, 1, std::greater<size_type>());
+            Comm->allreduce(sumSources, 1, std::plus<size_type>());
+            Comm->allreduce(minMaxRecv, 1, std::less<size_type>());
+            Comm->allreduce(maxMaxRecv, 1, std::greater<size_type>());
+            Comm->allreduce(minInvalid, 1, std::less<size_type>());
+            Comm->allreduce(maxInvalid, 1, std::greater<size_type>());
+            Comm->allreduce(sumInvalid, 1, std::plus<size_type>());
+
+            Inform localDiag("ParticleUpdateDiag", INFORM_ALL_NODES);
+            localDiag << level5 << "rank=" << Comm->rank() << " localBefore=" << localBefore
+                      << " nInvalid=" << nInvalid << " localAfterDestroy=" << localAfterDestroy
+                      << " recvSources=" << recvSources << " totalRecvs=" << totalRecvs
+                      << " maxRecv=" << maxRecv
+                      << " localAfterRecv=" << localAfterDestroy + totalRecvs << endl;
+
+            Inform summaryDiag("ParticleUpdateDiag");
+            const double ranks = static_cast<double>(nRanks_);
+            summaryDiag << level5 << "summary"
+                        << " recvs[min,max,avg]=" << minRecvs << "," << maxRecvs << ","
+                        << static_cast<double>(sumRecvs) / ranks
+                        << " recvSources[min,max,avg]=" << minSources << "," << maxSources << ","
+                        << static_cast<double>(sumSources) / ranks
+                        << " maxRecv[min,max]=" << minMaxRecv << "," << maxMaxRecv
+                        << " invalid[min,max,avg]=" << minInvalid << "," << maxInvalid << ","
+                        << static_cast<double>(sumInvalid) / ranks << endl;
+        }
+
         static IpplTimings::TimerRef deserializeTimer =
             IpplTimings::getTimer("particleDeserialize");
         IpplTimings::startTimer(deserializeTimer);

From ed6b926f7140825d8916cccbeb126de0ac97b83e Mon Sep 17 00:00:00 2001
From: Andreas Adelmann <andreas.adelmann@psi.ch>
Date: Tue, 2 Jun 2026 11:24:28 +0200
Subject: [PATCH 06/14] Pre-reserve particle receive storage

---
 src/Particle/ParticleAttrib.h          |  4 +++-
 src/Particle/ParticleAttrib.hpp        |  7 +++++++
 src/Particle/ParticleAttribBase.h      |  4 ++++
 src/Particle/ParticleSpatialLayout.hpp | 19 +++++++++++++++++++
 4 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h
index 7a720475f..bbdce337a 100644
--- a/src/Particle/ParticleAttrib.h
+++ b/src/Particle/ParticleAttrib.h
@@ -62,6 +62,8 @@ namespace ippl {
         // discarded.
         void alloc(size_type) override;
 
+        void reserve(size_type) override;
+
         /*!
          * Particle deletion function. Partition the particles into a valid region
          * and an invalid region.
@@ -91,7 +93,7 @@ namespace ippl {
 
         void deserialize(detail::Archive<memory_space>& ar, size_type offset,
                          size_type nrecvs) override {
-            this->resize(offset + nrecvs);
+            this->reserve(offset + nrecvs);
             ar.deserialize(dview_m, offset, nrecvs);
         }
 
diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp
index 955f83db5..95a4a2321 100644
--- a/src/Particle/ParticleAttrib.hpp
+++ b/src/Particle/ParticleAttrib.hpp
@@ -49,6 +49,13 @@ namespace ippl {
         this->realloc(n * overalloc);
     }
 
+    template <typename T, class... Properties>
+    void ParticleAttrib<T, Properties...>::reserve(size_type n) {
+        if (this->size() < n) {
+            this->resize(n);
+        }
+    }
+
     template <typename T, class... Properties>
     void ParticleAttrib<T, Properties...>::destroy(const hash_type& deleteIndex,
                                                    const hash_type& keepIndex,
diff --git a/src/Particle/ParticleAttribBase.h b/src/Particle/ParticleAttribBase.h
index 4526f5dbe..3e9baa197 100644
--- a/src/Particle/ParticleAttribBase.h
+++ b/src/Particle/ParticleAttribBase.h
@@ -66,6 +66,10 @@ namespace ippl {
             // prior entries survive a capacity grow.
             virtual void create(size_type, bool non_destructive = false) = 0;
 
+            // Grow internal capacity to at least N particles while preserving existing
+            // entries. Does not shrink and does not touch the logical particle count.
+            virtual void reserve(size_type) = 0;
+
             virtual void destroy(const hash_type&, const hash_type&, size_type) = 0;
             virtual size_type packedSize(const size_type) const                 = 0;
 
diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp
index d9441af25..6732b781a 100644
--- a/src/Particle/ParticleSpatialLayout.hpp
+++ b/src/Particle/ParticleSpatialLayout.hpp
@@ -19,6 +19,7 @@
 //   frequency of load balancing (N), or may supply a function to
 //   determine if load balancing should be done or not.
 //
+#include <algorithm>
 #include <memory>
 #include <numeric>
 #include <vector>
@@ -334,10 +335,28 @@ namespace ippl {
             IpplTimings::getTimer("particleDeserialize");
         IpplTimings::startTimer(deserializeTimer);
 
+        static IpplTimings::TimerRef deserializeResizeTimer =
+            IpplTimings::getTimer("particleDeserializeResize");
+        IpplTimings::startTimer(deserializeResizeTimer);
+
+        if (totalRecvs > 0) {
+            const size_type receiveCapacity = localAfterDestroy + totalRecvs;
+            pc.forAllAttributes([&]<typename Attribute>(Attribute*& attribute) {
+                attribute->reserve(receiveCapacity);
+            });
+        }
+
+        IpplTimings::stopTimer(deserializeResizeTimer);
+
+        static IpplTimings::TimerRef deserializeCopyTimer =
+            IpplTimings::getTimer("particleDeserializeCopy");
+        IpplTimings::startTimer(deserializeCopyTimer);
+
         for (auto& finalize : finalizers) {
             finalize(pc.getLocalNum());
         }
 
+        IpplTimings::stopTimer(deserializeCopyTimer);
         IpplTimings::stopTimer(deserializeTimer);
 
         IpplTimings::stopTimer(ParticleUpdateTimer);

From 484c5e2473dc412c071b1b633708a8a5c0601eac Mon Sep 17 00:00:00 2001
From: Andreas Adelmann <andreas.adelmann@psi.ch>
Date: Tue, 2 Jun 2026 12:03:47 +0200
Subject: [PATCH 07/14] Clean particle deserialize diagnostics

---
 src/Particle/ParticleSpatialLayout.hpp | 53 +-------------------------
 1 file changed, 2 insertions(+), 51 deletions(-)

diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp
index 6732b781a..e0590974c 100644
--- a/src/Particle/ParticleSpatialLayout.hpp
+++ b/src/Particle/ParticleSpatialLayout.hpp
@@ -131,7 +131,6 @@ namespace ippl {
         }
 
         ensureNeighborsCached();
-        const size_type localBefore = pc.getLocalNum();
 
         /* particle MPI exchange:
          *   1. figure out which particles need to go where -> locateParticles(...)
@@ -218,14 +217,12 @@ namespace ippl {
 
         std::vector<std::pair<int, size_type>> recvList;
         size_type totalRecvs = 0;
-        size_type maxRecv    = 0;
 
         if (countExchangeMode_ == CountExchange::RMA) {
             for (int rank = 0; rank < nRanks_; ++rank) {
                 if (nRecvs_m[rank] > 0) {
                     recvList.push_back({rank, nRecvs_m[rank]});
                     totalRecvs += nRecvs_m[rank];
-                    maxRecv = std::max(maxRecv, nRecvs_m[rank]);
                 }
             }
         } else {
@@ -235,7 +232,6 @@ namespace ippl {
                 if (recvCounts_h(rank) > 0) {
                     recvList.push_back({rank, recvCounts_h(rank)});
                     totalRecvs += static_cast<size_type>(recvCounts_h(rank));
-                    maxRecv = std::max(maxRecv, static_cast<size_type>(recvCounts_h(rank)));
                 }
             }
         }
@@ -286,57 +282,12 @@ namespace ippl {
         IpplTimings::stopTimer(freeBufferTimer);
 
         // 5. Deserialize
-        if (Info && Info->getOutputLevel() >= 5) {
-            const size_type recvSources = static_cast<size_type>(recvList.size());
-            size_type minRecvs          = totalRecvs;
-            size_type maxRecvs          = totalRecvs;
-            size_type sumRecvs          = totalRecvs;
-            size_type minSources        = recvSources;
-            size_type maxSources        = recvSources;
-            size_type sumSources        = recvSources;
-            size_type minMaxRecv        = maxRecv;
-            size_type maxMaxRecv        = maxRecv;
-            size_type minInvalid        = nInvalid;
-            size_type maxInvalid        = nInvalid;
-            size_type sumInvalid        = nInvalid;
-
-            Comm->allreduce(minRecvs, 1, std::less<size_type>());
-            Comm->allreduce(maxRecvs, 1, std::greater<size_type>());
-            Comm->allreduce(sumRecvs, 1, std::plus<size_type>());
-            Comm->allreduce(minSources, 1, std::less<size_type>());
-            Comm->allreduce(maxSources, 1, std::greater<size_type>());
-            Comm->allreduce(sumSources, 1, std::plus<size_type>());
-            Comm->allreduce(minMaxRecv, 1, std::less<size_type>());
-            Comm->allreduce(maxMaxRecv, 1, std::greater<size_type>());
-            Comm->allreduce(minInvalid, 1, std::less<size_type>());
-            Comm->allreduce(maxInvalid, 1, std::greater<size_type>());
-            Comm->allreduce(sumInvalid, 1, std::plus<size_type>());
-
-            Inform localDiag("ParticleUpdateDiag", INFORM_ALL_NODES);
-            localDiag << level5 << "rank=" << Comm->rank() << " localBefore=" << localBefore
-                      << " nInvalid=" << nInvalid << " localAfterDestroy=" << localAfterDestroy
-                      << " recvSources=" << recvSources << " totalRecvs=" << totalRecvs
-                      << " maxRecv=" << maxRecv
-                      << " localAfterRecv=" << localAfterDestroy + totalRecvs << endl;
-
-            Inform summaryDiag("ParticleUpdateDiag");
-            const double ranks = static_cast<double>(nRanks_);
-            summaryDiag << level5 << "summary"
-                        << " recvs[min,max,avg]=" << minRecvs << "," << maxRecvs << ","
-                        << static_cast<double>(sumRecvs) / ranks
-                        << " recvSources[min,max,avg]=" << minSources << "," << maxSources << ","
-                        << static_cast<double>(sumSources) / ranks
-                        << " maxRecv[min,max]=" << minMaxRecv << "," << maxMaxRecv
-                        << " invalid[min,max,avg]=" << minInvalid << "," << maxInvalid << ","
-                        << static_cast<double>(sumInvalid) / ranks << endl;
-        }
-
         static IpplTimings::TimerRef deserializeTimer =
             IpplTimings::getTimer("particleDeserialize");
         IpplTimings::startTimer(deserializeTimer);
 
         static IpplTimings::TimerRef deserializeResizeTimer =
-            IpplTimings::getTimer("particleDeserializeResize");
+            IpplTimings::getTimer("particleDeserResize");
         IpplTimings::startTimer(deserializeResizeTimer);
 
         if (totalRecvs > 0) {
@@ -349,7 +300,7 @@ namespace ippl {
         IpplTimings::stopTimer(deserializeResizeTimer);
 
         static IpplTimings::TimerRef deserializeCopyTimer =
-            IpplTimings::getTimer("particleDeserializeCopy");
+            IpplTimings::getTimer("particleDeserCopy");
         IpplTimings::startTimer(deserializeCopyTimer);
 
         for (auto& finalize : finalizers) {

From b800e3f377c503650ae68e93cd1c5558138c9dd3 Mon Sep 17 00:00:00 2001
From: Andreas Adelmann <andreas.adelmann@psi.ch>
Date: Tue, 2 Jun 2026 12:10:45 +0200
Subject: [PATCH 08/14] Document particle receive pre-reserve fix

---
 PR501_SPLIT_MAP.md | 49 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/PR501_SPLIT_MAP.md b/PR501_SPLIT_MAP.md
index 939ccec16..aa893c3a3 100644
--- a/PR501_SPLIT_MAP.md
+++ b/PR501_SPLIT_MAP.md
@@ -345,7 +345,7 @@ Mac OpenMP wall-max timings:
 | 4 | `pr501-pcg` | 0.1550 | 0.1923 | 0.1303 | 0.0391 | 0.0180 | 0.00923 |
 | 4 | `pr501-com` | 0.0495 | 0.0534 | 0.00872 | 0.00677 | 0.00291 | 0.000131 |
 
-Conclusion so far:
+Initial conclusion:
 
 - The Mac OpenMP run does not reproduce the LUMI slowdown. The communication branch is faster locally for this small CPU/OpenMP test.
 - That points away from a generic algorithmic slowdown in the particle update rewrite.
@@ -369,3 +369,50 @@ Diagnostic patch applied locally:
   - If `particleWait` dominates, the issue is likely GPU-aware MPI progress / synchronization or a load imbalance exposed by the nonblocking exchange.
   - If `particleDeserialize` dominates, inspect `Archive::deserialize(offset)` and the custom byte-copy kernels.
   - If neither dominates but `updateParticle` remains high, there is still another untimed section in the particle update path.
+
+### Receive-Side Pre-Reserve Fix
+
+Root cause:
+
+- The LUMI runs showed that the slowdown was not in `MPI_Waitall` and not in communicator buffer release.
+- The hidden cost was almost entirely in deferred receive finalization, reported by the new `particleDeserialize` timer.
+- Each receive finalizer called `ParticleAttrib::deserialize(offset, nrecvs)`.
+- That path called `ParticleAttrib::resize(offset + nrecvs)` for every source rank and every attribute.
+- `Kokkos::resize` preserves existing entries when growing. On GPU this repeatedly copied already-live particle storage during one update step.
+- The cost was amplified when a rank received particles from many source ranks, even when the actual receive-count imbalance was modest.
+
+Fix:
+
+- Add a grow-only `ParticleAttrib::reserve(size_type)` operation.
+- Before running deferred receive finalizers, compute the final receive capacity once:
+  `localAfterDestroy + totalRecvs`.
+- Reserve every particle attribute once to that final capacity.
+- Change receive-side `ParticleAttrib::deserialize(offset, nrecvs)` to call `reserve(offset + nrecvs)` instead of unconditional `resize`.
+- Keep `particleDeserialize` split into:
+  - `particleDeserResize`: one-time attribute reserve before finalizers
+  - `particleDeserCopy`: archive unpack / copy into particle attributes
+
+LUMI validation:
+
+| Ranks | Metric | Before pre-reserve | After pre-reserve | Improvement |
+| ---: | --- | ---: | ---: | ---: |
+| 32 | `updateParticle` wall max | 8.08981 | 0.338958 | 23.9x |
+| 32 | `particleDeserialize` wall max | 7.88320 | 0.0106297 | 741x |
+| 128 | `updateParticle` wall max | 1.59376 | 0.214328 | 7.4x |
+| 128 | `particleDeserialize` wall max | 1.40305 | 0.0132888 | 106x |
+
+Post-fix timer split:
+
+| Ranks | `particleDeserialize` wall max | `particleDeserResize` wall max | `particleDeserCopy` wall max |
+| ---: | ---: | ---: | ---: |
+| 32 | 0.0106297 | 0.0000129 | 0.0105913 |
+| 128 | 0.0132888 | 0.0000196 | 0.0132491 |
+
+Interpretation:
+
+- The repeated preserving resize was the dominant regression.
+- After pre-reserving, deserialize time is small and almost entirely the actual archive copy.
+- `updateParticle` is again balanced across ranks in the 128-rank run:
+  before `max/avg/min = 1.594 / 0.864 / 0.811`,
+  after `max/avg/min = 0.214 / 0.211 / 0.207`.
+- This fix belongs in PR 2 because it is specific to the communication/particle-update receive path and independent of higher-order scatter/gather, FFT, NUFFT, and PIF.

From ecb4edb238d648e3d82cde2a45ef305a3f4cd0a6 Mon Sep 17 00:00:00 2001
From: Andreas Adelmann <andreas.adelmann@psi.ch>
Date: Tue, 16 Jun 2026 21:52:54 +0200
Subject: [PATCH 09/14] remouve douplicate

---
 src/LinearSolvers/PCG.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/LinearSolvers/PCG.h b/src/LinearSolvers/PCG.h
index 74d33c5c9..d2c826f28 100644
--- a/src/LinearSolvers/PCG.h
+++ b/src/LinearSolvers/PCG.h
@@ -336,13 +336,6 @@ namespace ippl {
                  FieldRHS>()
             , preconditioner_m(nullptr) {};
 
-        void initializeFields(mesh_type& mesh, layout_type& layout) override {
-            CG<OperatorRet, LowerRet, UpperRet, UpperLowerRet, InverseDiagRet, DiagRet, FieldLHS,
-               FieldRHS>::initializeFields(mesh, layout);
-            s.initialize(mesh, layout);
-            pcond_out.initialize(mesh, layout);
-        }
-
         /*
          * Extends the CG workspace with the preconditioner result buffers s and
          * pcond_out so operator() does not allocate per solve. pcond_out keeps

From 72f6e8c3ba5267a29cd56b652e59cf551b76ddb9 Mon Sep 17 00:00:00 2001
From: Andreas Adelmann <andreas.adelmann@psi.ch>
Date: Tue, 16 Jun 2026 22:17:02 +0200
Subject: [PATCH 10/14] Fix ParticleSendRecv host mirror extent after migration

ParticleSendRecv timed out under CTest with two MPI ranks, but the
timeout was only a secondary symptom. One rank threw before the final
MPI barrier with:

  Kokkos::deep_copy extents of views don't match:
  ParticleAttrib::dview_mirror(64) ParticleAttrib::dview(56)

and the peer rank then waited until CTest killed the test.

The failing path created a host mirror for expectedRank after
bunch->update(), then resized that mirror to expectedRank.size() before
copying from expectedRank.getView(). On this branch, particle attributes
separate capacity from the live particle count: size() reports the
backing capacity, while getView() is trimmed to the live particle range.
After migration those values can differ across ranks.

Remove the explicit resize to expectedRank.size(). getHostMirror()
already returns a mirror compatible with the live view, so the deep_copy
now uses matching extents after particle migration.

Validated with:

  cmake --build build --target ParticleSendRecv -j 8
  ctest --test-dir build -R ParticleSendRecv --output-on-failure -V

The CTest entry runs ParticleSendRecv with mpiexec -n 2 and now passes
all 12 typed cases.
---
 unit_tests/Particle/ParticleSendRecv.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/unit_tests/Particle/ParticleSendRecv.cpp b/unit_tests/Particle/ParticleSendRecv.cpp
index 28e8ee0ca..1d84cad41 100644
--- a/unit_tests/Particle/ParticleSendRecv.cpp
+++ b/unit_tests/Particle/ParticleSendRecv.cpp
@@ -149,7 +149,6 @@ TYPED_TEST(ParticleSendRecv, SendAndRecieve) {
     typename TestFixture::rank_type::view_type::host_mirror_type ER_host =
         bunch->expectedRank.getHostMirror();
 
-    Kokkos::resize(ER_host, bunch->expectedRank.size());
     Kokkos::deep_copy(ER_host, bunch->expectedRank.getView());
 
     for (size_t i = 0; i < bunch->getLocalNum(); ++i) {

From 62946ada1e3a685c85dcfa3af6f9cad2ffc81be8 Mon Sep 17 00:00:00 2001
From: Andreas Adelmann <andreas.adelmann@psi.ch>
Date: Tue, 16 Jun 2026 22:38:36 +0200
Subject: [PATCH 11/14] Fix benchmarkParticleUpdate host mirror resize after
 migration

benchmarkParticleUpdate failed under the two-rank CTest configuration with
a Kokkos deep_copy extent mismatch:

  ParticleAttrib::dview_mirror(5000) ParticleAttrib::dview(4921)

This is the same capacity-vs-live-count issue fixed for ParticleSendRecv.
After particle migration, ParticleAttrib::size() reports the backing
capacity, while getView() exposes only the live particle range. The
benchmark resized the host mirrors for P and R to P->P.size() and
P->R.size(), then copied from P->P.getView() / P->R.getView(). When the
local particle count differed from the reserved capacity, the destination
mirror and source view extents no longer matched.

Resize the host mirrors to P->getLocalNum() instead. This matches the
live extent used by getView() and keeps the benchmark valid after uneven
particle migration across ranks.

Validated with:

  cmake --build build --target benchmarkParticleUpdate -j 8
  ctest --test-dir build -R benchmarkParticleUpdate --output-on-failure -V
  ctest --test-dir build -R 'ParticleSendRecv|benchmarkParticleUpdate' --output-on-failure

Both ParticleSendRecv and benchmarkParticleUpdate pass with the configured
two-rank MPI launch.
---
 test/particle/benchmarkParticleUpdate.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/particle/benchmarkParticleUpdate.cpp b/test/particle/benchmarkParticleUpdate.cpp
index c4d57700d..e1a654d99 100644
--- a/test/particle/benchmarkParticleUpdate.cpp
+++ b/test/particle/benchmarkParticleUpdate.cpp
@@ -223,9 +223,9 @@ int main(int argc, char* argv[]) {
             IpplTimings::startTimer(RandPTimer);
             std::mt19937_64 engP;
             engP.seed(42 + 10 * it + 100 * ippl::Comm->rank());
-            Kokkos::resize(P_host, P->P.size());
+            Kokkos::resize(P_host, P->getLocalNum());
             double sum_coord = 0.0;
-            Kokkos::resize(R_host, P->R.size());
+            Kokkos::resize(R_host, P->getLocalNum());
             Kokkos::deep_copy(R_host, P->R.getView());
             for (unsigned long int i = 0; i < P->getLocalNum(); i++) {
                 for (int d = 0; d < 3; d++) {

From a13c1a451de48a95a47f927f36b78f33b611144e Mon Sep 17 00:00:00 2001
From: Andreas Adelmann <andreas.adelmann@psi.ch>
Date: Tue, 16 Jun 2026 23:15:36 +0200
Subject: [PATCH 12/14] add doc

---
 PR501_SPLIT_MAP.md          | 418 ------------------------------------
 src/Communicate/Archive.hpp |   8 +
 2 files changed, 8 insertions(+), 418 deletions(-)
 delete mode 100644 PR501_SPLIT_MAP.md

diff --git a/PR501_SPLIT_MAP.md b/PR501_SPLIT_MAP.md
deleted file mode 100644
index aa893c3a3..000000000
--- a/PR501_SPLIT_MAP.md
+++ /dev/null
@@ -1,418 +0,0 @@
-# PR 501 Split Map
-
-Source PR: https://github.com/IPPL-framework/ippl/pull/501
-
-Local mapping branch: `pr-501-map`
-
-Compared against: `origin/master` at `da43fd66a18848b65dcd82af90001f5b64a798ab`
-
-Overall size:
-
-- 41 commits
-- 149 files changed
-- 23,231 insertions
-- 2,205 deletions
-
-## Subsystem Size
-
-| Subsystem | Files | Additions | Deletions |
-| --- | ---: | ---: | ---: |
-| Interpolation / Autotune | 33 | 8,138 | 20 |
-| FFT / NUFFT | 24 | 6,571 | 814 |
-| Particle | 21 | 3,841 | 757 |
-| Alpine / PIF examples | 12 | 1,923 | 37 |
-| Utility | 11 | 1,424 | 45 |
-| Communicate | 14 | 626 | 156 |
-| Solvers | 6 | 394 | 253 |
-| Field / Halo / Layout | 11 | 116 | 87 |
-| Build / CMake | 4 | 117 | 12 |
-| Types | 2 | 35 | 4 |
-| FEM | 3 | 12 | 13 |
-| Other | 8 | 34 | 7 |
-
-The review load is dominated by three areas:
-
-- Interpolation / Autotune: new scatter/gather dispatch, binning, tiling, tuning cache.
-- FFT / NUFFT: backend/transform refactor plus native NUFFT and pruned transforms.
-- Particle: new spatial update path, send/recv serialization hooks, sorting/buffer reuse.
-
-## Commit Shape
-
-The branch starts with well-labeled subsystem commits:
-
-| Commit | Area | Notes |
-| --- | --- | --- |
-| `7cff674a` | Build / dependencies | Adds `IPPL_ENABLE_FINUFFT`, `IPPL_ENABLE_CUFFTMP`, autotune preset plumbing. |
-| `13950c4f` | FFT / NUFFT | Large FFT backend/transform split plus native NUFFT engine. |
-| `dbbe59a4` | Interpolation | Scatter/gather redesign, binning, tiling, autotune runtime cache. |
-| `2878b90b` | Particle / halo | Rewrites `ParticleSpatialLayout`, adds sort buffers, threads `nghost` through field layout/halo. |
-| `9507a796` | Utility | Timing overhaul, `BufferView`, `Tuning`, `ParallelDispatch`, type utilities. |
-| `f2820064` | Communicate | Serialization hooks, shared buffer handler. |
-| `63d6ccf2` | Integration glue | Field/solver/type updates for new FFT and particle layout APIs. |
-| `99d21ba7` | Alpine PIF | Adds ElectrostaticPIF examples. |
-| `f0560f76` | Unit tests | Adds NUFFT, interpolation, binning, particle update tests. |
-| `927cac7b` | Integration tests | Updates particle benchmark and adds scaling benchmark. |
-
-After that, the branch has many fixup/regression commits. Several are descriptive and likely fold into the earlier subsystem commits; some are separable:
-
-| Commit | Area | Split relevance |
-| --- | --- | --- |
-| `40438d17` | PCG / preconditioner | Good candidate for an independent PR: hoists per-iteration allocations out of solver loop. |
-| `be5f794c` | PCG / PoissonCG | Follow-up to the PCG allocation PR: pass `Field` by reference through `OperatorF`. |
-| `95762403` | PIF examples | Consolidates pruned variants into parameterized source. Belongs with PIF examples. |
-| `0e7bc58d`, `6c1c48ad` | FFT / NUFFT | FINUFFT guards for CUDA + 2D unit-test build. Belong with NUFFT/FINUFFT PR. |
-| `cec1ca61` | Cleanup | Doxygen, formatting, size-type consistency. Should be folded into relevant split PRs or kept as a final cleanup PR. |
-
-## Dependency Observations
-
-### NUFFT Depends On Interpolation
-
-`src/FFT/NUFFT/NativeNUFFT.h` includes and uses:
-
-- `Interpolation/Scatter/ScatterConfig.h`
-- `Interpolation/Gather/GatherConfig.h`
-- scatter/gather kernels through the new interpolation dispatch
-
-That means native NUFFT cannot be cleanly reviewed before the higher-order scatter/gather layer.
-
-### PIF Depends On NUFFT
-
-The Alpine PIF examples use:
-
-- `ippl::FFT<ippl::NUFFTransform, Field_t>`
-- `scatterPIFNUFFT`
-- `gatherPIFNUFFT`
-
-So PIF examples should come after the NUFFT transform API.
-
-### Particle Update And Communicate Are Coupled
-
-The particle update rewrite uses:
-
-- `ParticleAttrib::serialize` / `deserialize`
-- shared `BufferHandler`
-- direct archive serialization
-- reusable send/recv buffers
-
-This suggests communication and particle update may be one PR unless a clean lower-level communication PR is extracted first.
-
-### `nghost` / Halo Changes Cross Particle And Field
-
-`ParticleSpatialLayout` changes also thread `nghost` through:
-
-- `FieldLayout`
-- `HaloCells`
-- field layout APIs
-
-This is small in line count but high in integration risk. It should be explicitly called out in whichever PR contains particle update.
-
-### PCG Allocation Fix Looks Separately Reviewable
-
-The late commits:
-
-- `40438d17`
-- `be5f794c`
-
-only touch:
-
-- `src/LinearSolvers/PCG.h`
-- `src/LinearSolvers/Preconditioner.h`
-- `src/PoissonSolvers/PoissonCG.h`
-
-This can likely become an independent performance/regression PR, separate from PIF.
-
-## Candidate Split
-
-### PR 1: PCG Allocation / Solver Performance
-
-Scope:
-
-- `src/LinearSolvers/PCG.h`
-- `src/LinearSolvers/Preconditioner.h`
-- `src/PoissonSolvers/PoissonCG.h`
-
-Candidate commits:
-
-- `40438d17`
-- `be5f794c`
-
-Reason:
-
-- Smallest coherent split.
-- Directly addresses the reported PCG slowdown / repeated `cudaMalloc` concern.
-- Does not conceptually depend on PIF, NUFFT, or interpolation.
-
-Risk:
-
-- Need to verify solver convergence and exact iteration behavior.
-
-### PR 2: Communication And Particle Update Infrastructure
-
-Scope:
-
-- `src/Communicate/*`
-- `src/Particle/ParticleSpatialLayout*`
-- `src/Particle/ParticleAttrib*`
-- `src/Particle/ParticleBase*`
-- `src/Particle/ParticleSort.h`
-- `src/Particle/SortBuffer.h`
-- `src/FieldLayout/FieldLayout*`
-- `src/Field/HaloCells*`
-- particle update tests
-
-Candidate commits:
-
-- `2878b90b`
-- `f2820064`
-- relevant parts of `63d6ccf2`
-- relevant parts of `f0560f76`
-- relevant fixup commits after `728af370`
-
-Reason:
-
-- Captures the PIC scaling improvement that is not PIF-specific.
-- Reviewable independently from FFT/NUFFT algorithms.
-
-Risk:
-
-- This is still a substantial behavioral change.
-- Needs multi-rank CPU/GPU particle update and `ParticleSendRecv` coverage.
-
-### PR 3: Higher-Order Scatter/Gather And Autotune
-
-Scope:
-
-- `src/Interpolation/*`
-- `cmake/AutoTunePresets.cmake`
-- `cmake/IpplAutoTunePresets.h.in`
-- `cmake/auto_tune/*`
-- interpolation tests
-
-Candidate commits:
-
-- `7cff674a` only for autotune preset plumbing, not FINUFFT/CUFFTMP if separable.
-- `dbbe59a4`
-- interpolation portions of `f0560f76`
-- `dacdcd9a` if routing legacy CIC through the new framework is included.
-
-Reason:
-
-- This is the algorithmic layer native NUFFT requires.
-- Can be validated without PIF examples.
-
-Risk:
-
-- `dacdcd9a` changes existing CIC scatter/gather behavior by routing legacy paths through the new framework. That may be better as a second PR after the new interpolation layer lands.
-
-### PR 4: FFT Backend / Transform Refactor
-
-Scope:
-
-- `src/FFT/Backend/*`
-- `src/FFT/Transform/CC.h`
-- `src/FFT/Transform/RC.h`
-- `src/FFT/Transform/Trig.h`
-- `src/FFT/Transform/PrunedCC.h`
-- `src/FFT/Transform/PrunedRC.h`
-- `src/FFT/Transform/Common.h`
-- `src/FFT/Traits.h`
-- existing FFT test updates
-
-Candidate commits:
-
-- FFT refactor parts of `13950c4f`
-- FFT portions of `f0560f76`
-- FFT portions of later build fixes
-
-Reason:
-
-- Gives reviewers the backend/transform API without also reviewing NUFFT and PIF.
-
-Risk:
-
-- Current commit `13950c4f` mixes backend refactor with native NUFFT files. This split likely requires path-based extraction or commit surgery.
-
-### PR 5: Native NUFFT And FINUFFT/cuFINUFFT Integration
-
-Scope:
-
-- `src/FFT/NUFFT/*`
-- `src/FFT/Transform/NUFFT.*`
-- FINUFFT/CUFFTMP build switches if not already merged
-- NUFFT tests
-
-Candidate commits:
-
-- NUFFT portions of `13950c4f`
-- FINUFFT/CUFFTMP portions of `7cff674a`
-- NUFFT portions of `f0560f76`
-- `0e7bc58d`
-- `6c1c48ad`
-
-Reason:
-
-- NUFFT depends on PR 3 scatter/gather and PR 4 transform structure.
-
-Risk:
-
-- Native NUFFT depends on new scatter/gather.
-- FINUFFT path has Dim-guard complexity and external dependency complexity.
-
-### PR 6: PIF / Alpine Examples
-
-Scope:
-
-- `alpine/ElectrostaticPIF/*`
-- Alpine manager wiring
-- PIF-specific particle attrib convenience APIs if not included in NUFFT PR
-
-Candidate commits:
-
-- `99d21ba7`
-- `95762403`
-- PIF/example fixups
-
-Reason:
-
-- Examples should be reviewed after core APIs are accepted.
-
-Risk:
-
-- PIF examples currently depend on the full stack: particle update, scatter/gather, FFT transform refactor, NUFFT.
-
-# LUMI Results 
-
-| Benchmark | Problem size | Nodes | Ranks | master | pr501-pcg | pr501-com | pr501-fft | pr501-hosg | pr501-nufft |
-| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
-| FEM | `513_10` | 8 | 64 | 28.23 | 27.77 (-2%) | 27.71 (-2%) | 27.88 (-1%) | 28.23 (0%) | 27.95 (-1%) |
-| FFT | `512_10` | 4 | 32 | 4.39 | 4.41 (0%) | 11.54 (+163%) | 11.53 (+162%) | 11.59 (+164%) | 11.53 (+162%) |
-| FFT | `512_10` | 16 | 128 | 1.65 | 1.65 (0%) | 2.88 (+74%) | 2.88 (+75%) | 2.93 (+78%) | 2.93 (+78%) |
-| PCG | `512_10` | 1 | 8 | 72.31 | 70.01 (-3%) |  | 76.99 (+6%) |  | 76.75 (+6%) |
-| PCG | `512_10` | 4 | 32 | 34.60 | 32.93 (-5%) | 43.71 (+26%) | 33.62 (-3%) | 36.32 (+5%) | 33.55 (-3%) |
-| PCG | `512_10` | 64 | 512 | 25.00 | 23.85 (-5%) | 22.89 (-8%) | 22.40 (-10%) | 22.74 (-9%) | 22.58 (-10%) |
-
-
-## Update Timer Investigation
-
-Observed issue:
-
-- On LUMI, the `update` / `updateParticle` timer can increase strongly when moving from `pr501-pcg` to `pr501-communication-particle-update` / `pr501-com`.
-- A first code inspection did not find an FFT solver change in this branch step: `pr501-pcg..pr501-communication-particle-update` has no changes under `src/FFT` or `src/PoissonSolvers`.
-- The branch step does change particle migration, communication archives, field layout / halo `nghost` plumbing, and timing infrastructure.
-
-Most likely immediate explanation:
-
-- In the communication branch, `ParticleSpatialLayout::update()` posts sends and receives, then performs `MPI_Waitall` plus deferred receive finalization/deserialization inside the outer `updateParticle` timer, but outside the child timers `particleSend`, `particleRecv`, and `sendPreprocess`.
-- This can make `updateParticle` look much larger while the child timers look artificially small.
-- On the LUMI timing data this pattern is visible: for `PCG_strong_scaling/512_10/nodes_64`, `locateParticles`, `particleSend`, and `particleRecv` are small in `pr501-com`, while the unexplained part of `updateParticle` is large.
-
-Secondary suspect:
-
-- `Archive` contiguous scalar serialization/deserialization changed from `Kokkos::deep_copy` on unmanaged byte views to custom byte-copy kernels plus fences.
-- On GPU, especially with GPU-aware MPI, that may be slower or introduce extra synchronization compared with the previous optimized device copy path.
-- This would affect particle migration directly, because `ParticleBase::sendToRank()` serializes every active particle attribute before `MPI_Isend`, and receive finalizers deserialize after the wait.
-
-Local Mac OpenMP check:
-
-- Built clean comparison worktrees for:
-  - `pr501-pcg`
-  - `pr501-communication-particle-update`
-- Configuration:
-  - `IPPL_PLATFORMS=OPENMP`
-  - `IPPL_ENABLE_ALPINE=ON`
-  - `IPPL_ENABLE_UNIT_TESTS=ON`
-  - `IPPL_ENABLE_FFT=ON`
-  - Kokkos `5.0.0`
-  - Homebrew LLVM `clang++` and `libomp`
-- AppleClang did not find OpenMP automatically; explicit `libomp` include/library paths were required.
-
-Mac OpenMP miniapp runs:
-
-```bash
-mpiexec -x OMP_NUM_THREADS=4 -x OMP_PROC_BIND=false -n 2 \
-  ./LandauDamping 32 32 32 20000 5 FFT 0.01 LeapFrog --overallocate 2.0 --info 5
-
-mpiexec -x OMP_NUM_THREADS=2 -x OMP_PROC_BIND=false -n 4 \
-  ./LandauDamping 32 32 32 20000 5 FFT 0.01 LeapFrog --overallocate 2.0 --info 5
-```
-
-Mac OpenMP wall-max timings:
-
-| Ranks | Branch | `update` | `updateParticle` | `locateParticles` | `sendPreprocess` | `particleSend` | `particleRecv` |
-| ---: | --- | ---: | ---: | ---: | ---: | ---: | ---: |
-| 2 | `pr501-pcg` | 0.0911 | 0.1090 | 0.0906 | 0.0187 | 0.00431 | 0.00451 |
-| 2 | `pr501-com` | 0.0250 | 0.0266 | 0.00577 | 0.000863 | 0.00150 | 0.000064 |
-| 4 | `pr501-pcg` | 0.1550 | 0.1923 | 0.1303 | 0.0391 | 0.0180 | 0.00923 |
-| 4 | `pr501-com` | 0.0495 | 0.0534 | 0.00872 | 0.00677 | 0.00291 | 0.000131 |
-
-Initial conclusion:
-
-- The Mac OpenMP run does not reproduce the LUMI slowdown. The communication branch is faster locally for this small CPU/OpenMP test.
-- That points away from a generic algorithmic slowdown in the particle update rewrite.
-- The remaining likely causes are GPU/MPI-specific behavior on LUMI: GPU-aware MPI wait behavior, archive device-buffer serialization/deserialization, or timer attribution around deferred receive finalization.
-
-Recommended next diagnostic patch:
-
-- Add or move timers around:
-  - `MPI_Waitall` as `particleWait`, or charge it back into `particleSend`.
-  - deferred receive finalizers as `particleDeserialize`, or charge them into `particleRecv`.
-  - optionally `Archive::serialize(hash)` and `Archive::deserialize(offset)` if the wait/deserialization split is still unclear.
-- Re-run the LUMI cases before changing algorithms, so the slowdown is attributed to wait time, packing/unpacking, or actual communication.
-
-Diagnostic patch applied locally:
-
-- `ParticleSpatialLayout::update()` now splits the previously hidden tail of `updateParticle` into:
-  - `particleWait`: combined send/receive `MPI_Waitall`
-  - `particleFreeBuffers`: communicator buffer release
-  - `particleDeserialize`: deferred receive finalizers / unpacking into particle attributes
-- Expected interpretation on LUMI:
-  - If `particleWait` dominates, the issue is likely GPU-aware MPI progress / synchronization or a load imbalance exposed by the nonblocking exchange.
-  - If `particleDeserialize` dominates, inspect `Archive::deserialize(offset)` and the custom byte-copy kernels.
-  - If neither dominates but `updateParticle` remains high, there is still another untimed section in the particle update path.
-
-### Receive-Side Pre-Reserve Fix
-
-Root cause:
-
-- The LUMI runs showed that the slowdown was not in `MPI_Waitall` and not in communicator buffer release.
-- The hidden cost was almost entirely in deferred receive finalization, reported by the new `particleDeserialize` timer.
-- Each receive finalizer called `ParticleAttrib::deserialize(offset, nrecvs)`.
-- That path called `ParticleAttrib::resize(offset + nrecvs)` for every source rank and every attribute.
-- `Kokkos::resize` preserves existing entries when growing. On GPU this repeatedly copied already-live particle storage during one update step.
-- The cost was amplified when a rank received particles from many source ranks, even when the actual receive-count imbalance was modest.
-
-Fix:
-
-- Add a grow-only `ParticleAttrib::reserve(size_type)` operation.
-- Before running deferred receive finalizers, compute the final receive capacity once:
-  `localAfterDestroy + totalRecvs`.
-- Reserve every particle attribute once to that final capacity.
-- Change receive-side `ParticleAttrib::deserialize(offset, nrecvs)` to call `reserve(offset + nrecvs)` instead of unconditional `resize`.
-- Keep `particleDeserialize` split into:
-  - `particleDeserResize`: one-time attribute reserve before finalizers
-  - `particleDeserCopy`: archive unpack / copy into particle attributes
-
-LUMI validation:
-
-| Ranks | Metric | Before pre-reserve | After pre-reserve | Improvement |
-| ---: | --- | ---: | ---: | ---: |
-| 32 | `updateParticle` wall max | 8.08981 | 0.338958 | 23.9x |
-| 32 | `particleDeserialize` wall max | 7.88320 | 0.0106297 | 741x |
-| 128 | `updateParticle` wall max | 1.59376 | 0.214328 | 7.4x |
-| 128 | `particleDeserialize` wall max | 1.40305 | 0.0132888 | 106x |
-
-Post-fix timer split:
-
-| Ranks | `particleDeserialize` wall max | `particleDeserResize` wall max | `particleDeserCopy` wall max |
-| ---: | ---: | ---: | ---: |
-| 32 | 0.0106297 | 0.0000129 | 0.0105913 |
-| 128 | 0.0132888 | 0.0000196 | 0.0132491 |
-
-Interpretation:
-
-- The repeated preserving resize was the dominant regression.
-- After pre-reserving, deserialize time is small and almost entirely the actual archive copy.
-- `updateParticle` is again balanced across ranks in the 128-rank run:
-  before `max/avg/min = 1.594 / 0.864 / 0.811`,
-  after `max/avg/min = 0.214 / 0.211 / 0.207`.
-- This fix belongs in PR 2 because it is specific to the communication/particle-update receive path and independent of higher-order scatter/gather, FFT, NUFFT, and PIF.
diff --git a/src/Communicate/Archive.hpp b/src/Communicate/Archive.hpp
index bfba0815c..cff0482f0 100644
--- a/src/Communicate/Archive.hpp
+++ b/src/Communicate/Archive.hpp
@@ -18,6 +18,14 @@ namespace ippl {
             }
         }
 
+        /*
+         * Packs a sparse selection of scalar particle-attribute entries into
+         * the archive buffer. `hash(i)` gives the source index in `view_data`
+         * for the i-th item being sent; the functor copies that value as raw
+         * bytes to `buf + wpos + i * sizeof(T)`. This is used by
+         * Archive::serialize(view, hash, nsends) during particle migration,
+         * where only particles listed in the send hash are serialized.
+         */
         template <typename T, typename HashView, typename BufferPtr>
         struct SerializeHashFunctor {
             const T* view_data;

From 23a56431271e130375cf887351fe1ce95ccbbdf0 Mon Sep 17 00:00:00 2001
From: Andreas Adelmann <andreas.adelmann@psi.ch>
Date: Wed, 17 Jun 2026 10:45:12 +0200
Subject: [PATCH 13/14] remove circular dependece

---
 src/Particle/ParticleSpatialLayout.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp
index e0590974c..cbb4c760b 100644
--- a/src/Particle/ParticleSpatialLayout.hpp
+++ b/src/Particle/ParticleSpatialLayout.hpp
@@ -24,8 +24,6 @@
 #include <numeric>
 #include <vector>
 
-#include "Ippl.h"
-
 #include "Utility/IpplTimings.h"
 #include "Utility/ParallelDispatch.h"
 

From 7872582fc19373722e3d6453452ff36adaf29693 Mon Sep 17 00:00:00 2001
From: Andreas Adelmann <andreas.adelmann@psi.ch>
Date: Wed, 17 Jun 2026 11:20:48 +0200
Subject: [PATCH 14/14] improve code doc

---
 src/LinearSolvers/PCG.h | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/LinearSolvers/PCG.h b/src/LinearSolvers/PCG.h
index d2c826f28..049fc6810 100644
--- a/src/LinearSolvers/PCG.h
+++ b/src/LinearSolvers/PCG.h
@@ -337,13 +337,16 @@ namespace ippl {
             , preconditioner_m(nullptr) {};
 
         /*
-         * Extends the CG workspace with the preconditioner result buffers s and
-         * pcond_out so operator() does not allocate per solve. pcond_out keeps
-         * its default NoBcFace BCs: the preconditioner's internal operator
-         * chain therefore never triggers PeriodicFace::apply MPI calls -- if it
-         * did, the global MPI sequence would diverge from the master code path
-         * (where the preconditioner returned a fresh NoBcFace field) and
-         * intermittent multi-rank halo deadlocks would follow.
+         * Allocate the extra PCG work fields once, together with the CG
+         * work fields r, d, and q.
+         *
+         * s and pcond_out are scratch buffers for M^{-1} r. They intentionally
+         * keep the default NoBcFace boundary conditions. Only the search
+         * direction d gets the physical BCs after the preconditioner result has
+         * been copied into it. If these scratch fields inherited periodic BCs,
+         * assignments inside the preconditioner would call PeriodicFace::apply
+         * and add halo MPI exchanges that did not exist when the preconditioner
+         * returned fresh temporary NoBcFace fields.
          */
         void initializeFields(mesh_type& mesh, layout_type& layout) override {
             CG<OperatorRet, LowerRet, UpperRet, UpperLowerRet, InverseDiagRet, DiagRet, FieldLHS,
@@ -524,14 +527,13 @@ namespace ippl {
         std::unique_ptr<preconditioner<FieldLHS>> preconditioner_m;
 
         /*
-         * Extends the CG workspace with the preconditioner result buffers s and
-         * pcond_out so operator() does not allocate per solve.
-         * Preconditioner result buffers, allocated once via initializeFields()
-         * pcond_out keeps its default NoBcFace BCs: the preconditioner's internal operator
-         * chain therefore never triggers PeriodicFace::apply MPI calls -- if it
-         * did, the global MPI sequence would diverge from the master code path
-         * (where the preconditioner returned a fresh NoBcFace field) and
-         * intermittent multi-rank halo deadlocks would follow.
+         * Persistent preconditioner output buffers. They are allocated in
+         * initializeFields() and reused on every solve to avoid per-iteration
+         * Field allocation.
+         *
+         * These buffers are scratch storage, not physical solution fields, so
+         * they must keep NoBcFace BCs. The physically meaningful BCs are applied
+         * only to d after copying pcond_out into it.
          */
 
         lhs_type s;