diff --git a/src/Communicate/Archive.h b/src/Communicate/Archive.h
index 6ec74f946..2e8561e36 100644
--- a/src/Communicate/Archive.h
+++ b/src/Communicate/Archive.h
@@ -9,6 +9,9 @@
 //   that they have type char and thus contain raw bytes, unlike other typed buffers
 //   such as detail::FieldBufferData used by HaloCells.
 //
+//   On CUDA/HIP the internal buffer is allocated directly via cudaMalloc/hipMalloc
+//   so that the pointer is page-aligned (4K) and compatible with MPI IPC.
+//
 #ifndef IPPL_ARCHIVE_H
 #define IPPL_ARCHIVE_H
 
@@ -32,6 +35,7 @@ namespace ippl {
             using pointer_type = typename buffer_type::pointer_type;
 
             Archive(size_type size = 0);
+            ~Archive();
 
             /*!
              * Serialize.
@@ -40,6 +44,10 @@ namespace ippl {
             template <typename T, class... ViewArgs>
             void serialize(const Kokkos::View<T*, ViewArgs...>& view, size_type nsends);
 
+            template <typename T, class... ViewArgs, typename HashView>
+            void serialize(const Kokkos::View<T*, ViewArgs...>& view, const HashView& hash,
+                           size_type nsends);
+
             /*!
              * Serialize vector attributes
              *
@@ -52,6 +60,10 @@ namespace ippl {
             void serialize(const Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view,
                            size_type nsends);
 
+            template <typename T, unsigned Dim, class... ViewArgs, typename HashView>
+            void serialize(const Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view,
+                           const HashView& hash, size_type nsends);
+
             /*!
              * Deserialize.
              * @param view to put data to
@@ -59,6 +71,14 @@ namespace ippl {
             template <typename T, class... ViewArgs>
             void deserialize(Kokkos::View<T*, ViewArgs...>& view, size_type nrecvs);
 
+            template <typename T, class... ViewArgs>
+            void deserialize(Kokkos::View<T*, ViewArgs...>& view, size_type offset,
+                             size_type nrecvs);
+
+            template <typename T, unsigned Dim, class... ViewArgs>
+            void deserialize(Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view, size_type offset,
+                             size_type nrecvs);
+
             /*!
              * Deserialize vector attributes
              *
@@ -71,33 +91,50 @@ namespace ippl {
             void deserialize(Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view, size_type nrecvs);
 
             /*!
-             * @returns a pointer to the data of the buffer
+             * @returns a pointer to the data of the buffer.
+             *          On GPU this is a page-aligned device pointer from cudaMalloc/hipMalloc.
              */
-            pointer_type getBuffer() { return buffer_m.data(); }
+            pointer_type getBuffer() { return bufferData(); }
 
             /*!
-             * @returns the size of the buffer
+             * @returns the number of bytes written so far
              */
             size_type getSize() const { return writepos_m; }
 
-            size_type getBufferSize() const { return buffer_m.size(); }
-
-            void resizeBuffer(size_type size) { Kokkos::resize(buffer_m, size); }
+            /*!
+             * @returns the total capacity of the buffer in bytes
+             */
+            size_type getBufferSize() const { return bufferSize(); }
 
-            void reallocBuffer(size_type size) { Kokkos::realloc(buffer_m, size); }
+            void resizeBuffer(size_type size);
+            void reallocBuffer(size_type size);
 
             void resetWritePos() { writepos_m = 0; }
             void resetReadPos() { readpos_m = 0; }
 
-            ~Archive() = default;
-
         private:
             //! write position for serialization
             size_type writepos_m;
             //! read position for deserialization
             size_type readpos_m;
-            //! serialized data
+
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+            //! Raw device pointer from cudaMalloc/hipMalloc (page-aligned, IPC-safe)
+            pointer_type buffer_ptr_m = nullptr;
+            size_type buffer_size_m   = 0;
+
+            pointer_type bufferData() const { return buffer_ptr_m; }
+            size_type bufferSize() const { return buffer_size_m; }
+
+            void gpuAlloc(size_type size);
+            void gpuFree();
+#else
+            //! serialized data (standard Kokkos view on CPU)
             buffer_type buffer_m;
+
+            pointer_type bufferData() const { return buffer_m.data(); }
+            size_type bufferSize() const { return buffer_m.size(); }
+#endif
         };
     }  // namespace detail
 }  // namespace ippl
diff --git a/src/Communicate/Archive.hpp b/src/Communicate/Archive.hpp
index a55c33c7d..cff0482f0 100644
--- a/src/Communicate/Archive.hpp
+++ b/src/Communicate/Archive.hpp
@@ -4,6 +4,12 @@
 //
 #include "Archive.h"
 
+#if defined(KOKKOS_ENABLE_CUDA)
+#include <cuda_runtime.h>
+#elif defined(KOKKOS_ENABLE_HIP)
+#include <hip/hip_runtime.h>
+#endif
+
 namespace ippl {
     namespace detail {
         KOKKOS_INLINE_FUNCTION void copyBytes(char* dst, const char* src, size_t size) {
@@ -12,117 +18,372 @@ namespace ippl {
             }
         }
 
+        /*
+         * Packs a sparse selection of scalar particle-attribute entries into
+         * the archive buffer. `hash(i)` gives the source index in `view_data`
+         * for the i-th item being sent; the functor copies that value as raw
+         * bytes to `buf + wpos + i * sizeof(T)`. This is used by
+         * Archive::serialize(view, hash, nsends) during particle migration,
+         * where only particles listed in the send hash are serialized.
+         */
+        template <typename T, typename HashView, typename BufferPtr>
+        struct SerializeHashFunctor {
+            const T* view_data;
+            HashView hash;
+            BufferPtr buf;
+            size_t elem_size;
+            size_t wpos;
+
+            KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const {
+                const char* src = reinterpret_cast<const char*>(view_data + hash(i));
+                char* dst       = buf + i * elem_size + wpos;
+                copyBytes(dst, src, elem_size);
+            }
+        };
+
+        template <typename T, unsigned Dim, typename HashView, typename BufferPtr>
+        struct SerializeHashVectorFunctor {
+            const Vector<T, Dim>* view_data;
+            HashView hash;
+            BufferPtr buf;
+            size_t elem_size;
+            size_t wpos;
+
+            KOKKOS_INLINE_FUNCTION void operator()(const size_t i, const size_t d) const {
+                const Vector<T, Dim>* vec = view_data + hash(i);
+                const T value             = (*vec)[d];
+                const char* src           = reinterpret_cast<const char*>(&value);
+                char* dst                 = buf + (Dim * i + d) * elem_size + wpos;
+                copyBytes(dst, src, elem_size);
+            }
+        };
+
+        // =================================================================
+        // GPU buffer management
+        // =================================================================
+
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
+
+        template <class... Properties>
+        void Archive<Properties...>::gpuAlloc(size_type size) {
+            if (size == 0) return;
+#if defined(KOKKOS_ENABLE_HIP)
+            // HSA IPC (used by Cray MPICH for large-message GPU transfers)
+            // requires allocation sizes to be multiples of the GPU page
+            // granularity (64 KB on MI250X / MI300X).  Without this,
+            // hsa_amd_ipc_memory_attach fails with INVALID_ARGUMENT.
+            static constexpr size_type kGranularity = 65536;
+            size = ((size + kGranularity - 1) / kGranularity) * kGranularity;
+#endif
+            void* ptr = nullptr;
+#if defined(KOKKOS_ENABLE_CUDA)
+            (void)cudaMalloc(&ptr, size);
+#else
+            (void)hipMalloc(&ptr, size);
+#endif
+            buffer_ptr_m  = static_cast<pointer_type>(ptr);
+            buffer_size_m = size;
+        }
+
+        template <class... Properties>
+        void Archive<Properties...>::gpuFree() {
+            if (buffer_ptr_m) {
+#if defined(KOKKOS_ENABLE_CUDA)
+                (void)cudaFree(buffer_ptr_m);
+#else
+                (void)hipFree(buffer_ptr_m);
+#endif
+                buffer_ptr_m  = nullptr;
+                buffer_size_m = 0;
+            }
+        }
+
+        template <class... Properties>
+        Archive<Properties...>::Archive(size_type size)
+            : writepos_m(0)
+            , readpos_m(0) {
+            gpuAlloc(size);
+        }
+
+        template <class... Properties>
+        Archive<Properties...>::~Archive() {
+            gpuFree();
+        }
+
+        template <class... Properties>
+        void Archive<Properties...>::resizeBuffer(size_type size) {
+            if (size <= buffer_size_m) return;
+
+#if defined(KOKKOS_ENABLE_HIP)
+            static constexpr size_type kGranularity = 65536;
+            size = ((size + kGranularity - 1) / kGranularity) * kGranularity;
+#endif
+            pointer_type new_ptr = nullptr;
+            void* vptr           = nullptr;
+#if defined(KOKKOS_ENABLE_CUDA)
+            (void)cudaMalloc(&vptr, size);
+#else
+            (void)hipMalloc(&vptr, size);
+#endif
+            new_ptr = static_cast<pointer_type>(vptr);
+
+            if (buffer_ptr_m && buffer_size_m > 0) {
+#if defined(KOKKOS_ENABLE_CUDA)
+                (void)cudaMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, cudaMemcpyDeviceToDevice);
+                (void)cudaFree(buffer_ptr_m);
+#else
+                (void)hipMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, hipMemcpyDeviceToDevice);
+                (void)hipFree(buffer_ptr_m);
+#endif
+            }
+
+            buffer_ptr_m  = new_ptr;
+            buffer_size_m = size;
+        }
+
+        template <class... Properties>
+        void Archive<Properties...>::reallocBuffer(size_type size) {
+            gpuFree();
+            gpuAlloc(size);
+        }
+
+#else  // CPU path
+
         template <class... Properties>
         Archive<Properties...>::Archive(size_type size)
             : writepos_m(0)
             , readpos_m(0)
             , buffer_m("buffer", size) {}
 
-        // -----------------------------------
-        // Scalar serialize
+        template <class... Properties>
+        Archive<Properties...>::~Archive() = default;
+
+        template <class... Properties>
+        void Archive<Properties...>::resizeBuffer(size_type size) {
+            Kokkos::resize(buffer_m, size);
+        }
+
+        template <class... Properties>
+        void Archive<Properties...>::reallocBuffer(size_type size) {
+            Kokkos::realloc(buffer_m, size);
+        }
+
+#endif  // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
+
+        // =================================================================
+        // Serialize — scalar
+        // =================================================================
+
         template <class... Properties>
         template <typename T, class... ViewArgs>
         void Archive<Properties...>::serialize(const Kokkos::View<T*, ViewArgs...>& view,
                                                size_type nsends) {
-            constexpr size_t size = sizeof(T);
-            char* dst_ptr         = (char*)(buffer_m.data()) + writepos_m;
-            char* src_ptr         = (char*)(view.data());
-            assert(writepos_m + (nsends * size) <= buffer_m.size());
-            // construct temp views of the src/dst buffers of the correct size (bytes)
-            using src_view_type =
-                Kokkos::View<char*, typename Kokkos::View<T*, ViewArgs...>::memory_space,
-                             Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-            using dst_view_type =
-                Kokkos::View<char*, typename buffer_type::memory_space,
-                             Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-            src_view_type src_view(src_ptr, size * nsends);
-            dst_view_type dst_view(dst_ptr, size * nsends);
-            Kokkos::deep_copy(dst_view, src_view);
+            using exec_space  = typename Kokkos::View<T*, ViewArgs...>::execution_space;
+            using policy_type = Kokkos::RangePolicy<exec_space>;
+
+            size_t size    = sizeof(T);
+            auto base      = bufferData();
+            auto writepos  = writepos_m;
+            Kokkos::parallel_for(
+                "Archive::serialize()", policy_type(0, nsends), KOKKOS_LAMBDA(const size_type i) {
+                    const char* src = reinterpret_cast<const char*>(view.data() + i);
+                    char* dst       = base + i * size + writepos;
+                    copyBytes(dst, src, size);
+                });
             Kokkos::fence();
-            writepos_m += (nsends * size);
+            writepos_m += size * nsends;
         }
 
-        // -----------------------------------
-        // Vector serialize
+        // =================================================================
+        // Serialize — scalar with hash
+        // =================================================================
+
+        template <class... Properties>
+        template <typename T, class... ViewArgs, typename HashView>
+        void Archive<Properties...>::serialize(const Kokkos::View<T*, ViewArgs...>& view,
+                                               const HashView& hash, size_type nsends) {
+            using exec_space  = HashView::execution_space;
+            using policy_type = Kokkos::RangePolicy<exec_space>;
+            using BufferPtr   = pointer_type;
+
+            SerializeHashFunctor<T, HashView, BufferPtr> f{view.data(), hash, bufferData(),
+                                                           sizeof(T), writepos_m};
+
+            Kokkos::parallel_for("Archive::serialize(hash)", policy_type(0, nsends), f);
+            Kokkos::fence();
+            writepos_m += sizeof(T) * nsends;
+        }
+
+        // =================================================================
+        // Serialize — vector
+        // =================================================================
+
         template <class... Properties>
         template <typename T, unsigned Dim, class... ViewArgs>
         void Archive<Properties...>::serialize(
             const Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view, size_type nsends) {
-            constexpr size_t size         = sizeof(T);
-            char* dst_ptr                 = (char*)(buffer_m.data());
-            ippl::Vector<T, Dim>* src_ptr = view.data();
-            auto wp                       = writepos_m;
-            // The Kokkos range policies expect int64
-            // so we have to explicitly specify size_type (uint64)
             using exec_space = typename Kokkos::View<T*, ViewArgs...>::execution_space;
+
+            size_t size    = sizeof(T);
+            auto base      = bufferData();
+            auto writepos  = writepos_m;
+            // Default index type for range policies is int64,
+            // so we have to explicitly specify size_type (uint64)
             using mdrange_t =
                 Kokkos::MDRangePolicy<Kokkos::Rank<2>, Kokkos::IndexType<size_type>, exec_space>;
             Kokkos::parallel_for(
-                "Archive::serialize()", mdrange_t({0, 0}, {(long int)nsends, Dim}),
+                "Archive::serialize()",
+                // The constructor for Kokkos range policies always
+                // expects int64 regardless of index type provided
+                // by template parameters, so the typecast is necessary
+                // to avoid compiler warnings
+                mdrange_t({0, 0}, {(long int)nsends, Dim}),
                 KOKKOS_LAMBDA(const size_type i, const size_t d) {
-                    const char* src = reinterpret_cast<const char*>(&src_ptr[i][d]);
-                    char* dst       = dst_ptr + (Dim * i + d) * size + wp;
+                    const T value   = view.data()[i][d];
+                    const char* src = reinterpret_cast<const char*>(&value);
+                    char* dst       = base + (Dim * i + d) * size + writepos;
                     copyBytes(dst, src, size);
                 });
+            Kokkos::fence();
+            writepos_m += Dim * size * nsends;
+        }
+
+        // =================================================================
+        // Serialize — vector with hash
+        // =================================================================
 
+        template <class... Properties>
+        template <typename T, unsigned Dim, class... ViewArgs, typename HashView>
+        void Archive<Properties...>::serialize(
+            const Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view, const HashView& hash,
+            size_type nsends) {
+            using exec_space = typename HashView::execution_space;
+            size_t size      = sizeof(T);
+            using BufferPtr  = pointer_type;
+            using mdrange_t =
+                Kokkos::MDRangePolicy<Kokkos::Rank<2>, Kokkos::IndexType<size_type>, exec_space>;
+
+            SerializeHashVectorFunctor<T, Dim, HashView, BufferPtr> f{
+                view.data(), hash, bufferData(), size, writepos_m};
+
+            Kokkos::parallel_for("Archive::serialize(hash, vector)",
+                                 mdrange_t({0, 0}, {(long int)nsends, Dim}), f);
             Kokkos::fence();
             writepos_m += Dim * size * nsends;
         }
 
-        // -----------------------------------
-        // Scalar Deserialize
+        // =================================================================
+        // Deserialize — scalar
+        // =================================================================
+
         template <class... Properties>
         template <typename T, class... ViewArgs>
         void Archive<Properties...>::deserialize(Kokkos::View<T*, ViewArgs...>& view,
                                                  size_type nrecvs) {
-            // if we have to enlarge the destination view
+            using exec_space  = typename Kokkos::View<T*, ViewArgs...>::execution_space;
+            using policy_type = Kokkos::RangePolicy<exec_space>;
+
+            size_t size = sizeof(T);
             if (nrecvs > view.extent(0)) {
                 Kokkos::realloc(view, nrecvs);
             }
-            //
-            constexpr size_t size = sizeof(T);
-            char* src_ptr         = (char*)(buffer_m.data()) + readpos_m;
-            char* dst_ptr         = (char*)(view.data());
-            assert(readpos_m + (nrecvs * size) <= buffer_m.size());
-            // construct temp views of the src/dst buffers of the correct size (bytes)
-            using src_view_type =
-                Kokkos::View<char*, typename buffer_type::memory_space,
-                             Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-            using dst_view_type =
-                Kokkos::View<char*, typename Kokkos::View<T*, ViewArgs...>::memory_space,
-                             Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-            src_view_type src_view(src_ptr, size * nrecvs);
-            dst_view_type dst_view(dst_ptr, size * nrecvs);
-            Kokkos::deep_copy(dst_view, src_view);
+            auto base    = bufferData();
+            auto readpos = readpos_m;
+            Kokkos::parallel_for(
+                "Archive::deserialize()", policy_type(0, nrecvs), KOKKOS_LAMBDA(const size_type i) {
+                    const char* src = base + i * size + readpos;
+                    char* dst       = reinterpret_cast<char*>(view.data() + i);
+                    copyBytes(dst, src, size);
+                });
+            // Wait for deserialization kernel to complete
+            // (as with serialization kernels)
             Kokkos::fence();
-            readpos_m += (nrecvs * size);
+            readpos_m += size * nrecvs;
         }
 
-        // -----------------------------------
-        // Vector Deserialize
+        // =================================================================
+        // Deserialize — vector
+        // =================================================================
+
         template <class... Properties>
         template <typename T, unsigned Dim, class... ViewArgs>
         void Archive<Properties...>::deserialize(Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view,
-                                                 size_type nrecvs) 
-        {
-            // if we have to enlarge the destination view
+                                                 size_type nrecvs) {
+            using exec_space = typename Kokkos::View<T*, ViewArgs...>::execution_space;
+
+            size_t size = sizeof(T);
             if (nrecvs > view.extent(0)) {
                 Kokkos::realloc(view, nrecvs);
             }
-            //
-            constexpr size_t size         = sizeof(T);
-            char* src_ptr                 = (char*)(buffer_m.data());
-            ippl::Vector<T, Dim>* dst_ptr = view.data();
-            auto rp                       = readpos_m;
-            using exec_space              = typename Kokkos::View<T*, ViewArgs...>::execution_space;
             using mdrange_t =
                 Kokkos::MDRangePolicy<Kokkos::Rank<2>, Kokkos::IndexType<size_type>, exec_space>;
+            auto base    = bufferData();
+            auto readpos = readpos_m;
             Kokkos::parallel_for(
                 "Archive::deserialize()", mdrange_t({0, 0}, {(long int)nrecvs, Dim}),
                 KOKKOS_LAMBDA(const size_type i, const size_t d) {
-                    const char* src = src_ptr + (Dim * i + d) * size + rp;
-                    char* dst       = reinterpret_cast<char*>(&dst_ptr[i][d]);
+                    const char* src = base + (Dim * i + d) * size + readpos;
+                    T value;
+                    char* dst = reinterpret_cast<char*>(&value);
+                    copyBytes(dst, src, size);
+                    view.data()[i](d) = value;
+                });
+            Kokkos::fence();
+            readpos_m += Dim * size * nrecvs;
+        }
+
+        // =================================================================
+        // Deserialize — scalar with offset
+        // =================================================================
+
+        template <class... Properties>
+        template <typename T, class... ViewArgs>
+        void Archive<Properties...>::deserialize(Kokkos::View<T*, ViewArgs...>& view,
+                                                 size_type offset, size_type nrecvs) {
+            using exec_space  = typename Kokkos::View<T*, ViewArgs...>::execution_space;
+            using policy_type = Kokkos::RangePolicy<exec_space>;
+            size_t size       = sizeof(T);
+            if (offset + nrecvs > view.extent(0)) {
+                Kokkos::resize(view, offset + nrecvs);
+            }
+            auto base    = bufferData();
+            auto readpos = readpos_m;
+            Kokkos::parallel_for(
+                "Archive::deserialize(offset)", policy_type(0, nrecvs),
+                KOKKOS_LAMBDA(const size_type i) {
+                    const char* src = base + i * size + readpos;
+                    char* dst       = reinterpret_cast<char*>(view.data() + offset + i);
+                    copyBytes(dst, src, size);
+                });
+            Kokkos::fence();
+            readpos_m += size * nrecvs;
+        }
+
+        // =================================================================
+        // Deserialize — vector with offset
+        // =================================================================
+
+        template <class... Properties>
+        template <typename T, unsigned Dim, class... ViewArgs>
+        void Archive<Properties...>::deserialize(Kokkos::View<Vector<T, Dim>*, ViewArgs...>& view,
+                                                 size_type offset, size_type nrecvs) {
+            using exec_space = typename Kokkos::View<T*, ViewArgs...>::execution_space;
+            size_t size      = sizeof(T);
+            if (offset + nrecvs > view.extent(0)) {
+                Kokkos::resize(view, offset + nrecvs);
+            }
+            using mdrange_t =
+                Kokkos::MDRangePolicy<Kokkos::Rank<2>, Kokkos::IndexType<size_type>, exec_space>;
+            auto base    = bufferData();
+            auto readpos = readpos_m;
+            Kokkos::parallel_for(
+                "Archive::deserialize(offset, vector)", mdrange_t({0, 0}, {(long int)nrecvs, Dim}),
+                KOKKOS_LAMBDA(const size_type i, const size_t d) {
+                    const char* src = base + (Dim * i + d) * size + readpos;
+                    T value;
+                    char* dst = reinterpret_cast<char*>(&value);
                     copyBytes(dst, src, size);
+                    view.data()[offset + i](d) = value;
                 });
             Kokkos::fence();
             readpos_m += Dim * size * nrecvs;
diff --git a/src/Communicate/BufferHandler.hpp b/src/Communicate/BufferHandler.hpp
index c6d57f0a3..e27369c06 100644
--- a/src/Communicate/BufferHandler.hpp
+++ b/src/Communicate/BufferHandler.hpp
@@ -1,6 +1,8 @@
 #ifndef IPPL_BUFFER_HANDLER_HPP
 #define IPPL_BUFFER_HANDLER_HPP
 
+#include <algorithm>
+
 namespace ippl {
 
     template <typename MemorySpace>
@@ -11,6 +13,15 @@ namespace ippl {
     DefaultBufferHandler<MemorySpace>::getBuffer(size_type size, double overallocation) {
         size_type requiredSize = static_cast<size_type>(size * overallocation);
 
+        // Round up to page granularity to avoid 0-byte or tiny allocations
+        // and to align with the GPU-aware MPI registration cache.
+        constexpr size_type PAGE_SIZE = 4096;
+        if (requiredSize < PAGE_SIZE) {
+            requiredSize = PAGE_SIZE;
+        } else {
+            requiredSize = ((requiredSize + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE;
+        }
+
         auto freeBuffer = findFreeBuffer(requiredSize);
         if (freeBuffer != nullptr) {
             return getFreeBuffer(freeBuffer);
@@ -120,17 +131,12 @@ namespace ippl {
     template <typename MemorySpace>
     typename DefaultBufferHandler<MemorySpace>::buffer_type
     DefaultBufferHandler<MemorySpace>::reallocateLargestFreeBuffer(size_type requiredSize) {
-        auto largest_it    = std::prev(free_buffers.end());
-        buffer_type buffer = *largest_it;
-
-        freeSize_m -= buffer->getBufferSize();
-        usedSize_m += requiredSize;
-
-        free_buffers.erase(buffer);
-        buffer->reallocBuffer(requiredSize);
-
-        used_buffers.insert(buffer);
-        return buffer;
+        // Always allocate a
+        // fresh buffer instead of reallocating the largest free one: a
+        // free + alloc cycle can release the device pointer that a GPU-aware
+        // MPI's registration cache still holds. Keeping the old buffers in the free
+        // pool and returning a new buffer at a new address sidesteps this.
+        return allocateNewBuffer(requiredSize);
     }
 
     template <typename MemorySpace>
@@ -145,4 +151,4 @@ namespace ippl {
 
 }  // namespace ippl
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/Communicate/Buffers.cpp b/src/Communicate/Buffers.cpp
index 026633fec..e99100bc2 100644
--- a/src/Communicate/Buffers.cpp
+++ b/src/Communicate/Buffers.cpp
@@ -32,13 +32,13 @@ namespace ippl {
         }
 
         void Communicator::deleteAllBuffers() {
-            buffer_handlers_m->forAll([]<typename BufferHandler>(BufferHandler&& bh) {
+            getBufferHandler().forAll([]<typename BufferHandler>(BufferHandler&& bh) {
                 bh.deleteAllBuffers();
             });
         }
 
         void Communicator::freeAllBuffers() {
-            buffer_handlers_m->forAll([]<typename BufferHandler>(BufferHandler&& bh) {
+            getBufferHandler().forAll([]<typename BufferHandler>(BufferHandler&& bh) {
                 bh.freeAllBuffers();
             });
         }
diff --git a/src/Communicate/Buffers.hpp b/src/Communicate/Buffers.hpp
index 98c0b513f..597461b4d 100644
--- a/src/Communicate/Buffers.hpp
+++ b/src/Communicate/Buffers.hpp
@@ -20,21 +20,23 @@
 //   exchanging particle data between ranks.
 //
 
+#include "Utility/IpplTimings.h"
 namespace ippl {
     namespace mpi {
 
         template <typename MemorySpace, typename T>
         Communicator::buffer_type<MemorySpace> Communicator::getBuffer(size_type size,
                                                                        double overallocation) {
-            auto& buffer_handler = buffer_handlers_m->get<MemorySpace>();
+            auto& buffer_handler = getBufferHandler().get<MemorySpace>();
 
-            return buffer_handler.getBuffer(size * sizeof(T),
-                                            std::max(overallocation, defaultOveralloc_m));
+            auto buf = buffer_handler.getBuffer(size * sizeof(T),
+                                                std::max(overallocation, defaultOveralloc_m));
+            return buf;
         }
 
         template <typename MemorySpace>
         void Communicator::freeBuffer(Communicator::buffer_type<MemorySpace> buffer) {
-            auto& buffer_handler = buffer_handlers_m->get<MemorySpace>();
+            auto& buffer_handler = getBufferHandler().get<MemorySpace>();
 
             buffer_handler.freeBuffer(buffer);
         }
diff --git a/src/Communicate/Communicator.cpp b/src/Communicate/Communicator.cpp
index 2c33f939f..e1a8b777b 100644
--- a/src/Communicate/Communicator.cpp
+++ b/src/Communicate/Communicator.cpp
@@ -3,25 +3,27 @@
 
 namespace ippl::mpi {
 
+    namespace {
+        // Populate rank_m / size_m from the live communicator.
+        void cacheRankAndSize(const MPI_Comm& comm, int& rank, int& size) {
+            MPI_Comm_rank(comm, &rank);
+            MPI_Comm_size(comm, &size);
+        }
+    }  // namespace
+
     Communicator::Communicator()
-        : buffer_handlers_m(get_buffer_handler_instance())
-        , comm_m(new MPI_Comm(MPI_COMM_WORLD)) {
-        MPI_Comm_rank(*comm_m, &rank_m);
-        MPI_Comm_size(*comm_m, &size_m);
+        : comm_m(new MPI_Comm(MPI_COMM_WORLD)) {
+        cacheRankAndSize(*comm_m, rank_m, size_m);
     }
 
     Communicator::Communicator(MPI_Comm comm) {
-        buffer_handlers_m = get_buffer_handler_instance();
-        comm_m            = std::make_shared<MPI_Comm>(comm);
-        MPI_Comm_rank(*comm_m, &rank_m);
-        MPI_Comm_size(*comm_m, &size_m);
+        comm_m = std::make_shared<MPI_Comm>(comm);
+        cacheRankAndSize(*comm_m, rank_m, size_m);
     }
 
     Communicator& Communicator::operator=(MPI_Comm comm) {
-        buffer_handlers_m = get_buffer_handler_instance();
-        comm_m            = std::make_shared<MPI_Comm>(comm);
-        MPI_Comm_rank(*comm_m, &rank_m);
-        MPI_Comm_size(*comm_m, &size_m);
+        comm_m = std::make_shared<MPI_Comm>(comm);
+        cacheRankAndSize(*comm_m, rank_m, size_m);
         return *this;
     }
 
@@ -41,14 +43,4 @@ namespace ippl::mpi {
         return (flag != 0);
     }
 
-    // ---------------------------------------
-    // singleton access to buffer manager
-    // ---------------------------------------
-    std::shared_ptr<Communicator::buffer_handler_type> Communicator::get_buffer_handler_instance() {
-        static std::shared_ptr<Communicator::buffer_handler_type> comm_buff_handler_ptr{nullptr};
-        if (comm_buff_handler_ptr == nullptr) {
-            comm_buff_handler_ptr = std::make_shared<Communicator::buffer_handler_type>();
-        }
-        return comm_buff_handler_ptr;
-    }
 }  // namespace ippl::mpi
diff --git a/src/Communicate/Communicator.h b/src/Communicate/Communicator.h
index 9819ab3bd..7685d20d5 100644
--- a/src/Communicate/Communicator.h
+++ b/src/Communicate/Communicator.h
@@ -15,6 +15,7 @@
 
 ////////////////////////////////////////////////
 // For message size check; see below
+
 #include <climits>
 #include <cstdlib>
 
@@ -27,7 +28,6 @@
 
 namespace ippl {
     namespace mpi {
-
         class Communicator : public TagMaker {
         public:
             Communicator();
@@ -157,16 +157,21 @@ namespace ippl {
 
             const MPI_Comm& getCommunicator() const noexcept { return *comm_m; }
 
+            // MPI uses int for byte counts; messages exceeding INT_MAX must
+            // be split. This shared check aborts the run with a single
+            // diagnostic instead of silently truncating.
+            void assertMessageSize(size_type msize) const {
+                if (msize > static_cast<size_type>(INT_MAX)) {
+                    std::cerr << "Communicator: message size " << msize
+                              << " bytes exceeds INT_MAX (" << INT_MAX << ")\n";
+                    MPI_Abort(*comm_m, -1);
+                }
+            }
+
             template <class Buffer, typename Archive>
             void recv(int src, int tag, Buffer& buffer, Archive& ar, size_type msize,
                       size_type nrecvs) {
-                // Temporary fix. MPI communication seems to have problems when the
-                // count argument exceeds the range of int, so large messages should
-                // be split into smaller messages
-                if (msize > INT_MAX) {
-                    std::cerr << "Message size exceeds range of int" << std::endl;
-                    this->abort();
-                }
+                assertMessageSize(msize);
                 MPI_Status status;
                 MPI_Recv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &status);
 
@@ -176,20 +181,27 @@ namespace ippl {
             template <class Buffer, typename Archive>
             void isend(int dest, int tag, Buffer& buffer, Archive& ar, MPI_Request& request,
                        size_type nsends) {
-                if (ar.getSize() > INT_MAX) {
-                    std::cerr << "Message size exceeds range of int" << std::endl;
-                    this->abort();
-                }
+                assertMessageSize(ar.getSize());
                 buffer.serialize(ar, nsends);
                 MPI_Isend(ar.getBuffer(), ar.getSize(), MPI_BYTE, dest, tag, *comm_m, &request);
             }
 
+            template <typename Archive>
+            void isend(int dest, int tag, Archive& ar, MPI_Request& request) {
+                assertMessageSize(ar.getSize());
+                MPI_Isend(ar.getBuffer(), ar.getSize(), MPI_BYTE, dest, tag, *comm_m, &request);
+            }
+
+            template <typename Archive>
+            void recv(int src, int tag, Archive& ar, size_type msize) {
+                assertMessageSize(msize);
+                MPI_Status status;
+                MPI_Recv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &status);
+            }
+
             template <typename Archive>
             void irecv(int src, int tag, Archive& ar, MPI_Request& request, size_type msize) {
-                if (msize > INT_MAX) {
-                    std::cerr << "Message size exceeds range of int" << std::endl;
-                    this->abort();
-                }
+                assertMessageSize(msize);
                 MPI_Irecv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &request);
             }
 
@@ -201,7 +213,11 @@ namespace ippl {
             std::vector<LogEntry> gatherLogsFromAllRanks(const std::vector<LogEntry>& localLogs);
             void writeLogsToFile(const std::vector<LogEntry>& allLogs, const std::string& filename);
 
-            std::shared_ptr<buffer_handler_type> buffer_handlers_m;
+            static buffer_handler_type& getBufferHandler() {
+                static buffer_handler_type handler;
+                return handler;
+            }
+
             double defaultOveralloc_m = 1.0;
 
             /////////////////////////////////////////////////////////////////////////////////////
@@ -210,12 +226,10 @@ namespace ippl {
             std::shared_ptr<MPI_Comm> comm_m;
             int size_m;
             int rank_m;
-
-        public:
-            std::shared_ptr<buffer_handler_type> get_buffer_handler_instance();
         };
 
     }  // namespace mpi
+
 }  // namespace ippl
 
 #include "Communicate/Collectives.hpp"
diff --git a/src/Communicate/CommunicatorLogging.cpp b/src/Communicate/CommunicatorLogging.cpp
index 541ff4074..39c0197e8 100644
--- a/src/Communicate/CommunicatorLogging.cpp
+++ b/src/Communicate/CommunicatorLogging.cpp
@@ -7,8 +7,16 @@
 
 #include "Communicate/Communicator.h"
 #include "Communicate/LogEntry.h"
+#include "Communicate/LoggingBufferHandler.h"
 
 namespace ippl::mpi {
+
+    template <class MemorySpace>
+    struct is_a_logger : std::false_type {};
+
+    template <class MemorySpace>
+    struct is_a_logger<LoggingBufferHandler<MemorySpace>> : std::true_type {};
+
     void Communicator::printLogs(const std::string& filename) {
         std::vector<LogEntry> localLogs = gatherLocalLogs();
 
@@ -24,20 +32,17 @@ namespace ippl::mpi {
         }
     }
 
-    template <class MemorySpace>
-    struct is_a_logger : std::false_type {};
-
-    template <class MemorySpace>
-    struct is_a_logger<LoggingBufferHandler<MemorySpace> > : std::true_type {};
-
     std::vector<LogEntry> Communicator::gatherLocalLogs() {
         std::vector<LogEntry> localLogs;
-        if constexpr (is_a_logger<buffer_handler_type>::value) {
-            buffer_handlers_m->forAll([&](auto& loggingHandler) {
-                const auto& logs = loggingHandler.getLogs();
+
+        getBufferHandler().forAll([&](auto& handler) {
+            using handler_t = std::decay_t<decltype(handler)>;
+            if constexpr (is_a_logger<handler_t>::value) {
+                const auto& logs = handler.getLogs();
                 localLogs.insert(localLogs.end(), logs.begin(), logs.end());
-            });
-        }
+            }
+        });
+
         return localLogs;
     }
 
@@ -88,11 +93,9 @@ namespace ippl::mpi {
         size_t offset = 0;
 
         while (offset < buffer.size()) {
-            LogEntry logEntry = LogEntry::deserialize(buffer, offset);
-
-            logs.push_back(logEntry);
-
-            offset += logEntry.serialize().size();
+            // deserializeAdvance walks offset past the consumed bytes -
+            // avoids the O(N^2) blowup of re-serializing each entry
+            logs.push_back(LogEntry::deserializeAdvance(buffer, offset));
         }
         return logs;
     }
diff --git a/src/Communicate/Environment.cpp b/src/Communicate/Environment.cpp
index 8e19df365..706c821f1 100644
--- a/src/Communicate/Environment.cpp
+++ b/src/Communicate/Environment.cpp
@@ -9,9 +9,23 @@ namespace ippl {
     namespace mpi {
 
         Environment::Environment(int& argc, char**& argv, const MPI_Comm& comm)
-            : comm_m(comm) {
+            : comm_m(comm)
+            , threadMultiple_m(false) {
             if (!initialized()) {
-                MPI_Init(&argc, &argv);
+                int provided = MPI_THREAD_SINGLE;
+                int rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+                if (rc != MPI_SUCCESS) {
+                    std::cerr << "MPI_Init_thread failed (rc=" << rc << ")" << std::endl;
+                    std::exit(EXIT_FAILURE);
+                }
+                threadMultiple_m = (provided >= MPI_THREAD_MULTIPLE);
+                if (!threadMultiple_m) {
+                    int rank = 0;
+                    MPI_Comm_rank(comm_m, &rank);
+                    if (rank == 0) {
+                        std::cerr << "MPI doesn't support MPI_THREAD_MULTIPLE!" << std::endl;
+                    }
+                }
             }
         }
 
diff --git a/src/Communicate/Environment.h b/src/Communicate/Environment.h
index fd03a9ee4..bad195df4 100644
--- a/src/Communicate/Environment.h
+++ b/src/Communicate/Environment.h
@@ -23,10 +23,13 @@ namespace ippl {
 
             static bool finalized();
 
+            bool threadMultiple() noexcept { return threadMultiple_m; }
+
             void abort(int errorcode = -1) noexcept { MPI_Abort(comm_m, errorcode); }
 
         private:
             MPI_Comm comm_m;
+            bool threadMultiple_m;
         };
     }  // namespace mpi
 }  // namespace ippl
diff --git a/src/Communicate/LogEntry.cpp b/src/Communicate/LogEntry.cpp
index 444221d6d..66539e879 100644
--- a/src/Communicate/LogEntry.cpp
+++ b/src/Communicate/LogEntry.cpp
@@ -40,28 +40,31 @@ namespace ippl {
         return buffer;
     }
 
-    LogEntry LogEntry::deserialize(const std::vector<char>& buffer, size_t offset) {
+    LogEntry LogEntry::deserializeAdvance(const std::vector<char>& buffer, size_t& offset) {
         LogEntry entry;
-        size_t current_pos = offset;
 
-        entry.methodName  = deserializeString(buffer, current_pos);
-        entry.usedSize    = deserializeBasicType<size_t>(buffer, current_pos);
-        entry.freeSize    = deserializeBasicType<size_t>(buffer, current_pos);
-        entry.memorySpace = deserializeString(buffer, current_pos);
-        entry.rank        = deserializeBasicType<int>(buffer, current_pos);
+        entry.methodName  = deserializeString(buffer, offset);
+        entry.usedSize    = deserializeBasicType<size_t>(buffer, offset);
+        entry.freeSize    = deserializeBasicType<size_t>(buffer, offset);
+        entry.memorySpace = deserializeString(buffer, offset);
+        entry.rank        = deserializeBasicType<int>(buffer, offset);
 
-        auto duration   = deserializeBasicType<long long>(buffer, current_pos);
+        auto duration   = deserializeBasicType<long long>(buffer, offset);
         entry.timestamp = std::chrono::time_point<std::chrono::high_resolution_clock>(
             std::chrono::high_resolution_clock::duration(duration));
 
-        size_t mapSize = deserializeBasicType<size_t>(buffer, current_pos);
+        size_t mapSize = deserializeBasicType<size_t>(buffer, offset);
         for (size_t i = 0; i < mapSize; ++i) {
-            std::string key       = deserializeString(buffer, current_pos);
-            std::string value     = deserializeString(buffer, current_pos);
+            std::string key       = deserializeString(buffer, offset);
+            std::string value     = deserializeString(buffer, offset);
             entry.parameters[key] = value;
         }
 
         return entry;
     }
 
+    LogEntry LogEntry::deserialize(const std::vector<char>& buffer, size_t offset) {
+        return deserializeAdvance(buffer, offset);
+    }
+
 }  // namespace ippl
diff --git a/src/Communicate/LogEntry.h b/src/Communicate/LogEntry.h
index a853c3431..1929915c3 100644
--- a/src/Communicate/LogEntry.h
+++ b/src/Communicate/LogEntry.h
@@ -20,6 +20,11 @@ namespace ippl {
 
         std::vector<char> serialize() const;
         static LogEntry deserialize(const std::vector<char>& buffer, size_t offset = 0);
+
+        /// Variant of deserialize() that advances the caller's offset past
+        /// the just-read entry. Lets callers walk a byte-stream of entries
+        /// without re-serializing each one to compute its size.
+        static LogEntry deserializeAdvance(const std::vector<char>& buffer, size_t& offset);
     };
 
     template <typename T>
diff --git a/src/Communicate/LoggingBufferHandler.h b/src/Communicate/LoggingBufferHandler.h
index f6a0bd5d6..95fb418cd 100644
--- a/src/Communicate/LoggingBufferHandler.h
+++ b/src/Communicate/LoggingBufferHandler.h
@@ -1,7 +1,6 @@
 #ifndef IPPL_LOGGING_BUFFER_HANDLER_H
 #define IPPL_LOGGING_BUFFER_HANDLER_H
 
-#include <chrono>
 #include <map>
 #include <memory>
 #include <string>
diff --git a/src/Field/BareField.hpp b/src/Field/BareField.hpp
index 695576af3..14a36eb45 100644
--- a/src/Field/BareField.hpp
+++ b/src/Field/BareField.hpp
@@ -146,7 +146,7 @@ namespace ippl {
     template <typename T, unsigned Dim, class... ViewArgs>
     void BareField<T, Dim, ViewArgs...>::fillHalo() {
         if (layout_m->comm.size() > 1) {
-            halo_m.fillHalo(dview_m, layout_m);
+            halo_m.fillHalo(dview_m, layout_m, nghost_m);
         }
         if (layout_m->isAllPeriodic_m) {
             using Op = typename detail::HaloCells<T, Dim, ViewArgs...>::assign;
@@ -157,7 +157,7 @@ namespace ippl {
     template <typename T, unsigned Dim, class... ViewArgs>
     void BareField<T, Dim, ViewArgs...>::accumulateHalo() {
         if (layout_m->comm.size() > 1) {
-            halo_m.accumulateHalo(dview_m, layout_m);
+            halo_m.accumulateHalo(dview_m, layout_m, nghost_m);
         }
         if (layout_m->isAllPeriodic_m) {
             using Op = typename detail::HaloCells<T, Dim, ViewArgs...>::rhs_plus_assign;
diff --git a/src/Field/HaloCells.h b/src/Field/HaloCells.h
index c4d87b9bd..6e7606083 100644
--- a/src/Field/HaloCells.h
+++ b/src/Field/HaloCells.h
@@ -57,7 +57,7 @@ namespace ippl {
              * @param view the original field data
              * @param layout the field layout storing the domain decomposition
              */
-            void accumulateHalo(view_type& view, Layout_t* layout);
+            void accumulateHalo(view_type& view, Layout_t* layout, int nghost);
 
             /*!
              * Send halo data to internal cells for only the physical cells
@@ -75,7 +75,7 @@ namespace ippl {
              * @param view the original field data
              * @param layout the field layout storing the domain decomposition
              */
-            void fillHalo(view_type&, Layout_t* layout);
+            void fillHalo(view_type&, Layout_t* layout, int nghost);
 
             /*!
              * Pack the field data to be sent into a contiguous array.
@@ -139,7 +139,8 @@ namespace ippl {
              * unpack function call
              */
             template <class Op>
-            void exchangeBoundaries(view_type& view, Layout_t* layout, SendOrder order, int nghost = 1);
+            void exchangeBoundaries(view_type& view, Layout_t* layout, SendOrder order,
+                                    int nghost);
 
             /*!
              * Extract the subview of the original data. This does not copy.
diff --git a/src/Field/HaloCells.hpp b/src/Field/HaloCells.hpp
index eb56642ce..d99ea638f 100644
--- a/src/Field/HaloCells.hpp
+++ b/src/Field/HaloCells.hpp
@@ -7,6 +7,7 @@
 #include <vector>
 
 #include "Utility/IpplException.h"
+#include "Utility/ParallelDispatch.h"
 
 #include "Communicate/Communicator.h"
 
@@ -16,8 +17,8 @@ namespace ippl {
         HaloCells<T, Dim, ViewArgs...>::HaloCells() {}
 
         template <typename T, unsigned Dim, class... ViewArgs>
-        void HaloCells<T, Dim, ViewArgs...>::accumulateHalo(view_type& view, Layout_t* layout) {
-            exchangeBoundaries<lhs_plus_assign>(view, layout, HALO_TO_INTERNAL);
+        void HaloCells<T, Dim, ViewArgs...>::accumulateHalo(view_type& view, Layout_t* layout, int nghost) {
+            exchangeBoundaries<lhs_plus_assign>(view, layout, HALO_TO_INTERNAL, nghost);
         }
 
         template <typename T, unsigned Dim, class... ViewArgs>
@@ -25,8 +26,8 @@ namespace ippl {
             exchangeBoundaries<lhs_plus_assign>(view, layout, HALO_TO_INTERNAL_NOGHOST, nghost);
         }
         template <typename T, unsigned Dim, class... ViewArgs>
-        void HaloCells<T, Dim, ViewArgs...>::fillHalo(view_type& view, Layout_t* layout) {
-            exchangeBoundaries<assign>(view, layout, INTERNAL_TO_HALO);
+        void HaloCells<T, Dim, ViewArgs...>::fillHalo(view_type& view, Layout_t* layout, int nghost) {
+            exchangeBoundaries<assign>(view, layout, INTERNAL_TO_HALO, nghost);
         }
 
         template <typename T, unsigned Dim, class... ViewArgs>
@@ -45,9 +46,9 @@ namespace ippl {
             auto ldom = layout->getLocalNDIndex();
             for (const auto& axis : ldom) {
                 if ((axis.length() == 1) && (Dim != 1)) {
-                    throw std::runtime_error(
-                        "HaloCells: Cannot do neighbour exchange when domain decomposition "
-                        "contains planes!");
+                    throw IpplException(
+                        "HaloCells::exchangeBoundaries",
+                        "Cannot do neighbour exchange when domain decomposition contains planes.");
                 }
             }
 
@@ -62,7 +63,7 @@ namespace ippl {
                 totalRequests += componentNeighbors.size();
             }
 
-            int me=Comm->rank();
+            int me = Comm->rank();
 
             using memory_space = typename view_type::memory_space;
             using buffer_type  = mpi::Communicator::buffer_type<memory_space>;
diff --git a/src/FieldLayout/FieldLayout.h b/src/FieldLayout/FieldLayout.h
index 780040e8b..c8f1c1e2b 100644
--- a/src/FieldLayout/FieldLayout.h
+++ b/src/FieldLayout/FieldLayout.h
@@ -201,7 +201,7 @@ namespace ippl {
         FieldLayout(const mpi::Communicator& = MPI_COMM_WORLD);
 
         FieldLayout(mpi::Communicator, const NDIndex<Dim>& domain, std::array<bool, Dim> decomp,
-                    bool isAllPeriodic = false);
+                    bool isAllPeriodic = false, int nghost = 1);
 
         // Destructor: Everything deletes itself automatically ... the base
         // class destructors inform all the FieldLayoutUser's we're going away.
@@ -213,38 +213,29 @@ namespace ippl {
         // FieldLayout constructors:
 
         void initialize(const NDIndex<Dim>& domain, std::array<bool, Dim> decomp,
-                        bool isAllPeriodic = false);
+                        bool isAllPeriodic = false, int nghost = 1);
 
         // Return the domain.
         const NDIndex<Dim>& getDomain() const { return gDomain_m; }
 
-        // Compare FieldLayouts to see if they represent the same domain; if
-        // dimensionalities are different, the NDIndex operator==() will return
-        // false:
+        // Compare FieldLayouts. Different dimensionalities or different global
+        // domains are not equal; same global domain but different per-rank
+        // local-domain decompositions are also not equal.
         template <unsigned Dim2>
         bool operator==(const FieldLayout<Dim2>& x) const {
-
-            // Throw exception if the domains are not the same
-            if (gDomain_m != x.getDomain()) {
-                throw std::runtime_error("FieldLayout: only FieldLayouts with the same global domain should be compared");
-            }
-
-            return gDomain_m == x.getDomain();
-        }
-
-        bool operator==(const FieldLayout<Dim>& x) const {
-
-            // Throw exception if the domains are not the same
-            if (gDomain_m != x.getDomain()) {
-                throw std::runtime_error("FieldLayout: only FieldLayouts with the same global domain should be compared");
-            }
-
-            for (unsigned int i = 0; i < Dim; ++i) {
-                if (hLocalDomains_m(comm.rank())[i] != x.getLocalNDIndex()[i]) {
+            if constexpr (Dim != Dim2) {
+                return false;
+            } else {
+                if (gDomain_m != x.getDomain()) {
                     return false;
                 }
+                for (unsigned int i = 0; i < Dim; ++i) {
+                    if (hLocalDomains_m(comm.rank())[i] != x.getLocalNDIndex()[i]) {
+                        return false;
+                    }
+                }
+                return true;
             }
-            return true;
         }
 
         // for the requested dimension, report if the distribution is
@@ -330,7 +321,7 @@ namespace ippl {
          * Finds all neighboring ranks based on the field layout
          * @param nghost number of ghost cells (default 1)
          */
-        void findNeighbors(int nghost = 1);
+        void findNeighbors(int nghost);
 
         /*!
          * Adds a neighbor to the neighbor list
@@ -389,6 +380,9 @@ namespace ippl {
         // Minimum width of all the local domains for each dimension
         unsigned int minWidth_m[Dim];
 
+        // Nghost needed for computing send/receive ranges
+        int nghost_m;
+
         void calcWidths();
     };
 
diff --git a/src/FieldLayout/FieldLayout.hpp b/src/FieldLayout/FieldLayout.hpp
index 2bcb1bcd1..ad6ae5a53 100644
--- a/src/FieldLayout/FieldLayout.hpp
+++ b/src/FieldLayout/FieldLayout.hpp
@@ -37,7 +37,8 @@ namespace ippl {
     FieldLayout<Dim>::FieldLayout(const mpi::Communicator& communicator)
         : comm(communicator)
         , dLocalDomains_m("local domains (device)", 0)
-        , hLocalDomains_m(Kokkos::create_mirror_view(dLocalDomains_m)) {
+        , hLocalDomains_m(Kokkos::create_mirror_view(dLocalDomains_m))
+        , nghost_m(1) {
         for (unsigned int d = 0; d < Dim; ++d) {
             minWidth_m[d] = 0;
         }
@@ -45,9 +46,9 @@ namespace ippl {
 
     template <unsigned Dim>
     FieldLayout<Dim>::FieldLayout(mpi::Communicator communicator, const NDIndex<Dim>& domain,
-                                  std::array<bool, Dim> isParallel, bool isAllPeriodic)
+                                  std::array<bool, Dim> isParallel, bool isAllPeriodic, int nghost)
         : FieldLayout(communicator) {
-        initialize(domain, isParallel, isAllPeriodic);
+        initialize(domain, isParallel, isAllPeriodic, nghost);
     }
 
     template <unsigned Dim>
@@ -60,7 +61,7 @@ namespace ippl {
             hLocalDomains_m(i) = domains[i];
         }
 
-        findNeighbors();
+        findNeighbors(nghost_m);
 
         Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m);
 
@@ -69,7 +70,7 @@ namespace ippl {
 
     template <unsigned Dim>
     void FieldLayout<Dim>::initialize(const NDIndex<Dim>& domain, std::array<bool, Dim> isParallel,
-                                      bool isAllPeriodic) {
+                                      bool isAllPeriodic, int nghost) {
         int nRanks = comm.size();
 
         gDomain_m = domain;
@@ -78,11 +79,17 @@ namespace ippl {
 
         isParallelDim_m = isParallel;
 
+        nghost_m = nghost;
+
         if (nRanks < 2) {
             Kokkos::resize(dLocalDomains_m, nRanks);
             Kokkos::resize(hLocalDomains_m, nRanks);
             hLocalDomains_m(0) = domain;
             Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m);
+            // Even on a single rank we must populate minWidth_m so that
+            // getDistribution(d) returns the right answer; without this the
+            // serial-build path silently reports every dim as parallel.
+            calcWidths();
             return;
         }
 
@@ -95,8 +102,11 @@ namespace ippl {
         }
 
         if (totparelems < nRanks) {
-            throw std::runtime_error("FieldLayout:initialize: domain can only be partitioned in to "
-                + std::to_string(totparelems) + " local domains, but there are " + std::to_string(nRanks) + " ranks, decrease the number of ranks or increase the domain.");
+            throw std::runtime_error(
+                "FieldLayout:initialize: domain can only be partitioned in to "
+                + std::to_string(totparelems) + " local domains, but there are "
+                + std::to_string(nRanks)
+                + " ranks, decrease the number of ranks or increase the domain.");
         }
 
         Kokkos::resize(dLocalDomains_m, nRanks);
@@ -105,7 +115,7 @@ namespace ippl {
         detail::Partitioner<Dim> partitioner;
         partitioner.split(domain, hLocalDomains_m, isParallel, nRanks);
 
-        findNeighbors();
+        findNeighbors(nghost);
 
         Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m);
 
@@ -288,7 +298,7 @@ namespace ippl {
             // 0 - touching the lower axis value
             // 1 - touching the upper axis value
             // 2 - parallel to the axis
-            if (intersect[d].length() == 1) {
+            if (intersect[d].length() == static_cast<size_t>(nghost)) {
                 if (gnd[d].first() != intersect[d].first()) {
                     index += digit;
                 }
diff --git a/src/FieldLayout/SubFieldLayout.hpp b/src/FieldLayout/SubFieldLayout.hpp
index 53ee24c11..50a83f9e2 100644
--- a/src/FieldLayout/SubFieldLayout.hpp
+++ b/src/FieldLayout/SubFieldLayout.hpp
@@ -102,7 +102,7 @@ namespace ippl {
             }
         }
 
-        this->findNeighbors();
+        this->findNeighbors(this->nghost_m);
 
         Kokkos::deep_copy(this->dLocalDomains_m, this->hLocalDomains_m);
 
diff --git a/src/Ippl.h b/src/Ippl.h
index 174c72a29..a606c655c 100644
--- a/src/Ippl.h
+++ b/src/Ippl.h
@@ -10,7 +10,6 @@
 #include "Types/IpplTypes.h"
 
 #include "Utility/Inform.h"
-#include "Utility/ParallelDispatch.h"
 
 #include "Communicate/Communicator.h"
 #include "Communicate/Environment.h"
@@ -46,6 +45,8 @@ namespace ippl {
     }  // namespace detail
 }  // namespace ippl
 
+#include "Utility/ParallelDispatch.h"
+
 // FIMXE remove (only for backwards compatibility)
 #include "IpplCore.h"
 
diff --git a/src/LinearSolvers/PCG.h b/src/LinearSolvers/PCG.h
index 3fdf17742..07a29426a 100644
--- a/src/LinearSolvers/PCG.h
+++ b/src/LinearSolvers/PCG.h
@@ -356,6 +356,18 @@ namespace ippl {
                  FieldRHS>()
             , preconditioner_m(nullptr) {};
 
+        /*
+         * Allocate the extra PCG work fields once, together with the CG
+         * work fields r, d, and q.
+         *
+         * s and pcond_out are scratch buffers for M^{-1} r. They intentionally
+         * keep the default NoBcFace boundary conditions. Only the search
+         * direction d gets the physical BCs after the preconditioner result has
+         * been copied into it. If these scratch fields inherited periodic BCs,
+         * assignments inside the preconditioner would call PeriodicFace::apply
+         * and add halo MPI exchanges that did not exist when the preconditioner
+         * returned fresh temporary NoBcFace fields.
+         */
         void initializeFields(mesh_type& mesh, layout_type& layout) override {
             CG<OperatorRet, LowerRet, UpperRet, UpperLowerRet, InverseDiagRet, DiagRet, FieldLHS,
                FieldRHS>::initializeFields(mesh, layout);
@@ -535,14 +547,13 @@ namespace ippl {
         std::unique_ptr<preconditioner<FieldLHS>> preconditioner_m;
 
         /*
-         * Extends the CG workspace with the preconditioner result buffers s and
-         * pcond_out so operator() does not allocate per solve.
-         * Preconditioner result buffers, allocated once via initializeFields()
-         * pcond_out keeps its default NoBcFace BCs: the preconditioner's internal operator
-         * chain therefore never triggers PeriodicFace::apply MPI calls -- if it
-         * did, the global MPI sequence would diverge from the master code path
-         * (where the preconditioner returned a fresh NoBcFace field) and
-         * intermittent multi-rank halo deadlocks would follow.
+         * Persistent preconditioner output buffers. They are allocated in
+         * initializeFields() and reused on every solve to avoid per-iteration
+         * Field allocation.
+         *
+         * These buffers are scratch storage, not physical solution fields, so
+         * they must keep NoBcFace BCs. The physically meaningful BCs are applied
+         * only to d after copying pcond_out into it.
          */
 
         lhs_type s;
diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h
index 3d41b9029..bbdce337a 100644
--- a/src/Particle/ParticleAttrib.h
+++ b/src/Particle/ParticleAttrib.h
@@ -20,8 +20,12 @@
 
 #include "Expression/IpplExpressions.h"
 
+#ifdef IPPL_ENABLE_FFT
+#include "FFT/FFT.h"
+#endif
 #include "Interpolation/CIC.h"
 #include "Particle/ParticleAttribBase.h"
+#include "Particle/SortBuffer.h"
 
 namespace ippl {
 
@@ -58,6 +62,8 @@ namespace ippl {
         // discarded.
         void alloc(size_type) override;
 
+        void reserve(size_type) override;
+
         /*!
          * Particle deletion function. Partition the particles into a valid region
          * and an invalid region.
@@ -73,32 +79,46 @@ namespace ippl {
         void unpack(size_type) override;
 
         void serialize(detail::Archive<memory_space>& ar, size_type nsends) override {
-            ar.serialize(buf_m, nsends);
+            ar.serialize(dview_m, nsends);
+        }
+
+        void serialize(detail::Archive<memory_space>& ar, const hash_type& hash,
+                       size_type nsends) override {
+            ar.serialize(dview_m, hash, nsends);
         }
 
         void deserialize(detail::Archive<memory_space>& ar, size_type nrecvs) override {
             ar.deserialize(buf_m, nrecvs);
         }
 
-        virtual ~ParticleAttrib() = default;
-        
+        void deserialize(detail::Archive<memory_space>& ar, size_type offset,
+                         size_type nrecvs) override {
+            this->reserve(offset + nrecvs);
+            ar.deserialize(dview_m, offset, nrecvs);
+        }
+
+        KOKKOS_INLINE_FUNCTION virtual ~ParticleAttrib() = default;
+
         size_type size() const override { return dview_m.extent(0); }
-        
+
         size_type packedSize(const size_type count) const override {
             return count * sizeof(value_type);
         }
-        
-        void resize(size_type n) { Kokkos::resize(dview_m, n); }
 
         /*!
-         * @brief Reallocate the underlying view with a new size.
+         * @brief Resize the underlying view, preserving existing entries on grow.
          *
-         * This function reallocates the device view to a new size. Existing data is
-         * discarded and should not be relied upon after this call. Note that this function does not
-         * apply overallocation. For use from outside, call `ParticleAttrib::alloc(size_type)`
-         * instead.
+         * Capacity-only operation; does not change the live particle count.
+         * Overallocation is the caller's responsibility (`alloc()` / `create()`
+         * apply it when sizing this view).
+         */
+        void resize(size_type n) { Kokkos::resize(dview_m, n); }
+
+        /*!
+         * @brief Reallocate the underlying view, discarding existing entries.
          *
-         * @param n The new size to allocate in the internal view.
+         * Capacity-only operation. Does not apply the default overallocation
+         * factor — call `alloc()` instead from the outside.
          */
         void realloc(size_type n) { Kokkos::realloc(dview_m, n); }
 
@@ -112,11 +132,22 @@ namespace ippl {
 
         KOKKOS_INLINE_FUNCTION T& operator()(const size_t i) const { return dview_m(i); }
 
-        view_type& getView() { return dview_m; }
-
-        const view_type& getView() const { return dview_m; }
+        /*!
+         * Returns a subview of the underlying storage covering only the live
+         * particle range [0, size()).
+         */
+        view_type getView() {
+            return Kokkos::subview(dview_m,
+                                   Kokkos::make_pair(size_type(0),
+                                                     static_cast<size_type>(*(this->localNum_mp))));
+        }
+        const view_type getView() const {
+            return Kokkos::subview(dview_m,
+                                   Kokkos::make_pair(size_type(0),
+                                                     static_cast<size_type>(*(this->localNum_mp))));
+        }
 
-        host_mirror_type getHostMirror() const { return Kokkos::create_mirror(dview_m); }
+        host_mirror_type getHostMirror() const { return Kokkos::create_mirror(getView()); }
 
         void set_name(const std::string& name_) override {
             size_t len = name_.size();
@@ -227,9 +258,21 @@ namespace ippl {
         void internalCopy(const hash_type& indices) override;
 
     private:
-        view_type dview_m;
-        view_type buf_m;
+        view_type dview_m{"ParticleAttrib::dview", 0};
+        view_type buf_m{"ParticleAttrib::buf", 0};
     };
+
+    namespace detail {
+        template <typename Attrib>
+        struct AttribTraits;
+
+        template <typename T, class... Props>
+        struct AttribTraits<ParticleAttrib<T, Props...>> {
+            using value_type = T;
+            using view_type =
+                std::decay_t<decltype(std::declval<ParticleAttrib<T, Props...>>().getView())>;
+        };
+    }  // namespace detail
 }  // namespace ippl
 
 #include "Particle/ParticleAttrib.hpp"
diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp
index 1422bb9dd..95a4a2321 100644
--- a/src/Particle/ParticleAttrib.hpp
+++ b/src/Particle/ParticleAttrib.hpp
@@ -17,8 +17,15 @@
 
 #include "Communicate/DataTypes.h"
 
+#include "Utility/BufferView.h"
 #include "Utility/IpplTimings.h"
 
+#ifdef IPPL_ENABLE_FFT
+#include "FFT/FFT.h"
+#endif
+#include "Particle/ParticleSort.h"
+#include "Particle/SortBuffer.h"
+
 namespace ippl {
 
     template <typename T, class... Properties>
@@ -42,6 +49,13 @@ namespace ippl {
         this->realloc(n * overalloc);
     }
 
+    template <typename T, class... Properties>
+    void ParticleAttrib<T, Properties...>::reserve(size_type n) {
+        if (this->size() < n) {
+            this->resize(n);
+        }
+    }
+
     template <typename T, class... Properties>
     void ParticleAttrib<T, Properties...>::destroy(const hash_type& deleteIndex,
                                                    const hash_type& keepIndex,
@@ -71,17 +85,12 @@ namespace ippl {
         Kokkos::parallel_for(
             "ParticleAttrib::pack()", policy_type(0, size),
             KOKKOS_LAMBDA(const size_t i) { buf(i) = dview(hash(i)); });
-        Kokkos::fence();
     }
 
     template <typename T, class... Properties>
     void ParticleAttrib<T, Properties...>::unpack(size_type nrecvs) {
-        auto size          = dview_m.extent(0);
         size_type required = *(this->localNum_mp) + nrecvs;
-        if (size < required) {
-            int overalloc = Comm->getDefaultOverallocation();
-            this->resize(required * overalloc);
-        }
+        this->resize(required);
 
         size_type count   = *(this->localNum_mp);
         auto buf = buf_m;
@@ -94,7 +103,6 @@ namespace ippl {
     }
 
     template <typename T, class... Properties>
-    // KOKKOS_INLINE_FUNCTION
     ParticleAttrib<T, Properties...>& ParticleAttrib<T, Properties...>::operator=(T x) {
         auto dview = dview_m;
         using policy_type = Kokkos::RangePolicy<execution_space>;
@@ -106,7 +114,6 @@ namespace ippl {
 
     template <typename T, class... Properties>
     template <typename E, size_t N>
-    // KOKKOS_INLINE_FUNCTION
     ParticleAttrib<T, Properties...>& ParticleAttrib<T, Properties...>::operator=(
         detail::Expression<E, N> const& expr) {
         using capture_type = detail::CapturedExpression<E, N>;
@@ -147,11 +154,11 @@ namespace ippl {
         const NDIndex<Dim>& lDom       = layout.getLocalNDIndex();
         const int nghost               = f.getNghost();
 
-        //using policy_type = Kokkos::RangePolicy<execution_space>;
         const bool useHashView = hash_array.extent(0) > 0;
         if (useHashView && (iteration_policy.end() > hash_array.extent(0))) {
             Inform m("scatter");
-            m << "Hash array was passed to scatter, but size does not match iteration policy." << endl;
+            m << "Hash array was passed to scatter, but size does not match iteration policy."
+              << endl;
             ippl::Comm->abort();
         }
         auto dview = dview_m;
@@ -159,10 +166,8 @@ namespace ippl {
         Kokkos::parallel_for(
             "ParticleAttrib::scatter", iteration_policy,
             KOKKOS_LAMBDA(const size_t idx) {
-                // map index to possible hash_map
                 size_t mapped_idx = useHashView ? hash_array(idx) : idx;
 
-                // find nearest grid point
                 vector_type l                        = (ppview(mapped_idx) - origin) * invdx + 0.5;
                 Vector<int, Field::dim> index        = l;
                 Vector<PositionType, Field::dim> whi = l - index;
@@ -170,7 +175,6 @@ namespace ippl {
 
                 Vector<size_t, Field::dim> args = index - lDom.first() + nghost;
 
-                // scatter
                 const value_type& val = dview(mapped_idx);
                 detail::scatterToField(std::make_index_sequence<1 << Field::dim>{}, view, wlo, whi,
                                        args, val);
@@ -219,7 +223,6 @@ namespace ippl {
         Kokkos::parallel_for(
             "ParticleAttrib::gather", policy_type(0, *(this->localNum_mp)),
             KOKKOS_LAMBDA(const size_t idx) {
-                // find nearest grid point
                 vector_type l                        = (ppview(idx) - origin) * invdx + 0.5;
                 Vector<int, Field::dim> index        = l;
                 Vector<PositionType, Field::dim> whi = l - index;
@@ -227,9 +230,8 @@ namespace ippl {
 
                 Vector<size_t, Field::dim> args = index - lDom.first() + nghost;
 
-                // gather
-                value_type gathered = detail::gatherFromField(std::make_index_sequence<1 << Field::dim>{},
-                                                              view, wlo, whi, args);
+                value_type gathered = detail::gatherFromField(
+                    std::make_index_sequence<1 << Field::dim>{}, view, wlo, whi, args);
                 if (addToAttribute) {
                     dview(idx) += gathered;
                 } else {
@@ -240,10 +242,8 @@ namespace ippl {
     }
 
     template <typename T, class... Properties>
-    void ParticleAttrib<T, Properties...>::applyPermutation(
-        const hash_type& permutation) {
-
-        const auto view = this->getView();
+    void ParticleAttrib<T, Properties...>::applyPermutation(const hash_type& permutation) {
+        const auto view = this->getView();  // trimmed to localNum_mp
         const auto size = this->getParticleCount();
 
         view_type temp("copy", size);
@@ -255,31 +255,28 @@ namespace ippl {
 
         Kokkos::fence();
 
-        Kokkos::deep_copy(Kokkos::subview(view, Kokkos::make_pair<size_type, size_type>(0, size)), temp);
+        Kokkos::deep_copy(view, temp);
     }
 
-    template<typename T, class... Properties>
-    void ParticleAttrib<T, Properties...>::internalCopy(
-        const hash_type &indices) {
+    template <typename T, class... Properties>
+    void ParticleAttrib<T, Properties...>::internalCopy(const hash_type& indices) {
         auto copySize = indices.size();
-        create(copySize);
 
-        auto view = this->getView();
-        const auto size = this->getParticleCount();
+        // Snapshot the current count BEFORE create() increments localNum_mp.
+        const size_type oldSize = *(this->localNum_mp);
 
-        using policy_type = Kokkos::RangePolicy<execution_space>;
+        create(copySize);  // localNum_mp becomes oldSize + copySize
+
+        auto view = this->getView();
         Kokkos::parallel_for(
-            "Copy to temp", policy_type(0, copySize),
-            KOKKOS_LAMBDA(const size_type &i) {
-            view(size + i) = view(i);
-        });
+            "internalCopy", Kokkos::RangePolicy<execution_space>(0, copySize),
+            KOKKOS_LAMBDA(const size_type& i) { view(oldSize + i) = view(indices(i)); });
 
         Kokkos::fence();
     }
 
     /*
-     * Non-class function
-     *
+     * Non-class functions
      */
 
     /**
@@ -287,7 +284,7 @@ namespace ippl {
      *
      * This overload preserves legacy functionality by providing a default iteration policy.
      * It calls the member scatter() with a default Kokkos::RangePolicy.
-     * 
+     *
      * @note The default behaviour is to scatter all particles without any custom index mapping.
      *
      * @tparam Attrib1 The type of the particle attribute.
@@ -298,19 +295,21 @@ namespace ippl {
      * @param f The field onto which the data is scattered.
      * @param pp The ParticleAttrib representing particle positions.
      */
-    template <typename Attrib1, typename Field, typename Attrib2, 
-                typename policy_type = Kokkos::RangePolicy<typename Field::execution_space>>
+    template <typename Attrib1, typename Field, typename Attrib2,
+              typename policy_type = Kokkos::RangePolicy<typename Field::execution_space>>
     inline void scatter(const Attrib1& attrib, Field& f, const Attrib2& pp) {
-        attrib.scatter(f, pp, policy_type(0, attrib.getParticleCount())); 
+        attrib.scatter(f, pp, policy_type(0, attrib.getParticleCount()));
     }
 
     /**
-     * @brief Non-class interface for scattering with a custom iteration policy and optional index array.
+     * @brief Non-class interface for scattering with a custom iteration policy and optional index
+     * array.
      *
      * This overload allows the caller to specify a custom `Kokkos::range_policy` and an optional
      * `ippl::hash_type` array. It forwards the parameters to the member scatter() function.
-     * 
-     * @note See ParticleAttrib::scatter() for more information on the custom iteration functionality.
+     *
+     * @note See ParticleAttrib::scatter() for more information on the custom iteration
+     * functionality.
      *
      * @tparam Attrib1 The type of the particle attribute.
      * @tparam Field The type of the field.
@@ -322,33 +321,34 @@ namespace ippl {
      * @param iteration_policy A custom `Kokkos::range_policy` defining the iteration range.
      * @param hash_array An optional `ippl::hash_type` array for index mapping.
      */
-    template <typename Attrib1, typename Field, typename Attrib2, 
-                typename policy_type = Kokkos::RangePolicy<typename Field::execution_space>>
-    inline void scatter(const Attrib1& attrib, Field& f, const Attrib2& pp, 
-                        policy_type iteration_policy, typename Attrib1::hash_type hash_array = {}) {
+    template <typename Attrib1, typename Field, typename Attrib2,
+              typename policy_type = Kokkos::RangePolicy<typename Field::execution_space>>
+    inline void scatter(const Attrib1& attrib, Field& f, const Attrib2& pp,
+                        policy_type iteration_policy,
+                        typename Attrib1::hash_type hash_array = {}) {
         attrib.scatter(f, pp, iteration_policy, hash_array);
     }
 
     /**
      * @brief Non-class interface for gathering field data into a particle attribute.
-     * 
+     *
      * This interface calls the member ParticleAttrib::gather() function with the provided
      * parameters and preserving legacy behavior by assigning `addToAttribute` a default value.
-     * 
+     *
      * @note See ParticleAttrib::gather() for more information on the behavior of `addToAttribute`.
-     * 
+     *
      * @tparam Attrib1 The type of the particle attribute.
      * @tparam Field The type of the field.
      * @tparam Attrib2 The type of the particle position attribute.
      * @param attrib The particle attribute to gather data into.
      * @param f The field from which data is gathered.
      * @param pp The ParticleAttrib representing particle positions.
-     * @param addToAttribute If true, the gathered field value is added to the current attribute value;
-     *                       otherwise, the attribute value is overwritten.
+     * @param addToAttribute If true, the gathered field value is added to the current attribute
+     * value; otherwise, the attribute value is overwritten.
      */
     template <typename Attrib1, typename Field, typename Attrib2>
-    inline void gather(Attrib1& attrib, Field& f, const Attrib2& pp, 
-                        const bool addToAttribute = false) {
+    inline void gather(Attrib1& attrib, Field& f, const Attrib2& pp,
+                       const bool addToAttribute = false) {
         attrib.gather(f, pp, addToAttribute);
     }
 
@@ -374,4 +374,5 @@ namespace ippl {
     DefineParticleReduction(Max, max, if (myVal > valL) valL = myVal, std::greater)
     DefineParticleReduction(Min, min, if (myVal < valL) valL = myVal, std::less)
     DefineParticleReduction(Prod, prod, valL *= myVal, std::multiplies)
+
 }  // namespace ippl
diff --git a/src/Particle/ParticleAttribBase.h b/src/Particle/ParticleAttribBase.h
index 4dea4ac8e..3e9baa197 100644
--- a/src/Particle/ParticleAttribBase.h
+++ b/src/Particle/ParticleAttribBase.h
@@ -66,6 +66,10 @@ namespace ippl {
             // prior entries survive a capacity grow.
             virtual void create(size_type, bool non_destructive = false) = 0;
 
+            // Grow internal capacity to at least N particles while preserving existing
+            // entries. Does not shrink and does not touch the logical particle count.
+            virtual void reserve(size_type) = 0;
+
             virtual void destroy(const hash_type&, const hash_type&, size_type) = 0;
             virtual size_type packedSize(const size_type) const                 = 0;
 
@@ -73,19 +77,22 @@ namespace ippl {
 
             virtual void unpack(size_type) = 0;
 
-            virtual void serialize(Archive<memory_space>& ar, size_type nsends) = 0;
-
+            virtual void serialize(Archive<memory_space>& ar, size_type nsends)   = 0;
+            virtual void serialize(detail::Archive<memory_space>& ar, const hash_type& hash,
+                                   size_type nsends)                              = 0;
             virtual void deserialize(Archive<memory_space>& ar, size_type nrecvs) = 0;
+            virtual void deserialize(detail::Archive<memory_space>& ar, size_type offset,
+                                     size_type nrecvs)                            = 0;
 
             virtual size_type size() const = 0;
 
-            virtual ~ParticleAttribBase() = default;
+            KOKKOS_INLINE_FUNCTION virtual ~ParticleAttribBase() = default;
 
             void setParticleCount(size_type& num) { localNum_mp = &num; }
             size_type getParticleCount() const { return *localNum_mp; }
 
             virtual void applyPermutation(const hash_type&) = 0;
-            virtual void internalCopy(const hash_type&) = 0;
+            virtual void internalCopy(const hash_type&)     = 0;
 
         protected:
             const size_type* localNum_mp;
diff --git a/src/Particle/ParticleBase.h b/src/Particle/ParticleBase.h
index 9fbed2faa..9793da2a9 100644
--- a/src/Particle/ParticleBase.h
+++ b/src/Particle/ParticleBase.h
@@ -299,9 +299,18 @@ namespace ippl {
         /* This function does not alter the totalNum_m member function. It should only be called
          * during the update function where we know the number of particles remains the same.
          */
+        template <typename memory_space, typename execution_space, typename F,
+                  typename... Properties>
+        void internalDestroy(const F& invalid_functor, const size_type destroyNum);
         template <typename... Properties>
         void internalDestroy(const Kokkos::View<bool*, Properties...>& invalid,
-                             const size_type destroyNum);
+                             const size_type destroyNum) {
+            using view_type       = Kokkos::View<bool*, Properties...>;
+            using memory_space    = typename view_type::memory_space;
+            using execution_space = typename view_type::execution_space;
+            internalDestroy<memory_space, execution_space>(
+                KOKKOS_LAMBDA(size_t i) { return invalid(i); }, destroyNum);
+        }
 
         /*!
          * Sends particles to another rank
@@ -316,6 +325,9 @@ namespace ippl {
         void sendToRank(int rank, int tag, std::vector<MPI_Request>& requests,
                         const HashType& hash);
 
+        template <typename HashType>
+        MPI_Request sendToRank(int rank, int tag, const HashType& hash);
+
         /*!
          * Receives particles from another rank
          * @param rank the source rank
@@ -324,6 +336,9 @@ namespace ippl {
          */
         void recvFromRank(int rank, int tag, size_type nRecvs);
 
+        std::pair<MPI_Request, std::function<void(size_type)>> postRecvFromRank(int rank, int tag,
+                                                                                size_type nRecvs);
+
         /*!
          * Serialize to do MPI calls.
          * @param ar archive
diff --git a/src/Particle/ParticleBase.hpp b/src/Particle/ParticleBase.hpp
index a71a917d6..fbbe72638 100644
--- a/src/Particle/ParticleBase.hpp
+++ b/src/Particle/ParticleBase.hpp
@@ -62,8 +62,6 @@ namespace ippl {
             addAttribute(ID);
         }
         addAttribute(R);
-        ID.set_name("ID");
-        R.set_name("position");
     }
 
     template <class PLayout, typename... IP>
@@ -175,9 +173,9 @@ namespace ippl {
     }
 
     template <class PLayout, typename... IP>
-    template <typename... Properties>
-    void ParticleBase<PLayout, IP...>::internalDestroy(
-        const Kokkos::View<bool*, Properties...>& invalid, const size_type destroyNum) {
+    template <typename memory_space, typename execution_space, typename F, typename... Properties>
+    void ParticleBase<PLayout, IP...>::internalDestroy(const F& invalid_functor,
+                                                       const size_type destroyNum) {
         PAssert(destroyNum <= localNum_m);
 
         // If there aren't any particles to delete, do nothing
@@ -193,19 +191,18 @@ namespace ippl {
             return;
         }
 
-        using view_type       = Kokkos::View<bool*, Properties...>;
-        using memory_space    = typename view_type::memory_space;
-        using execution_space = typename view_type::execution_space;
-        using policy_type     = Kokkos::RangePolicy<execution_space>;
-        auto& locDeleteIndex  = deleteIndex_m.get<memory_space>();
-        auto& locKeepIndex    = keepIndex_m.get<memory_space>();
+        using policy_type    = Kokkos::RangePolicy<execution_space>;
+        auto& locDeleteIndex = deleteIndex_m.get<memory_space>();
+        auto& locKeepIndex   = keepIndex_m.get<memory_space>();
 
-        // Resize buffers, if necessary
+        // Resize buffers per-memory-space. The previous code mistakenly used
+        // the outer `memory_space` template parameter inside the per-space
+        // lambda, leaving every other space's del/keep buffers undersized.
         detail::runForAllSpaces([&]<typename MemorySpace>() {
-            if (attributes_m.template get<memory_space>().size() > 0) {
+            if (attributes_m.template get<MemorySpace>().size() > 0) {
                 int overalloc = Comm->getDefaultOverallocation();
-                auto& del     = deleteIndex_m.get<memory_space>();
-                auto& keep    = keepIndex_m.get<memory_space>();
+                auto& del     = deleteIndex_m.template get<MemorySpace>();
+                auto& keep    = keepIndex_m.template get<MemorySpace>();
                 if (del.size() < destroyNum) {
                     Kokkos::realloc(del, destroyNum * overalloc);
                     Kokkos::realloc(keep, destroyNum * overalloc);
@@ -220,10 +217,10 @@ namespace ippl {
         Kokkos::parallel_scan(
             "Scan in ParticleBase::destroy()", policy_type(0, localNum_m - destroyNum),
             KOKKOS_LAMBDA(const size_t i, int& idx, const bool final) {
-                if (final && invalid(i)) {
+                if (final && invalid_functor(i)) {
                     locDeleteIndex(idx) = i;
                 }
-                if (invalid(i)) {
+                if (invalid_functor(i)) {
                     idx += 1;
                 }
             });
@@ -245,10 +242,10 @@ namespace ippl {
             "Second scan in ParticleBase::destroy()",
             Kokkos::RangePolicy<size_type, execution_space>(localNum_m - destroyNum, localNum_m),
             KOKKOS_LAMBDA(const size_t i, int& idx, const bool final) {
-                if (final && !invalid(i)) {
+                if (final && !invalid_functor(i)) {
                     locKeepIndex(idx) = i;
                 }
-                if (!invalid(i)) {
+                if (!invalid_functor(i)) {
                     idx += 1;
                 }
             });
@@ -280,27 +277,39 @@ namespace ippl {
 
     template <class PLayout, typename... IP>
     template <typename HashType>
-    void ParticleBase<PLayout, IP...>::sendToRank(int rank, int tag,
-                                                  std::vector<MPI_Request>& requests,
-                                                  const HashType& hash) {
-        size_type nSends = hash.size();
-        requests.resize(requests.size() + 1);
+    MPI_Request ParticleBase<PLayout, IP...>::sendToRank(int rank, int tag, const HashType& hash) {
+        size_type nSends    = hash.size();
+        MPI_Request request = MPI_REQUEST_NULL;
 
         auto hashes = hash_container_type(hash, [&]<typename MemorySpace>() {
             return attributes_m.template get<MemorySpace>().size() > 0;
         });
-        pack(hashes);
         detail::runForAllSpaces([&]<typename MemorySpace>() {
             size_type bufSize = packedSize<MemorySpace>(nSends);
             if (bufSize == 0) {
                 return;
             }
-
             auto buf = Comm->getBuffer<MemorySpace>(bufSize);
 
-            Comm->isend(rank, tag++, *this, *buf, requests.back(), nSends);
+            forAllAttributes<MemorySpace>([&]<typename Attribute>(Attribute& att) {
+                att->serialize(*buf, hashes.template get<MemorySpace>(), nSends);
+            });
+
+            Comm->isend(rank, tag++, *buf, request);
             buf->resetWritePos();
         });
+        return request;
+    }
+
+    template <class PLayout, typename... IP>
+    template <typename HashType>
+    void ParticleBase<PLayout, IP...>::sendToRank(int rank, int tag,
+                                                  std::vector<MPI_Request>& requests,
+                                                  const HashType& hash) {
+        auto hashes = hash_container_type(hash, [&]<typename MemorySpace>() {
+            return attributes_m.template get<MemorySpace>().size() > 0;
+        });
+        requests.push_back(sendToRank(rank, tag, hash));
     }
 
     template <class PLayout, typename... IP>
@@ -313,10 +322,46 @@ namespace ippl {
 
             auto buf = Comm->getBuffer<MemorySpace>(bufSize);
 
-            Comm->recv(rank, tag++, *this, *buf, bufSize, nRecvs);
+            Comm->recv(rank, tag++, *buf, bufSize);
+            forAllAttributes<MemorySpace>([&]<typename Attribute>(Attribute& att) {
+                att->deserialize(*buf, localNum_m, nRecvs);
+            });
+
             buf->resetReadPos();
         });
-        unpack(nRecvs);
+        localNum_m += nRecvs;
+    }
+
+    template <class PLayout, typename... IP>
+    std::pair<MPI_Request, std::function<void(size_t)>>
+    ParticleBase<PLayout, IP...>::postRecvFromRank(int rank, int tag, size_type nRecvs) {
+        MPI_Request request = MPI_REQUEST_NULL;
+
+        // Collect (buf, nRecvs) per memory space for deferred deserialization
+        auto deferred = std::make_shared<std::vector<std::function<void(size_type)>>>();
+
+        detail::runForAllSpaces([&]<typename MemorySpace>() {
+            size_type bufSize = packedSize<MemorySpace>(nRecvs);
+            if (bufSize == 0)
+                return;
+
+            auto buf = Comm->getBuffer<MemorySpace>(bufSize);
+            Comm->irecv(rank, tag++, *buf, request, bufSize);
+
+            deferred->push_back([this, buf, nRecvs](size_type offset) {
+                forAllAttributes<MemorySpace>([&]<typename Attribute>(Attribute& att) {
+                    att->deserialize(*buf, offset, nRecvs);
+                });
+                buf->resetReadPos();
+            });
+        });
+
+        // Finalize takes the offset (localNum_m after destruction) and advances it
+        return {request, [this, deferred, nRecvs](size_type offset) {
+                    for (auto& fn : *deferred)
+                        fn(offset);
+                    localNum_m += nRecvs;
+                }};
     }
 
     template <class PLayout, typename... IP>
diff --git a/src/Particle/ParticleSort.h b/src/Particle/ParticleSort.h
new file mode 100644
index 000000000..f5eaa3336
--- /dev/null
+++ b/src/Particle/ParticleSort.h
@@ -0,0 +1,331 @@
+#ifndef IPPL_PARTICLE_SORT_H
+#define IPPL_PARTICLE_SORT_H
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_Sort.hpp>
+#include <algorithm>
+#include <cstdint>
+#include <stdexcept>
+#include <vector>
+
+#include "Particle/SortBuffer.h"
+
+#ifdef KOKKOS_ENABLE_CUDA
+#include <cub/device/device_radix_sort.cuh>
+#endif
+
+#ifdef KOKKOS_ENABLE_HIP
+#include <rocprim/rocprim.hpp>
+#endif
+
+namespace ippl {
+    namespace detail {
+
+        /**
+         * @brief Validate that all permutation indices are in bounds
+         */
+        template <typename ExecSpace, typename IndexView>
+        void validatePermutation(const IndexView& permute, size_t n,
+                                 const char* label = "sortParticles") {
+            using size_type = size_t;
+
+            size_type num_invalid = 0;
+            Kokkos::parallel_reduce(
+                label, Kokkos::RangePolicy<ExecSpace>(0, n),
+                KOKKOS_LAMBDA(size_type i, size_type & count) {
+                    if (permute(i) >= n)
+                        count++;
+                },
+                num_invalid);
+            Kokkos::fence();
+
+            if (num_invalid > 0) {
+                size_type first_invalid_idx = n;
+                Kokkos::parallel_reduce(
+                    "find_first_invalid", Kokkos::RangePolicy<ExecSpace>(0, n),
+                    KOKKOS_LAMBDA(size_type i, size_type & first_idx) {
+                        if (permute(i) >= n && i < first_idx)
+                            first_idx = i;
+                    },
+                    Kokkos::Min<size_type>(first_invalid_idx));
+                Kokkos::fence();
+
+                auto permute_host =
+                    Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, permute);
+
+                throw std::runtime_error(std::string(label) + ": Found "
+                                         + std::to_string(num_invalid)
+                                         + " invalid permutation indices. First invalid: permute["
+                                         + std::to_string(first_invalid_idx)
+                                         + "] = " + std::to_string(permute_host(first_invalid_idx))
+                                         + " (n = " + std::to_string(n) + ")");
+            }
+        }
+
+        /**
+         * @brief Compute Morton code (Z-order curve) for spatial sorting
+         */
+        template <unsigned Dim, typename T, typename IndexType = size_t>
+        KOKKOS_INLINE_FUNCTION uint64_t computeMortonCode(const Vector<T, Dim>& position,
+                                                          const Vector<T, Dim>& origin,
+                                                          const Vector<T, Dim>& invdx,
+                                                          const Vector<IndexType, Dim>& ngrid) {
+            uint64_t morton            = 0;
+            constexpr int bits_per_dim = 21;  // 21*3 = 63 bits, safe for 3D
+
+            uint32_t gridIndices[Dim];
+            for (unsigned d = 0; d < Dim; ++d) {
+                T sx = (position[d] - origin[d]) * invdx[d];
+                sx -= ngrid[d] * Kokkos::floor(sx / ngrid[d]);  // wrap to [0, ngrid)
+                gridIndices[d] = static_cast<uint32_t>(sx);
+            }
+
+            for (int bit = 0; bit < bits_per_dim; ++bit) {
+                for (unsigned d = 0; d < Dim; ++d) {
+                    if (gridIndices[d] & (1u << bit)) {
+                        morton |= (uint64_t(1) << (bit * Dim + d));
+                    }
+                }
+            }
+            return morton;
+        }
+
+        /**
+         * @brief Functor to compute Morton codes for all particles
+         */
+        template <unsigned Dim, typename PositionView, typename T, typename KeyView>
+        struct ComputeMortonCodesFunctor {
+            using size_type = size_t;
+
+            PositionView positions;
+            KeyView keys;
+            Vector<T, Dim> origin;
+            Vector<T, Dim> invdx;
+            Vector<size_type, Dim> ngrid;
+
+            KOKKOS_INLINE_FUNCTION void operator()(size_type i) const {
+                keys(i) = computeMortonCode<Dim, T, size_type>(positions(i), origin, invdx, ngrid);
+            }
+        };
+
+        // -------------------------------------------------------------------
+        // Platform-specific sort implementations
+        // -------------------------------------------------------------------
+
+        /**
+         * @brief Sort particles on host using std::sort on (morton, index) pairs
+         *
+         * @return Subview of the buffered permute array, valid until next ensureCapacity
+         */
+        template <unsigned Dim, typename T>
+        Kokkos::View<size_t*, Kokkos::HostSpace> sortParticlesHost(
+            Kokkos::View<Vector<T, Dim>*, Kokkos::HostSpace> positions,
+            const Vector<T, Dim>& origin, const Vector<T, Dim>& invdx,
+            const Vector<size_t, Dim>& ngrid, size_t n) {
+            auto& bufs = ippl::detail::getDefaultBinSortBuffers<Kokkos::HostSpace>();
+            bufs.ensureCapacity(n, /*n_bins_p1=*/1);
+
+            auto& permute = bufs.permute();
+
+            // Build (key, index) pairs, sort, then extract permutation
+            std::vector<std::pair<uint64_t, size_t>> pairs(n);
+            for (size_t i = 0; i < n; ++i) {
+                pairs[i] = {computeMortonCode<Dim, T, size_t>(positions(i), origin, invdx, ngrid),
+                            i};
+            }
+            std::sort(pairs.begin(), pairs.end());  // pair has lexicographic < by .first
+
+            for (size_t i = 0; i < n; ++i) {
+                permute(i) = pairs[i].second;
+            }
+
+            return permute;
+        }
+
+#ifdef KOKKOS_ENABLE_CUDA
+        /**
+         * @brief Sort particles on CUDA using CUB DeviceRadixSort with buffered temporaries
+         *
+         * @return Subview of the buffered permOut array containing sorted particle indices
+         */
+        template <unsigned Dim, typename T>
+        Kokkos::View<size_t*, Kokkos::CudaSpace> sortParticlesCuda(
+            Kokkos::View<Vector<T, Dim>*, Kokkos::CudaSpace> positions,
+            const Vector<T, Dim>& origin, const Vector<T, Dim>& invdx,
+            const Vector<size_t, Dim>& ngrid, size_t n) {
+            using size_type = size_t;
+            auto& bufs      = ippl::detail::getDefaultBinSortBuffers<Kokkos::CudaSpace>();
+            bufs.ensureCapacity(n, /*n_bins_p1=*/1);
+
+            // Step 1: compute Morton codes into binKeys, init permute to identity
+            auto keys    = bufs.binKeys();
+            auto permute = bufs.permute();
+
+            Kokkos::parallel_for(
+                "MortonSort::ComputeKeys", Kokkos::RangePolicy<Kokkos::Cuda>(0, n),
+                ComputeMortonCodesFunctor<Dim, decltype(positions), T, decltype(keys)>{
+                    positions, keys, origin, invdx, ngrid});
+
+            Kokkos::parallel_for(
+                "MortonSort::InitPermute", Kokkos::RangePolicy<Kokkos::Cuda>(0, n),
+                KOKKOS_LAMBDA(size_type i) { permute(i) = i; });
+            Kokkos::fence();
+
+            // Step 2: query temp storage, grow buffer if needed, then sort
+            auto keys_out = bufs.keysOut();
+            auto perm_out = bufs.permOut();
+
+            size_t temp_bytes = 0;
+            cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, keys.data(), keys_out.data(),
+                                            permute.data(), perm_out.data(), static_cast<int>(n));
+
+            bufs.ensureTempStorage(temp_bytes);
+
+            cub::DeviceRadixSort::SortPairs(bufs.tempStorage().data(), temp_bytes, keys.data(),
+                                            keys_out.data(), permute.data(), perm_out.data(),
+                                            static_cast<int>(n));
+            Kokkos::fence();
+
+            return bufs.permOut();
+        }
+#endif
+
+#ifdef KOKKOS_ENABLE_HIP
+        /**
+         * @brief Sort particles on HIP using rocPRIM radix_sort_pairs with buffered temporaries
+         *
+         * @return Subview of the buffered permOut array containing sorted particle indices
+         */
+        template <unsigned Dim, typename T>
+        Kokkos::View<size_t*, Kokkos::HIPSpace> sortParticlesHip(
+            Kokkos::View<Vector<T, Dim>*, Kokkos::HIPSpace> positions, const Vector<T, Dim>& origin,
+            const Vector<T, Dim>& invdx, const Vector<size_t, Dim>& ngrid, size_t n) {
+            using size_type = size_t;
+            auto& bufs      = ippl::detail::getDefaultBinSortBuffers<Kokkos::HIPSpace>();
+            bufs.ensureCapacity(n, /*n_bins_p1=*/1);
+
+            auto keys    = bufs.binKeys();
+            auto permute = bufs.permute();
+
+            Kokkos::parallel_for(
+                "MortonSort::ComputeKeys", Kokkos::RangePolicy<Kokkos::HIP>(0, n),
+                ComputeMortonCodesFunctor<Dim, decltype(positions), T, decltype(keys)>{
+                    positions, keys, origin, invdx, ngrid});
+
+            Kokkos::parallel_for(
+                "MortonSort::InitPermute", Kokkos::RangePolicy<Kokkos::HIP>(0, n),
+                KOKKOS_LAMBDA(size_type i) { permute(i) = i; });
+            Kokkos::fence();
+
+            auto keys_out = bufs.keysOut();
+            auto perm_out = bufs.permOut();
+
+            size_t temp_bytes = 0;
+            std::ignore = rocprim::radix_sort_pairs(nullptr, temp_bytes, keys.data(), keys_out.data(),
+                                      permute.data(), perm_out.data(), n, 0, sizeof(uint64_t) * 8,
+                                      Kokkos::HIP().hip_stream());
+
+            bufs.ensureTempStorage(temp_bytes);
+
+            std::ignore = rocprim::radix_sort_pairs(bufs.tempStorage().data(), temp_bytes, keys.data(),
+                                      keys_out.data(), permute.data(), perm_out.data(), n, 0,
+                                      sizeof(uint64_t) * 8, Kokkos::HIP().hip_stream());
+            Kokkos::fence();
+
+            return bufs.permOut();
+        }
+#endif
+
+        /**
+         * @brief Generic sort dispatcher — selects CUDA/HIP/host implementation
+         */
+        template <unsigned Dim, typename ExecSpace, typename T>
+        Kokkos::View<size_t*, typename ExecSpace::memory_space> sortParticles(
+            Kokkos::View<Vector<T, Dim>*, typename ExecSpace::memory_space> positions,
+            const Vector<T, Dim>& origin, const Vector<T, Dim>& invdx,
+            const Vector<size_t, Dim>& ngrid, size_t n) {
+            using memory_space = typename ExecSpace::memory_space;
+
+#ifdef KOKKOS_ENABLE_CUDA
+            if constexpr (std::is_same_v<ExecSpace, Kokkos::Cuda>) {
+                return sortParticlesCuda<Dim, T>(positions, origin, invdx, ngrid, n);
+            }
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+            if constexpr (std::is_same_v<ExecSpace, Kokkos::HIP>) {
+                return sortParticlesHip<Dim, T>(positions, origin, invdx, ngrid, n);
+            }
+#endif
+            if constexpr (std::is_same_v<memory_space, Kokkos::HostSpace>) {
+                return sortParticlesHost<Dim, T>(positions, origin, invdx, ngrid, n);
+            } else {
+                // Generic fallback: sort on host, copy result to device buffer
+                auto positions_host =
+                    Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, positions);
+
+                Vector<size_t, Dim> ngrid_host;
+                for (unsigned d = 0; d < Dim; ++d)
+                    ngrid_host[d] = ngrid[d];
+
+                auto permute_host =
+                    sortParticlesHost<Dim, T>(positions_host, origin, invdx, ngrid_host, n);
+
+                auto& bufs = ippl::detail::getDefaultBinSortBuffers<memory_space>();
+                bufs.ensureCapacity(n, /*n_bins_p1=*/1);
+
+                Kokkos::deep_copy(Kokkos::subview(bufs.permute(), std::make_pair(size_t(0), n)),
+                                  Kokkos::subview(permute_host, std::make_pair(size_t(0), n)));
+
+                return bufs.permute();
+            }
+        }
+
+        // -------------------------------------------------------------------
+        // Permutation application helpers
+        // -------------------------------------------------------------------
+
+        template <typename SrcView, typename DstView, typename IndexView>
+        struct ApplyPermutationFunctor {
+            SrcView src;
+            DstView dst;
+            IndexView permute;
+            KOKKOS_INLINE_FUNCTION void operator()(size_t i) const { dst(i) = src(permute(i)); }
+        };
+
+        template <typename SrcView, typename DstView, typename IndexView>
+        struct ApplyInversePermutationFunctor {
+            SrcView src;
+            DstView dst;
+            IndexView permute;
+            KOKKOS_INLINE_FUNCTION void operator()(size_t i) const { dst(permute(i)) = src(i); }
+        };
+
+        /**
+         * @brief Reorder data in-place according to permutation
+         */
+        template <typename ExecSpace, typename DataView, typename IndexView>
+        void applyPermutation(DataView& data, const IndexView& permute, size_t n) {
+            DataView temp("temp", n);
+            Kokkos::parallel_for(
+                "apply_permutation", Kokkos::RangePolicy<ExecSpace>(0, n),
+                ApplyPermutationFunctor<DataView, DataView, IndexView>{data, temp, permute});
+            Kokkos::deep_copy(Kokkos::subview(data, Kokkos::make_pair(size_t(0), n)), temp);
+        }
+
+        /**
+         * @brief Scatter src → dst using inverse permutation (dst[permute[i]] = src[i])
+         */
+        template <typename ExecSpace, typename DataView, typename IndexView>
+        void applyInversePermutation(const DataView& src, DataView& dst, const IndexView& permute,
+                                     size_t n) {
+            Kokkos::parallel_for(
+                "apply_inverse_permutation", Kokkos::RangePolicy<ExecSpace>(0, n),
+                ApplyInversePermutationFunctor<DataView, DataView, IndexView>{src, dst, permute});
+            Kokkos::fence();
+        }
+
+    }  // namespace detail
+}  // namespace ippl
+
+#endif  // IPPL_PARTICLE_SORT_H
\ No newline at end of file
diff --git a/src/Particle/ParticleSpatialLayout.h b/src/Particle/ParticleSpatialLayout.h
index 568b30753..ed596e180 100644
--- a/src/Particle/ParticleSpatialLayout.h
+++ b/src/Particle/ParticleSpatialLayout.h
@@ -35,6 +35,19 @@
 
 namespace ippl {
 
+    // Controls how the send-count exchange is performed before the particle
+    // data transfer. All paths assume GPU-aware MPI (device pointers passed
+    // directly to Isend/Irecv/Alltoall).
+    enum class CountExchange {
+        // One-sided RMA: each sender writes its count directly into the
+        // receiver's window.
+        RMA,
+        // Two-sided point-to-point: per-rank Isend/Irecv over device pointers.
+        P2P,
+        // Collective Alltoall over device pointers.
+        Alltoall
+    };
+
     /*!
      * ParticleSpatialLayout class definition.
      * @tparam T value type
@@ -59,9 +72,9 @@ namespace ippl {
 
         using size_type = detail::size_type;
 
-    public:
         // constructor: this one also takes a Mesh
-        ParticleSpatialLayout(FieldLayout<Dim>&, Mesh&, bool fem = false);
+        ParticleSpatialLayout(FieldLayout<Dim>&, Mesh&, bool fem = false,
+                              CountExchange mode = CountExchange::RMA);
 
         ParticleSpatialLayout()
             : detail::ParticleLayout<T, Dim, PositionProperties...>() {}
@@ -73,21 +86,33 @@ namespace ippl {
         template <class ParticleContainer>
         void update(ParticleContainer& pc);
 
-        const RegionLayout_t& getRegionLayout() const { return rlayout_m; }
+        const RegionLayout_t& getRegionLayout() const { return *rlayout_m; }
 
     protected:
         //! The RegionLayout which determines where our particles go.
-        RegionLayout_t rlayout_m;
+        std::shared_ptr<RegionLayout_t> rlayout_m;
 
         //! The FieldLayout containing information on nearest neighbors
         FieldLayout_t& flayout_m;
 
+        //! How counts are exchanged
+        CountExchange countExchangeMode_;
+
+        //
+        // RMA Path
+        //
+
         // Vector keeping track of the recieves from all ranks
         std::vector<size_type> nRecvs_m;
 
         // MPI RMA window for one-sided communication
         mpi::rma::Window<mpi::rma::Active> window_m;
 
+        //
+        // P2P GPU Path
+        //
+        locate_type recvCounts_d_;  // [nranks]
+
         //! Type of the Kokkos view containing the local regions.
         using region_view_type = typename RegionLayout_t::view_type;
         //! Type of a single Region object.
@@ -99,6 +124,10 @@ namespace ippl {
         KOKKOS_INLINE_FUNCTION constexpr static bool positionInRegion(
             const std::index_sequence<Idx...>&, const vector_type& pos, const region_type& region);
 
+        template <size_t... Idx>
+        KOKKOS_INLINE_FUNCTION constexpr static bool positionInRegionInclusive(
+            const std::index_sequence<Idx...>&, const vector_type& pos, const region_type& region);
+
         /*!
          * Evaluates the total number of MPI ranks sharing the spatial nearest neighbors.
          * @param neighbors structure containing, for every spatial direction, a list of
@@ -109,35 +138,59 @@ namespace ippl {
 
     public:
         /*!
-         * For each particle in the bunch, determine the rank on which it should
-         * be stored based on its location
-         * @tparam ParticleContainer the particle container type
-         * @param pc the particle container
-         * @param ranks the integer view in which to store the destination ranks
-         * @param invalid the boolean view in which to store whether each particle
-         * is invalidated, i.e. needs to be sent to another rank
-         * @param nSends_dview the view storing per-rank send counts
-         * @param sends_dview the view storing per-rank send offsets
-         * @return The total number of invalidated particles
+         * For each local particle, determine its destination rank, write the
+         * indices of leaving particles into `sendIds_d_`, populate the
+         * per-rank send-count and offset buffers, build a compacted list of
+         * destination ranks, and mark each particle's "is leaving" status in
+         * `leaving_d_` so the destroy pass doesn't have to recompute it.
+         * @return number of leaving particles on this rank
          */
         template <typename ParticleContainer>
-        std::pair<size_type, size_type> locateParticles(const ParticleContainer& pc,
-                                                        locate_type& ranks, bool_type& invalid,
-                                                        locate_type& nSends_dview,
-                                                        locate_type& sends_dview) const;
-
-        /*!
-         * @param rank we sent to
-         * @param ranks a container specifying where a particle at the i-th index should go.
-         * @param hash a mapping to fill the send buffer contiguously
-         */
-        void fillHash(int rank, const locate_type& ranks, hash_type& hash);
-
-        /*!
-         * @param rank we sent to
-         * @param ranks a container specifying where a particle at the i-th index should go.
-         */
-        size_t numberOfSends(int rank, const locate_type& ranks);
+        size_t locateParticlesPacked(const ParticleContainer& pc);
+
+    private:
+        // Fixed-size scratch
+        locate_type rankSendCount_d_;  // [nRanks]
+        locate_type sendOffsets_d_;    // [nRanks+1]
+        hash_type sendIds_d_;          // [capacity >= max nInvalid seen]
+        locate_type cursor_d_;         // [nRanks] per-rank insertion cursor
+        locate_type destRanks_d_;      // [nRanks] (compacted list)
+        bool_type leaving_d_;          // [capacity >= max nLocal seen] mask
+
+        // Single scalar on device to count destinations
+        Kokkos::View<size_type, position_memory_space> nDest_d_;
+
+        // Neigbour cache
+        locate_type neighbors_d_;          // [neighborSize] cached device neighbors list
+        std::vector<int> neighbors_host_;  // flat host copy
+        bool neighbors_dirty_      = true;
+        size_t neighbors_capacity_ = 0;
+        size_type neighbors_used_  = 0;
+
+        // Host mirror buffers
+        using host_mem_space   = Kokkos::HostSpace;
+        using locate_host_type = typename detail::ViewType<int, 1, host_mem_space>::view_type;
+
+        locate_host_type rankSendCount_h_;  // [nRanks] (mirror)
+        locate_host_type sendOffsets_h_;    // [nRanks+1] (mirror)
+        locate_host_type destRanks_h_;      // [nRanks] (mirror)
+
+        // Host-side destination list
+        std::vector<int> destinationRanks_host_;
+
+        // capacities
+        size_t sendIds_capacity_ = 0;
+        size_t leaving_capacity_ = 0;
+        int nRanks_              = 0;
+
+        void initScratch(int nRanks);
+        void ensureSendCapacity(size_t nInvalid);
+        void ensureLeavingCapacity(size_t nLocal);
+        void ensureNeighborsCached();
+
+        void countExchangeRMA();
+        void countExchangeP2P();
+        void countExchangeAlltoall();
     };
 }  // namespace ippl
 
diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp
index b47529aae..cbb4c760b 100644
--- a/src/Particle/ParticleSpatialLayout.hpp
+++ b/src/Particle/ParticleSpatialLayout.hpp
@@ -19,60 +19,95 @@
 //   frequency of load balancing (N), or may supply a function to
 //   determine if load balancing should be done or not.
 //
+#include <algorithm>
 #include <memory>
 #include <numeric>
 #include <vector>
 
 #include "Utility/IpplTimings.h"
+#include "Utility/ParallelDispatch.h"
 
 #include "Communicate/Window.h"
 
 namespace ippl {
 
-    /*!
-     * We need this struct since Kokkos parallel_scan only accepts
-     * one variable of type ReturnType where to perform the reduction operation.
-     * For more details, see
-     * https://kokkos.github.io/kokkos-core-wiki/API/core/parallel-dispatch/parallel_scan.html.
-     */
-    struct increment_type {
-        size_t count[2];
+    template <typename T, unsigned Dim, class Mesh, typename... Properties>
+    ParticleSpatialLayout<T, Dim, Mesh, Properties...>::ParticleSpatialLayout(FieldLayout<Dim>& fl,
+                                                                              Mesh& mesh, bool fem,
+                                                                              CountExchange mode)
+        : rlayout_m(std::make_shared<RegionLayout_t>(fl, mesh, fem))
+        , flayout_m(fl)
+        , countExchangeMode_(mode) {
+        const int nRanks = Comm->size();
+        nRanks_          = nRanks;
+        initScratch(nRanks);
 
-        KOKKOS_FUNCTION void init() {
-            count[0] = 0;
-            count[1] = 0;
+        if (mode == CountExchange::RMA && nRanks > 1) {
+            nRecvs_m.resize(Comm->size());
+            window_m.create(*Comm, nRecvs_m.begin(), nRecvs_m.end());
         }
+    }
 
-        KOKKOS_INLINE_FUNCTION increment_type& operator+=(bool* values) {
-            count[0] += values[0];
-            count[1] += values[1];
-            return *this;
-        }
+    template <typename T, unsigned Dim, class Mesh, typename... Properties>
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::updateLayout(FieldLayout<Dim>& fl,
+                                                                          Mesh& mesh) {
+        rlayout_m->changeDomain(fl, mesh);
+        neighbors_dirty_ = true;
+    }
+
+    template <typename T, unsigned Dim, class Mesh, typename... Properties>
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::countExchangeRMA() {
+        std::fill(nRecvs_m.begin(), nRecvs_m.end(), 0);
+        window_m.fence(0);
 
-        KOKKOS_INLINE_FUNCTION increment_type& operator+=(increment_type values) {
-            count[0] += values.count[0];
-            count[1] += values.count[1];
-            return *this;
+        for (int rank : destinationRanks_host_) {
+            if (rank == Comm->rank())
+                continue;
+            const int* src = &rankSendCount_h_(rank);
+            window_m.put<int>(src, rank, Comm->rank());
         }
-    };
+
+        window_m.fence(0);
+    }
 
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
-    ParticleSpatialLayout<T, Dim, Mesh, Properties...>::ParticleSpatialLayout(FieldLayout<Dim>& fl,
-                                                                              Mesh& mesh, bool fem)
-        : rlayout_m(fl, mesh, fem)
-        , flayout_m(fl)
-    {   
-        nRecvs_m.resize(Comm->size());
-        if (Comm->size() > 1) {
-            window_m.create(*Comm, nRecvs_m.begin(), nRecvs_m.end());
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::countExchangeP2P() {
+        const int myRank = Comm->rank();
+        const int tag    = Comm->next_tag(mpi::tag::P_SPATIAL_LAYOUT, mpi::tag::P_LAYOUT_CYCLE);
+
+        // Zero the receive-count buffer on the device
+        Kokkos::deep_copy(position_execution_space{}, recvCounts_d_, 0);
+        Kokkos::fence();
+
+        // Device pointer to send-count array
+        int* d_sendCounts = rankSendCount_d_.data();
+        int* d_recvCounts = recvCounts_d_.data();
+
+        std::vector<MPI_Request> reqs;
+        reqs.reserve(2 * std::max(0, nRanks_ - 1));
+
+        // Post receives from all ranks (except self)
+        for (int r = 0; r < nRanks_; ++r) {
+            if (r == myRank)
+                continue;
+            MPI_Irecv(d_recvCounts + r, 1, MPI_INT, r, tag, Comm->getCommunicator(),
+                      &reqs.emplace_back());
         }
+
+        for (int r = 0; r < nRanks_; ++r) {
+            if (r == myRank)
+                continue;
+            MPI_Isend(d_sendCounts + r, 1, MPI_INT, r, tag, Comm->getCommunicator(),
+                      &reqs.emplace_back());
+        }
+
+        MPI_Waitall(static_cast<int>(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE);
     }
 
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
-    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::updateLayout(FieldLayout<Dim>& fl,
-                                                                          Mesh& mesh) {
-        // flayout_m = fl;
-        rlayout_m.changeDomain(fl, mesh);
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::countExchangeAlltoall() {
+        MPI_Alltoall(rankSendCount_d_.data(), 1, MPI_INT, recvCounts_d_.data(), 1, MPI_INT,
+                     Comm->getCommunicator());
     }
 
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
@@ -81,18 +116,20 @@ namespace ippl {
         /* Apply Boundary Conditions */
         static IpplTimings::TimerRef ParticleBCTimer = IpplTimings::getTimer("particleBC");
         IpplTimings::startTimer(ParticleBCTimer);
-        this->applyBC(pc.R, rlayout_m.getDomain());
+        this->applyBC(pc.R, rlayout_m->getDomain());
         IpplTimings::stopTimer(ParticleBCTimer);
 
         /* Update Timer for the rest of the function */
         static IpplTimings::TimerRef ParticleUpdateTimer = IpplTimings::getTimer("updateParticle");
         IpplTimings::startTimer(ParticleUpdateTimer);
 
-        int nRanks = Comm->size();
-        if (nRanks < 2) {
+        if (nRanks_ < 2) {
+            IpplTimings::stopTimer(ParticleUpdateTimer);
             return;
         }
 
+        ensureNeighborsCached();
+
         /* particle MPI exchange:
          *   1. figure out which particles need to go where -> locateParticles(...)
          *   2. fill send buffer and send particles
@@ -105,69 +142,45 @@ namespace ippl {
         static IpplTimings::TimerRef locateTimer = IpplTimings::getTimer("locateParticles");
         IpplTimings::startTimer(locateTimer);
 
-        /* Rank-local number of particles */
-        size_type localnum = pc.getLocalNum();
-
-        /* The indices correspond to the indices of the local particles,
-         * the values correspond to the ranks to which the particles need to be sent
-         */
-        locate_type particleRanks("particles' MPI ranks", localnum);
+        const size_type nInvalid = locateParticlesPacked(pc);
 
-        /* The indices are the indices of the particles,
-         * the boolean values describe whether the particle has left the current rank
-         * 0 --> particle valid (inside current rank)
-         * 1 --> particle invalid (left rank)
-         */
-        bool_type invalidParticles("validity of particles", localnum);
+        // Copy metadata to host
+        size_type nDest = 0;
+        Kokkos::deep_copy(position_execution_space{}, nDest, nDest_d_);
 
-        /* The indices are the MPI ranks,
-         * the values are the number of particles are sent to that rank from myrank
-         */
-        locate_type rankSendCount_dview("rankSendCount Device", nRanks);
-
-        /* The indices have no particluar meaning,
-         * the values are the MPI ranks to which we need to send
-         */
-        locate_type destinationRanks_dview("destinationRanks Device", nRanks);
-
-        /* nInvalid is the number of invalid particles
-         * nDestinationRanks is the number of MPI ranks we need to send to
-         */
-        auto [nInvalid, nDestinationRanks] = locateParticles(
-            pc, particleRanks, invalidParticles, rankSendCount_dview, destinationRanks_dview);
+        // destRanks prefix
+        if (nDest > 0) {
+            Kokkos::deep_copy(
+                position_execution_space{},
+                Kokkos::subview(destRanks_h_, std::make_pair(size_t(0), size_t(nDest))),
+                Kokkos::subview(destRanks_d_, std::make_pair(size_t(0), size_t(nDest))));
+        }
 
-        /* Host space copy of rankSendCount_dview */
-        auto rankSendCount_hview =
-            Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rankSendCount_dview);
+        // counts + offsets
+        Kokkos::deep_copy(position_execution_space{}, rankSendCount_h_, rankSendCount_d_);
+        Kokkos::deep_copy(position_execution_space{}, sendOffsets_h_, sendOffsets_d_);
 
-        /* Host Space copy of destinationRanks_dview */
-        auto destinationRanks_hview =
-            Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), destinationRanks_dview);
+        // Build host destination list without allocation
+        destinationRanks_host_.clear();
+        for (size_t i = 0; i < (size_t)nDest; ++i)
+            destinationRanks_host_.push_back(destRanks_h_(i));
 
         IpplTimings::stopTimer(locateTimer);
 
         // 2. fill send buffer and send particles =============================================== //
 
-        // 2.1 Remote Memory Access window for one-sided communication
+        // 2.1 Count Exchange
 
         static IpplTimings::TimerRef preprocTimer = IpplTimings::getTimer("sendPreprocess");
         IpplTimings::startTimer(preprocTimer);
 
-        std::fill(nRecvs_m.begin(), nRecvs_m.end(), 0);
-
-        window_m.fence(0);
-
-        // Prepare RMA window for the ranks we need to send to
-        for (size_t ridx = 0; ridx < nDestinationRanks; ridx++) {
-            int rank = destinationRanks_hview[ridx];
-            if (rank == Comm->rank()) {
-                // we do not need to send to ourselves
-                continue;
-            }
-            const int* src_ptr = &rankSendCount_hview(rank);
-            window_m.put<int>(src_ptr, rank, Comm->rank());
+        if (countExchangeMode_ == CountExchange::RMA) {
+            countExchangeRMA();
+        } else if (countExchangeMode_ == CountExchange::P2P) {
+            countExchangeP2P();
+        } else {
+            countExchangeAlltoall();
         }
-        window_m.fence(0);
 
         IpplTimings::stopTimer(preprocTimer);
 
@@ -176,55 +189,136 @@ namespace ippl {
         static IpplTimings::TimerRef sendTimer = IpplTimings::getTimer("particleSend");
         IpplTimings::startTimer(sendTimer);
 
-        std::vector<MPI_Request> requests(0);
+        std::vector<MPI_Request> requests;
+        requests.reserve(destinationRanks_host_.size());
 
         int tag = Comm->next_tag(mpi::tag::P_SPATIAL_LAYOUT, mpi::tag::P_LAYOUT_CYCLE);
 
-        for (size_t ridx = 0; ridx < nDestinationRanks; ridx++) {
-            int rank = destinationRanks_hview[ridx];
-            if (rank == Comm->rank()) {
+        for (int rank : destinationRanks_host_) {
+            if (rank == Comm->rank())
                 continue;
-            }
-            hash_type hash("hash", rankSendCount_hview(rank));
-            fillHash(rank, particleRanks, hash);
-            pc.sendToRank(rank, tag, requests, hash);
+            const size_type count = static_cast<size_type>(rankSendCount_h_(rank));
+            if (count == 0)
+                continue;
+            const size_type begin = static_cast<size_type>(sendOffsets_h_(rank));
+            auto ids_sub =
+                Kokkos::subview(sendIds_d_, std::make_pair((size_t)begin, (size_t)(begin + count)));
+            requests.push_back(pc.sendToRank(rank, tag, ids_sub));
         }
 
         IpplTimings::stopTimer(sendTimer);
 
+        // 2.3 Post receives
+
+        static IpplTimings::TimerRef recvTimer = IpplTimings::getTimer("particleRecv");
+        IpplTimings::startTimer(recvTimer);
+
+        std::vector<std::pair<int, size_type>> recvList;
+        size_type totalRecvs = 0;
+
+        if (countExchangeMode_ == CountExchange::RMA) {
+            for (int rank = 0; rank < nRanks_; ++rank) {
+                if (nRecvs_m[rank] > 0) {
+                    recvList.push_back({rank, nRecvs_m[rank]});
+                    totalRecvs += nRecvs_m[rank];
+                }
+            }
+        } else {
+            auto recvCounts_h =
+                Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), recvCounts_d_);
+            for (int rank = 0; rank < nRanks_; ++rank) {
+                if (recvCounts_h(rank) > 0) {
+                    recvList.push_back({rank, recvCounts_h(rank)});
+                    totalRecvs += static_cast<size_type>(recvCounts_h(rank));
+                }
+            }
+        }
+
+        std::vector<MPI_Request> recvRequests(recvList.size(), MPI_REQUEST_NULL);
+        std::vector<std::function<void(size_type)>> finalizers(recvList.size());
+
+        for (size_t i = 0; i < recvList.size(); ++i) {
+            auto [rank, count] = recvList[i];
+            auto [req, fin]    = pc.postRecvFromRank(rank, tag, count);
+            recvRequests[i]    = req;
+            finalizers[i]      = std::move(fin);
+        }
+
+        IpplTimings::stopTimer(recvTimer);
+
         // 3. Internal destruction of invalid particles ======================================= //
 
         static IpplTimings::TimerRef destroyTimer = IpplTimings::getTimer("particleDestroy");
         IpplTimings::startTimer(destroyTimer);
 
-        pc.internalDestroy(invalidParticles, nInvalid);
+        // locateParticlesPacked already wrote the per-particle "is leaving"
+        // mask into leaving_d_. Reuse it instead of re-running the full
+        // region search inside the destroy predicate.
+        auto leaving = leaving_d_;
+        pc.template internalDestroy<position_memory_space, position_execution_space>(
+            KOKKOS_LAMBDA(size_t i) { return leaving(i); }, nInvalid);
         Kokkos::fence();
+        const size_type localAfterDestroy = pc.getLocalNum();
 
         IpplTimings::stopTimer(destroyTimer);
 
-        // 4. Receive Particles ================================================================ //
+        static IpplTimings::TimerRef waitTimer = IpplTimings::getTimer("particleWait");
+        IpplTimings::startTimer(waitTimer);
 
-        static IpplTimings::TimerRef recvTimer = IpplTimings::getTimer("particleRecv");
-        IpplTimings::startTimer(recvTimer);
+        requests.insert(requests.end(), recvRequests.begin(), recvRequests.end());
+        if (!requests.empty()) {
+            MPI_Waitall(static_cast<int>(requests.size()), requests.data(), MPI_STATUSES_IGNORE);
+        }
 
-        for (int rank = 0; rank < nRanks; ++rank) {
-            if (nRecvs_m[rank] > 0) {
-                pc.recvFromRank(rank, tag, nRecvs_m[rank]);
-            }
+        IpplTimings::stopTimer(waitTimer);
+
+        static IpplTimings::TimerRef freeBufferTimer = IpplTimings::getTimer("particleFreeBuffers");
+        IpplTimings::startTimer(freeBufferTimer);
+
+        Comm->freeAllBuffers();
+
+        IpplTimings::stopTimer(freeBufferTimer);
+
+        // 5. Deserialize
+        static IpplTimings::TimerRef deserializeTimer =
+            IpplTimings::getTimer("particleDeserialize");
+        IpplTimings::startTimer(deserializeTimer);
+
+        static IpplTimings::TimerRef deserializeResizeTimer =
+            IpplTimings::getTimer("particleDeserResize");
+        IpplTimings::startTimer(deserializeResizeTimer);
+
+        if (totalRecvs > 0) {
+            const size_type receiveCapacity = localAfterDestroy + totalRecvs;
+            pc.forAllAttributes([&]<typename Attribute>(Attribute*& attribute) {
+                attribute->reserve(receiveCapacity);
+            });
         }
-        IpplTimings::stopTimer(recvTimer);
 
-        IpplTimings::startTimer(sendTimer);
+        IpplTimings::stopTimer(deserializeResizeTimer);
+
+        static IpplTimings::TimerRef deserializeCopyTimer =
+            IpplTimings::getTimer("particleDeserCopy");
+        IpplTimings::startTimer(deserializeCopyTimer);
 
-        if (requests.size() > 0) {
-            MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);
+        for (auto& finalize : finalizers) {
+            finalize(pc.getLocalNum());
         }
-        IpplTimings::stopTimer(sendTimer);
-        Comm->freeAllBuffers();
+
+        IpplTimings::stopTimer(deserializeCopyTimer);
+        IpplTimings::stopTimer(deserializeTimer);
 
         IpplTimings::stopTimer(ParticleUpdateTimer);
     }
 
+    template <typename T, unsigned Dim, class Mesh, typename... Properties>
+    template <size_t... Idx>
+    KOKKOS_INLINE_FUNCTION constexpr bool
+    ParticleSpatialLayout<T, Dim, Mesh, Properties...>::positionInRegionInclusive(
+        const std::index_sequence<Idx...>&, const vector_type& pos, const region_type& region) {
+        return ((pos[Idx] >= region[Idx].min()) && ...) && ((pos[Idx] <= region[Idx].max()) && ...);
+    };
+
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
     template <size_t... Idx>
     KOKKOS_INLINE_FUNCTION constexpr bool
@@ -248,220 +342,204 @@ namespace ippl {
         return totalSize;
     }
 
-    /**
-     * @brief This function determines to which rank particles need to be sent after the iteration
-     * step. It starts by first scanning direct rank neighbors, and only does a global scan if there
-     * are still unfound particles. It then calculates how many particles need to be sent to each
-     * rank and how many ranks are sent to in total.
-     *
-     * @param pc           Particle Container
-     * @param ranks        A vector the length of the number of particles on the current rank, where
-     * each value refers to the new rank of the particle
-     * @param invalid      A vector marking the particles that need to be sent away, and thus
-     * locally deleted
-     * @param nSends_dview Device view the length of number of ranks, where each value determines
-     * the number of particles sent to that rank from the current rank
-     * @param sends_dview  Device view for the number of ranks that are sent to from current rank
-     *
-     * @return tuple with the number of particles sent away and the number of ranks sent to
-     */
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
     template <typename ParticleContainer>
-    std::pair<detail::size_type, detail::size_type>
-    ParticleSpatialLayout<T, Dim, Mesh, Properties...>::locateParticles(
-        const ParticleContainer& pc, locate_type& ranks, bool_type& invalid,
-        locate_type& nSends_dview, locate_type& sends_dview) const {
-        auto positions           = pc.R.getView();
-        region_view_type Regions = rlayout_m.getdLocalRegions();
+    size_t ParticleSpatialLayout<T, Dim, Mesh, Properties...>::locateParticlesPacked(
+        const ParticleContainer& pc) {
+        const int nRanks       = Comm->size();
+        const size_type myRank = Comm->rank();
 
-        using policy_type  = Kokkos::RangePolicy<position_execution_space>;
-        using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<2>, position_execution_space>;
+        ensureNeighborsCached();
 
-        size_type myRank = Comm->rank();
-
-        const auto is = std::make_index_sequence<Dim>{};
-
-        const neighbor_list& neighbors = flayout_m.getNeighbors();
-
-        /// outsideIds: Container of particle IDs that travelled outside of the neighborhood.
-        locate_type outsideIds("Particles outside of neighborhood", size_type(pc.getLocalNum()));
-
-        /// outsideCount: Tracks the number of particles that travelled outside of the neighborhood.
-        size_type outsideCount = 0;
-        /// invalidCount: Tracks the number of particles that need to be sent to other ranks.
-        size_type invalidCount = 0;
+        auto positions           = pc.R.getView();
+        region_view_type Regions = rlayout_m->getdLocalRegions();
+        const auto is            = std::make_index_sequence<Dim>{};
 
-        /// neighborSize: Size of a neighborhood in D dimentions.
-        const size_type neighborSize = getNeighborSize(neighbors);
+        using exec_space  = position_execution_space;
+        using policy_type = Kokkos::RangePolicy<size_t, exec_space>;
 
-        /// neighbors_view: Kokkos view with the IDs of the neighboring MPI ranks.
-        locate_type neighbors_view("Nearest neighbors IDs", neighborSize);
+        // Reset small device buffers
+        Kokkos::deep_copy(rankSendCount_d_, size_type(0));
+        Kokkos::deep_copy(cursor_d_, size_type(0));
+        Kokkos::deep_copy(nDest_d_, size_type(0));
 
-        /* red_val: Used to reduce both the number of invalid particles and the number of particles
-         * outside of the neighborhood (Kokkos::parallel_scan doesn't allow multiple reduction
-         * values, so we use the helper class increment_type). First element updates InvalidCount,
-         * second one updates outsideCount.
-         */
-        increment_type red_val;
-        red_val.init();
+        const size_type neighbors_used = neighbors_used_;
+        auto& neighbours_d             = neighbors_d_;
 
-        auto neighbors_mirror =
-            Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), neighbors_view);
+        // Destination rank computation (no per-particle storage)
+        auto destRankOf = KOKKOS_LAMBDA(const size_t i)->size_type {
+            if (positionInRegion(is, positions(i), Regions(myRank)))
+                return myRank;
 
-        size_t k = 0;
+            for (int j = 0; j < static_cast<int>(neighbors_used); ++j) {
+                const int r = neighbours_d(j);
+                if (positionInRegion(is, positions(i), Regions(r)))
+                    return r;
+            }
 
-        for (const auto& componentNeighbors : neighbors) {
-            for (size_t j = 0; j < componentNeighbors.size(); ++j) {
-                neighbors_mirror(k) = componentNeighbors[j];
-                // std::cout << "Neighbor: " << neighbors_mirror(k) << std::endl;
-                k++;
+            // slow-path: global scan
+            for (int r = 0; r < static_cast<int>(Regions.extent(0)); ++r) {
+                if (positionInRegion(is, positions(i), Regions(r)))
+                    return r;
             }
-        }
 
-        Kokkos::deep_copy(neighbors_view, neighbors_mirror);
+            // Inclusive fallback: catches particles sitting exactly on a region
+            // lower boundary that the strict > check above missed (e.g. (0,0,0))
+            for (int r = 0; r < static_cast<int>(Regions.extent(0)); ++r) {
+                if (positionInRegionInclusive(is, positions(i), Regions(r)))
+                    return r;
+            }
+            return myRank;  // truly outside all regions - applyBC should have prevented this
+        };
 
-        /*! Begin Kokkos loop:
-         * Step 1: search in current rank
-         * Step 2: search in neighbors
-         * Step 3: save information on whether the particle was located
-         * Step 4: run additional loop on non-located particles
-         */
-        static IpplTimings::TimerRef neighborSearch = IpplTimings::getTimer("neighborSearch");
-        IpplTimings::startTimer(neighborSearch);
+        // Make sure the leaving-mask buffer is large enough; it's reused across
+        // updates to avoid reallocation.
+        ensureLeavingCapacity(pc.getLocalNum());
+        auto& leaving_d = leaving_d_;
 
-        Kokkos::parallel_scan(
-            "ParticleSpatialLayout::locateParticles()",
-            policy_type(0, ranks.extent(0)),
-            KOKKOS_LAMBDA(const size_type i, increment_type& val, const bool final) {
-                /* Step 1
-                 * inCurr: True if the particle hasn't left the current MPI rank.
-                 * inNeighbor: True if the particle is found in a neighboring rank.
-                 * found: True either if inCurr = True or inNeighbor = True.
-                 * increment: Helper variable to update red_val.
-                 */
-                bool inCurr     = false;
-                bool inNeighbor = false;
-                bool found      = false;
-                bool increment[2];
-
-                inCurr = positionInRegion(is, positions(i), Regions(myRank));
-
-                ranks(i)   = inCurr * myRank;
-                invalid(i) = !inCurr;
-                found      = inCurr || found;
-
-                /// Step 2
-                for (size_t j = 0; j < neighbors_view.extent(0); ++j) {
-                    size_type rank = neighbors_view(j);
-
-                    inNeighbor = positionInRegion(is, positions(i), Regions(rank));
-
-                    ranks(i) = !(inNeighbor)*ranks(i) + inNeighbor * rank;
-                    found    = inNeighbor || found;
-                }
-                /// Step 3
-                /* isOut: When the last thread has finished the search, checks whether the particle
-                 * has been found either in the current rank or in a neighboring one. Used to avoid
-                 * race conditions when updating outsideIds.
-                 */
-                if (final && !found) {
-                    outsideIds(val.count[1]) = i;
-                }
-                // outsideIds(val.count[1]) = i * isOut;
-                increment[0] = invalid(i);
-                increment[1] = !found;
-                val += increment;
+        // Pass 1: compute send counts + nInvalid + per-particle "is leaving"
+        size_type nInvalid    = 0;
+        auto& rankSendCount_d = rankSendCount_d_;
+        Kokkos::parallel_reduce(
+            "PSL::packed_count", policy_type(0, pc.getLocalNum()),
+            KOKKOS_LAMBDA(const size_t i, size_type& inval) {
+                const size_type dest = destRankOf(i);
+                const bool leaves    = (dest != myRank);
+                leaving_d(i)         = leaves;
+                inval += leaves;
+                if (leaves)
+                    Kokkos::atomic_fetch_add(&rankSendCount_d(dest), size_type(1));
             },
-            red_val);
-
+            nInvalid);
         Kokkos::fence();
 
-        invalidCount = red_val.count[0];
-        outsideCount = red_val.count[1];
-
-        IpplTimings::stopTimer(neighborSearch);
-
-        /// Step 4
-        static IpplTimings::TimerRef nonNeighboringParticles =
-            IpplTimings::getTimer("nonNeighboringParticles");
-        IpplTimings::startTimer(nonNeighboringParticles);
-        if (outsideCount > 0) {
-            Kokkos::parallel_for(
-                "ParticleSpatialLayout::leftParticles()",
-                mdrange_type({0, 0}, {outsideCount, Regions.extent(0)}),
-                KOKKOS_LAMBDA(const size_t i, const size_type j) {
-                    /// pID: (local) ID of the particle that is currently being searched.
-                    size_type pId = outsideIds(i);
-
-                    /// inRegion: Checks whether particle pID is inside region j.
-                    bool inRegion = positionInRegion(is, positions(pId), Regions(j));
-                    if (inRegion) {
-                        ranks(pId) = j;
-                    }
-                });
-            Kokkos::fence();
-        }
-        IpplTimings::stopTimer(nonNeighboringParticles);
+        // Ensure sendIds capacity after we know nInvalid
+        ensureSendCapacity(nInvalid);
 
-        Kokkos::parallel_for(
-            "Calculate nSends", policy_type(0, ranks.extent(0)),
-            KOKKOS_LAMBDA(const size_t i) {
-                size_type rank = ranks(i);
-                Kokkos::atomic_fetch_add(&nSends_dview(rank), 1);
+        // Exclusive scan to offsets (length nRanks+1)
+        auto& sendOffsets_d = sendOffsets_d_;
+        Kokkos::parallel_scan(
+            "PSL::packed_offsets", policy_type(0, (size_t)nRanks + 1),
+            KOKKOS_LAMBDA(const size_t r, size_type& upd, const bool final) {
+                if (final)
+                    sendOffsets_d(r) = upd;
+                if (r < (size_t)nRanks)
+                    upd += rankSendCount_d(r);
             });
+        Kokkos::fence();
+
+        // Pass 2: fill packed send IDs into sendIds_d_ (prefix [0, nInvalid))
+        auto& cursor_d  = cursor_d_;
+        auto& sendIds_d = sendIds_d_;
+        Kokkos::parallel_for(
+            "PSL::packed_fill", policy_type(0, pc.getLocalNum()), KOKKOS_LAMBDA(const size_t i) {
+                const size_type dest = destRankOf(i);
+                if (dest == myRank)
+                    return;
+
+                const size_type pos  = Kokkos::atomic_fetch_add(&cursor_d(dest), size_type(1));
+                const size_type base = sendOffsets_d(dest);
 
-        // Number of Ranks we need to send to
-        Kokkos::View<size_type, position_memory_space> rankSends(
-            "Number of Ranks we need to send to");
+                sendIds_d(base + pos) = static_cast<typename hash_type::non_const_value_type>(i);
+            });
+        Kokkos::fence();
 
+        // Build destination rank list on device (compact), store length in nDest_d_
+        auto& destRanks_d = destRanks_d_;
+        auto& nDest_d     = nDest_d_;
         Kokkos::parallel_for(
-            "Calculate sends", policy_type(0, nSends_dview.extent(0)),
-            KOKKOS_LAMBDA(const size_t rank) {
-                if (nSends_dview(rank) != 0) {
-                    size_type index    = Kokkos::atomic_fetch_add(&rankSends(), 1);
-                    sends_dview(index) = rank;
+            "PSL::packed_destRanks", policy_type(0, (size_t)nRanks), KOKKOS_LAMBDA(const size_t r) {
+                if ((size_type)r == myRank)
+                    return;
+                if (rankSendCount_d(r) > 0) {
+                    const size_type idx = Kokkos::atomic_fetch_add(&nDest_d(), size_type(1));
+                    destRanks_d(idx)    = static_cast<int>(r);
                 }
             });
-        size_type temp;
-        Kokkos::deep_copy(temp, rankSends);
+        Kokkos::fence();
 
-        return {invalidCount, temp};
+        return nInvalid;
     }
 
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
-    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::fillHash(int rank,
-                                                                      const locate_type& ranks,
-                                                                      hash_type& hash) {
-        /* Compute the prefix sum and fill the hash
-         */
-        using policy_type = Kokkos::RangePolicy<position_execution_space>;
-        Kokkos::parallel_scan(
-            "ParticleSpatialLayout::fillHash()", policy_type(0, ranks.extent(0)),
-            KOKKOS_LAMBDA(const size_t i, int& idx, const bool final) {
-                if (final) {
-                    if (rank == ranks(i)) {
-                        hash(idx) = i;
-                    }
-                }
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::initScratch(int nRanks) {
+        Kokkos::realloc(rankSendCount_d_, nRanks);
+        Kokkos::realloc(sendOffsets_d_, nRanks + 1);
+        Kokkos::realloc(cursor_d_, nRanks);
+        Kokkos::realloc(destRanks_d_, nRanks);
+        Kokkos::realloc(recvCounts_d_, nRanks);
+
+        // scalar counter
+        nDest_d_ = Kokkos::View<size_type, position_memory_space>("nDest_d");
+
+        // Host mirrors
+        Kokkos::realloc(rankSendCount_h_, nRanks);
+        Kokkos::realloc(sendOffsets_h_, nRanks + 1);
+        Kokkos::realloc(destRanks_h_, nRanks);
+
+        destinationRanks_host_.clear();
+        destinationRanks_host_.reserve(nRanks);
+    }
 
-                if (rank == ranks(i)) {
-                    idx += 1;
-                }
-            });
-        Kokkos::fence();
+    template <typename T, unsigned Dim, class Mesh, typename... Properties>
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::ensureSendCapacity(size_t nInvalid) {
+        if (nInvalid <= sendIds_capacity_)
+            return;
+
+        // grow geometrically
+        size_t newCap = sendIds_capacity_ ? sendIds_capacity_ : size_t(1024);
+        while (newCap < nInvalid)
+            newCap *= 2;
+
+        sendIds_capacity_ = newCap;
+        Kokkos::realloc(sendIds_d_, newCap);
     }
 
     template <typename T, unsigned Dim, class Mesh, typename... Properties>
-    size_t ParticleSpatialLayout<T, Dim, Mesh, Properties...>::numberOfSends(
-        int rank, const locate_type& ranks) {
-        size_t nSends     = 0;
-        using policy_type = Kokkos::RangePolicy<position_execution_space>;
-        Kokkos::parallel_reduce(
-            "ParticleSpatialLayout::numberOfSends()", policy_type(0, ranks.extent(0)),
-            KOKKOS_LAMBDA(const size_t i, size_t& num) { num += size_t(rank == ranks(i)); },
-            nSends);
-        Kokkos::fence();
-        return nSends;
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::ensureLeavingCapacity(size_t nLocal) {
+        if (nLocal <= leaving_capacity_)
+            return;
+
+        size_t newCap = leaving_capacity_ ? leaving_capacity_ : size_t(1024);
+        while (newCap < nLocal)
+            newCap *= 2;
+
+        leaving_capacity_ = newCap;
+        Kokkos::realloc(leaving_d_, newCap);
+    }
+
+    template <typename T, unsigned Dim, class Mesh, typename... Properties>
+    void ParticleSpatialLayout<T, Dim, Mesh, Properties...>::ensureNeighborsCached() {
+        if (!neighbors_dirty_)
+            return;
+
+        const neighbor_list& neighbors = flayout_m.getNeighbors();
+        const size_type neighborSize   = getNeighborSize(neighbors);
+        neighbors_used_                = neighborSize;
+
+        if (neighborSize > neighbors_capacity_) {
+            neighbors_capacity_ = neighborSize;
+            Kokkos::realloc(neighbors_d_, neighbors_capacity_);
+        }
+
+        neighbors_host_.clear();
+        neighbors_host_.reserve(neighborSize);
+
+        auto neighbors_h = Kokkos::create_mirror_view(neighbors_d_);
+        size_t k         = 0;
+        for (const auto& comp : neighbors) {
+            for (size_t j = 0; j < comp.size(); ++j) {
+                neighbors_h(k) = comp[j];
+                neighbors_host_.push_back(comp[j]);
+                ++k;
+            }
+        }
+
+        Kokkos::deep_copy(
+            Kokkos::subview(neighbors_d_, std::make_pair(size_t(0), size_t(neighborSize))),
+            Kokkos::subview(neighbors_h, std::make_pair(size_t(0), size_t(neighborSize))));
+
+        neighbors_dirty_ = false;
     }
 
 }  // namespace ippl
diff --git a/src/Particle/SortBuffer.h b/src/Particle/SortBuffer.h
new file mode 100644
index 000000000..977db9f59
--- /dev/null
+++ b/src/Particle/SortBuffer.h
@@ -0,0 +1,185 @@
+#ifndef IPPL_SORT_BUFFER_H
+#define IPPL_SORT_BUFFER_H
+
+#include <Kokkos_Core.hpp>
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+
+namespace ippl {
+    namespace detail {
+
+        /**
+         * @brief Persistent scratch buffers for bin-sort-based particle binning.
+         *
+         * Grows on demand, never shrinks. Call finalizeBinSortBuffers() before
+         * Kokkos::finalize() to release device memory safely.
+         *
+         * Buffer layout
+         *   bin_keys      uint64_t[n_particles]   sort keys (bin index per particle)
+         *   permute       size_t  [n_particles]   identity → sorted particle order
+         *   bin_offsets   size_t  [n_bins + 1]    start of each bin in permute
+         *   keys_out      uint64_t[n_particles]   CUB radix-sort output (CUDA only)
+         *   perm_out      size_t  [n_particles]   CUB radix-sort output (CUDA only)
+         *   temp_storage  char    [temp_bytes]    CUB device-temp (CUDA only)
+         */
+        template <typename MemorySpace>
+        class BinSortBuffers {
+        public:
+            using memory_space = MemorySpace;
+            using size_type    = size_t;
+
+            BinSortBuffers()                                 = default;
+            BinSortBuffers(const BinSortBuffers&)            = delete;
+            BinSortBuffers& operator=(const BinSortBuffers&) = delete;
+            BinSortBuffers(BinSortBuffers&&)                 = default;
+            BinSortBuffers& operator=(BinSortBuffers&&)      = default;
+
+            /**
+             * @brief Ensure particle-sized buffers are large enough.
+             *
+             * @param n_particles  Number of particles.
+             * @param n_bins_p1    Number of bins + 1 (size needed for bin_offsets).
+             * @param growth       Over-allocation factor (default 1.2).
+             */
+            void ensureCapacity(size_type n_particles, size_type n_bins_p1, double growth = 1.2) {
+                if (n_particles > particle_capacity_) {
+                    const size_type cap =
+                        std::max(static_cast<size_type>(n_particles * growth), n_particles);
+
+                    bin_keys_ = Kokkos::View<uint64_t*, memory_space>("bin_keys", cap);
+                    permute_  = Kokkos::View<size_type*, memory_space>("permute", cap);
+                    keys_out_ = Kokkos::View<uint64_t*, memory_space>("keys_out", cap);
+                    perm_out_ = Kokkos::View<size_type*, memory_space>("perm_out", cap);
+
+                    particle_capacity_ = cap;
+                }
+
+                if (n_bins_p1 > bin_offset_capacity_) {
+                    const size_type cap =
+                        std::max(static_cast<size_type>(n_bins_p1 * growth), n_bins_p1);
+
+                    bin_offsets_ = Kokkos::View<size_type*, memory_space>("bin_offsets", cap);
+                    bin_offset_capacity_ = cap;
+                }
+            }
+
+            /**
+             * @brief Ensure CUB/rocPRIM temp-storage buffer is large enough.
+             * @param bytes  Required bytes.
+             */
+            void ensureTempStorage(size_t bytes) {
+                if (bytes <= temp_capacity_)
+                    return;
+                const size_t cap = static_cast<size_t>(bytes * 1.1);
+                temp_storage_    = Kokkos::View<char*, memory_space>("temp_storage", cap);
+                temp_capacity_   = cap;
+            }
+
+            // --- accessors -------------------------------------------------------
+            Kokkos::View<uint64_t*, memory_space>& binKeys() { return bin_keys_; }
+            Kokkos::View<size_type*, memory_space>& permute() { return permute_; }
+            Kokkos::View<size_type*, memory_space>& binOffsets() { return bin_offsets_; }
+            Kokkos::View<uint64_t*, memory_space>& keysOut() { return keys_out_; }
+            Kokkos::View<size_type*, memory_space>& permOut() { return perm_out_; }
+            Kokkos::View<char*, memory_space>& tempStorage() { return temp_storage_; }
+
+            size_type particleCapacity() const { return particle_capacity_; }
+            size_type binOffsetCapacity() const { return bin_offset_capacity_; }
+            size_t tempCapacity() const { return temp_capacity_; }
+
+            size_t memoryUsage() const {
+                // Sum the bytes actually held by each view rather than
+                // assuming all of them are populated — keys_out_ / perm_out_
+                // are only allocated on the CUDA fast path.
+                return bin_keys_.span() * sizeof(uint64_t)
+                       + permute_.span() * sizeof(size_type)
+                       + bin_offsets_.span() * sizeof(size_type)
+                       + keys_out_.span() * sizeof(uint64_t)
+                       + perm_out_.span() * sizeof(size_type)
+                       + temp_storage_.span();
+            }
+
+            void clear() {
+                bin_keys_            = {};
+                permute_             = {};
+                bin_offsets_         = {};
+                keys_out_            = {};
+                perm_out_            = {};
+                temp_storage_        = {};
+                particle_capacity_   = 0;
+                bin_offset_capacity_ = 0;
+                temp_capacity_       = 0;
+            }
+
+        private:
+            Kokkos::View<uint64_t*, memory_space> bin_keys_;
+            Kokkos::View<size_type*, memory_space> permute_;
+            Kokkos::View<size_type*, memory_space> bin_offsets_;
+            Kokkos::View<uint64_t*, memory_space> keys_out_;
+            Kokkos::View<size_type*, memory_space> perm_out_;
+            Kokkos::View<char*, memory_space> temp_storage_;
+
+            size_type particle_capacity_   = 0;
+            size_type bin_offset_capacity_ = 0;
+            size_t temp_capacity_          = 0;
+        };
+
+        // ---------------------------------------------------------------------------
+        // Singleton access
+        // ---------------------------------------------------------------------------
+
+        template <typename MemorySpace>
+        class BinSortBuffersHolder {
+        public:
+            static BinSortBuffersHolder& instance() {
+                static BinSortBuffersHolder h;
+                return h;
+            }
+
+            BinSortBuffers<MemorySpace>& get() {
+                if (!buffers_)
+                    buffers_ = std::make_unique<BinSortBuffers<MemorySpace>>();
+                return *buffers_;
+            }
+
+            void finalize() {
+                if (buffers_) {
+                    buffers_->clear();
+                    buffers_.reset();
+                }
+            }
+
+        private:
+            BinSortBuffersHolder()                                       = default;
+            ~BinSortBuffersHolder()                                      = default;
+            BinSortBuffersHolder(const BinSortBuffersHolder&)            = delete;
+            BinSortBuffersHolder& operator=(const BinSortBuffersHolder&) = delete;
+
+            std::unique_ptr<BinSortBuffers<MemorySpace>> buffers_;
+        };
+
+        template <typename MemorySpace>
+        BinSortBuffers<MemorySpace>& getDefaultBinSortBuffers() {
+            return BinSortBuffersHolder<MemorySpace>::instance().get();
+        }
+
+        /**
+         * @brief Release all bin-sort buffers for known memory spaces.
+         *        Call this before Kokkos::finalize().
+         */
+        inline void finalizeBinSortBuffers() {
+#ifdef KOKKOS_ENABLE_CUDA
+            BinSortBuffersHolder<Kokkos::CudaSpace>::instance().finalize();
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+            BinSortBuffersHolder<Kokkos::HIPSpace>::instance().finalize();
+#endif
+            BinSortBuffersHolder<Kokkos::HostSpace>::instance().finalize();
+        }
+
+    }  // namespace detail
+}  // namespace ippl
+
+#endif  // IPPL_SORT_BUFFER_H
\ No newline at end of file
diff --git a/src/Utility/BufferView.h b/src/Utility/BufferView.h
new file mode 100644
index 000000000..732bf8375
--- /dev/null
+++ b/src/Utility/BufferView.h
@@ -0,0 +1,229 @@
+#ifndef IPPL_BUFFERVIEW_H
+#define IPPL_BUFFERVIEW_H
+
+#include <Kokkos_Core.hpp>
+#include <cstddef>
+#include <stdexcept>
+#include <string>
+
+namespace ippl {
+
+    /**
+     * @brief Helper to compute aligned offset
+     */
+    [[nodiscard]] constexpr size_t alignUp(size_t offset, size_t alignment) {
+        return (offset + alignment - 1) & ~(alignment - 1);
+    }
+
+    /**
+     *
+     * @tparam T Element type
+     * @tparam MemorySpace Kokkos memory space
+     */
+    template <typename T, typename MemorySpace = Kokkos::DefaultExecutionSpace::memory_space>
+    class BufferView {
+    public:
+        using view_type = Kokkos::View<T*, MemorySpace>;
+
+        explicit BufferView(size_t count)
+            : view_m("BufferView", count) {}
+
+        ~BufferView() = default;
+
+        // Non-copyable
+        BufferView(const BufferView&)            = delete;
+        BufferView& operator=(const BufferView&) = delete;
+
+        // Movable
+        BufferView(BufferView&&) = default;
+        BufferView& operator=(BufferView&&) = default;
+
+        view_type& getView() { return view_m; }
+        const view_type& getView() const { return view_m; }
+
+        T* data() { return view_m.data(); }
+        const T* data() const { return view_m.data(); }
+        size_t size() const { return view_m.extent(0); }
+
+    private:
+        view_type view_m;
+    };
+
+    /**
+     * @brief Allocates a single buffer and provides multiple aligned views into it.
+     *
+     * This class manages its own memory via a Kokkos View, completely independent
+     * from MPI buffer management. Use this for temporary computation buffers
+     * that don't need MPI communication.
+     *
+     * @tparam MemorySpace The Kokkos memory space for the buffer
+     */
+    template <typename MemorySpace = Kokkos::DefaultExecutionSpace::memory_space>
+    class MultiViewBuffer {
+    public:
+        using buffer_view_type = Kokkos::View<char*, MemorySpace>;
+
+        static constexpr size_t DEFAULT_ALIGNMENT = 256;
+
+        /**
+         * @brief Construct without allocation. Call allocate() before getting views.
+         */
+        MultiViewBuffer()
+            : currentOffset_m(0) {}
+
+        /**
+         * @brief Construct and allocate buffer of specified size.
+         * @param totalBytes Total size in bytes
+         * @param label Optional Kokkos label for debugging
+         */
+        explicit MultiViewBuffer(size_t totalBytes, const std::string& label = "MultiViewBuffer")
+            : buffer_m(label, totalBytes)
+            , currentOffset_m(0) {}
+
+        ~MultiViewBuffer() = default;
+
+        // Non-copyable
+        MultiViewBuffer(const MultiViewBuffer&)            = delete;
+        MultiViewBuffer& operator=(const MultiViewBuffer&) = delete;
+
+        // Movable
+        MultiViewBuffer(MultiViewBuffer&& other) noexcept
+            : buffer_m(std::move(other.buffer_m))
+            , currentOffset_m(other.currentOffset_m) {
+            other.currentOffset_m = 0;
+        }
+
+        MultiViewBuffer& operator=(MultiViewBuffer&& other) noexcept {
+            if (this != &other) {
+                buffer_m              = std::move(other.buffer_m);
+                currentOffset_m       = other.currentOffset_m;
+                other.currentOffset_m = 0;
+            }
+            return *this;
+        }
+
+        /**
+         * @brief Allocate or reallocate the underlying buffer.
+         * @param totalBytes Total size in bytes
+         * @param label Optional Kokkos label for debugging
+         */
+        void allocate(size_t totalBytes, const std::string& label = "MultiViewBuffer") {
+            buffer_m        = buffer_view_type(label, totalBytes);
+            currentOffset_m = 0;
+        }
+
+        /**
+         * @brief Get an aligned view of count elements of type T from the buffer.
+         *
+         * Views are allocated sequentially from the buffer. Each view starts at
+         * an address aligned to DEFAULT_ALIGNMENT (256 bytes).
+         *
+         * @tparam T Element type for the view
+         * @param count Number of elements
+         * @return Unmanaged Kokkos::View pointing into the buffer
+         */
+        template <typename T>
+        Kokkos::View<T*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+        getView(size_t count) {
+            using view_type = Kokkos::View<T*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
+
+            size_t alignedOffset = alignUp(currentOffset_m, DEFAULT_ALIGNMENT);
+            size_t requiredSize  = alignedOffset + count * sizeof(T);
+
+            if (requiredSize > buffer_m.size()) {
+                throw std::runtime_error(
+                    "MultiViewBuffer: insufficient space. Required: "
+                    + std::to_string(requiredSize)
+                    + ", Available: " + std::to_string(buffer_m.size()));
+            }
+
+            T* viewPtr = reinterpret_cast<T*>(buffer_m.data() + alignedOffset);
+            currentOffset_m = alignedOffset + count * sizeof(T);
+
+            return view_type(viewPtr, count);
+        }
+
+        /**
+         * @brief Reset the allocation offset to reuse the buffer for new views.
+         *
+         * Previously obtained views become invalid after this call.
+         */
+        void reset() { currentOffset_m = 0; }
+
+        /**
+         * @brief Check if buffer has been allocated.
+         */
+        bool isAllocated() const { return buffer_m.size() > 0; }
+
+        /**
+         * @brief Get the current used size in bytes.
+         */
+        size_t getUsedSize() const { return currentOffset_m; }
+
+        /**
+         * @brief Get the total available size in bytes.
+         */
+        size_t getTotalSize() const { return buffer_m.size(); }
+
+        /**
+         * @brief Get remaining available bytes.
+         */
+        size_t getRemainingSize() const { return getTotalSize() - currentOffset_m; }
+
+        /**
+         * @brief Get raw pointer to buffer start.
+         */
+        char* data() { return buffer_m.data(); }
+        const char* data() const { return buffer_m.data(); }
+
+    private:
+        buffer_view_type buffer_m;
+        size_t currentOffset_m = 0;
+    };
+
+    /**
+     * @brief Helper struct for recursive buffer size computation
+     */
+    template <typename... Ts>
+    struct BufferSizeComputer;
+
+    // Base case: single type
+    template <typename T>
+    struct BufferSizeComputer<T> {
+        static constexpr size_t compute(size_t offset, size_t count) {
+            return alignUp(offset, MultiViewBuffer<>::DEFAULT_ALIGNMENT) + count * sizeof(T);
+        }
+    };
+
+    // Recursive case: multiple types
+    template <typename T, typename... Rest>
+    struct BufferSizeComputer<T, Rest...> {
+        template <typename... Counts>
+        static constexpr size_t compute(size_t offset, size_t count, Counts... counts) {
+            static_assert(sizeof...(Rest) == sizeof...(Counts),
+                          "Number of remaining types must match remaining counts");
+            size_t alignedOffset = alignUp(offset, MultiViewBuffer<>::DEFAULT_ALIGNMENT);
+            size_t newOffset     = alignedOffset + count * sizeof(T);
+            return BufferSizeComputer<Rest...>::compute(newOffset, counts...);
+        }
+    };
+
+    /**
+     * @brief Compute total buffer size needed for multiple views with alignment.
+     *
+     * Usage: auto size = computeBufferSize<double, int, float>(100, 200, 50);
+     *
+     * @tparam Ts Element types for each view
+     * @param counts Number of elements for each view
+     * @return Total required buffer size in bytes
+     */
+    template <typename... Ts, typename... Counts>
+    constexpr size_t computeBufferSize(Counts... counts) {
+        static_assert(sizeof...(Ts) == sizeof...(Counts),
+                      "Number of types must match number of counts");
+        return BufferSizeComputer<Ts...>::compute(0, static_cast<size_t>(counts)...);
+    }
+
+}  // namespace ippl
+
+#endif  // IPPL_BUFFERVIEW_H
\ No newline at end of file
diff --git a/src/Utility/IpplTimings.cpp b/src/Utility/IpplTimings.cpp
index e0f65f36f..93fd9447b 100644
--- a/src/Utility/IpplTimings.cpp
+++ b/src/Utility/IpplTimings.cpp
@@ -28,7 +28,10 @@
 
 #include <algorithm>
 #include <fstream>
+#include <iomanip>
 #include <iostream>
+#include <limits>
+#include <sstream>
 
 #include "Utility/Inform.h"
 #include "Utility/IpplInfo.h"
@@ -53,8 +56,10 @@ const int num_colors    = sizeof(colors) / sizeof(uint32_t);
     }
 #endif
 
+// Static member initialization
 Timing* IpplTimings::instance = new Timing();
 std::stack<Timing*> IpplTimings::stashedInstance;
+const std::vector<double> Timing::emptyMeasurements;
 
 Timing::Timing()
     : TimerList()
@@ -113,6 +118,111 @@ void Timing::clearTimer(TimerRef t) {
     TimerList[t]->clear();
 }
 
+// Reset all timers - useful for warmup
+void Timing::resetAllTimers() {
+    for (unsigned int i = 0; i < TimerList.size(); ++i) {
+        TimerList[i]->clearAll();
+    }
+}
+// Get measurements for a specific timer by reference
+const std::vector<double>& Timing::getMeasurements(TimerRef t) const {
+    if (t >= TimerList.size())
+        return emptyMeasurements;
+    return TimerList[t]->measurements;
+}
+
+// Get measurements for a specific timer by name
+const std::vector<double>& Timing::getMeasurements(const std::string& name) const {
+    TimerMap_t::const_iterator loc = TimerMap.find(name);
+    if (loc == TimerMap.end())
+        return emptyMeasurements;
+    return loc->second->measurements;
+}
+
+// Get measurement count for a timer
+size_t Timing::getMeasurementCount(TimerRef t) const {
+    if (t >= TimerList.size())
+        return 0;
+    return TimerList[t]->measurements.size();
+}
+
+// Get all timer names
+std::vector<std::string> Timing::getTimerNames() const {
+    std::vector<std::string> names;
+    names.reserve(TimerList.size());
+    for (unsigned int i = 0; i < TimerList.size(); ++i) {
+        names.push_back(TimerList[i]->name);
+    }
+    return names;
+}
+
+// Dump all measurements to CSV (default format)
+void Timing::dumpToCSV(const std::string& filename) {
+    dumpToCSV(filename, ",", true);
+}
+
+// Dump all measurements to CSV with custom options
+void Timing::dumpToCSV(const std::string& filename, const std::string& delimiter,
+                       bool includeHeader) {
+    const int rank     = ippl::Comm->rank();
+    const int numRanks = ippl::Comm->size();
+
+    // Each rank serialises its measurements into a local buffer.
+    std::ostringstream localData;
+    for (unsigned int i = 0; i < TimerList.size(); ++i) {
+        TimerInfo* tptr                         = TimerList[i].get();
+        const std::string& timerName            = tptr->name;
+        const std::vector<double>& measurements = tptr->measurements;
+        for (size_t j = 0; j < measurements.size(); ++j) {
+            localData << timerName << delimiter << rank << delimiter << j << delimiter
+                      << std::setprecision(12) << measurements[j] << "\n";
+        }
+    }
+    const std::string localStr = localData.str();
+
+    // Gather sizes from every rank using a single collective rather than
+    // O(P) point-to-point pairs.
+    MPI_Aint localSize = static_cast<MPI_Aint>(localStr.size());
+    std::vector<MPI_Aint> sizes(numRanks, 0);
+    MPI_Gather(&localSize, 1, MPI_AINT, sizes.data(), 1, MPI_AINT, 0,
+               ippl::Comm->getCommunicator());
+
+    // Compute displacements and total size on rank 0; all ranks supply their
+    // bytes via MPI_Gatherv.
+    std::vector<int> intSizes(numRanks, 0);
+    std::vector<int> displs(numRanks, 0);
+    std::vector<char> all;
+    if (rank == 0) {
+        long long total = 0;
+        for (int r = 0; r < numRanks; ++r) {
+            // The MPI standard limits Gatherv counts to int. Real CSV blobs
+            // never exceed INT_MAX in practice; refuse rather than silently
+            // truncate if they do.
+            if (sizes[r] > static_cast<MPI_Aint>(std::numeric_limits<int>::max())) {
+                std::cerr << "Timing::dumpToCSV: rank " << r << " has " << sizes[r]
+                          << " bytes (> INT_MAX); truncating\n";
+            }
+            intSizes[r] = static_cast<int>(std::min<MPI_Aint>(
+                sizes[r], static_cast<MPI_Aint>(std::numeric_limits<int>::max())));
+            displs[r]   = static_cast<int>(total);
+            total += intSizes[r];
+        }
+        all.resize(static_cast<size_t>(total));
+    }
+
+    MPI_Gatherv(localStr.data(), static_cast<int>(localStr.size()), MPI_CHAR, all.data(),
+                intSizes.data(), displs.data(), MPI_CHAR, 0, ippl::Comm->getCommunicator());
+
+    if (rank == 0) {
+        std::ofstream outFile(filename);
+        if (includeHeader) {
+            outFile << "timer_name" << delimiter << "rank" << delimiter << "measurement_id"
+                    << delimiter << "duration_seconds" << "\n";
+        }
+        outFile.write(all.data(), static_cast<std::streamsize>(all.size()));
+    }
+}
+
 // print out the timing results
 void Timing::print() {
     if (TimerList.size() < 1)
@@ -153,8 +263,55 @@ void Timing::print() {
             << std::string().assign(20, ' ') << " Wall min = " << std::setw(10) << wallmin << "\n"
             << "\n";
     }
+
+    msg << "---------------------------------------------\n";
+    msg << "     Measurement counts:\n";
+    msg << "---------------------------------------------\n";
+    for (unsigned int i = 0; i < TimerList.size(); ++i) {
+        TimerInfo* tptr = TimerList[i].get();
+        size_t lengthName = std::min(tptr->name.length(), 19lu);
+        msg << tptr->name.substr(0, lengthName) << std::string().assign(20 - lengthName, '.')
+            << " Count = " << std::setw(10) << tptr->measurements.size() << "\n";
+    }
+
     msg << "---------------------------------------------";
     msg << endl;
+
+    // Per-occurrence statistics
+    msg << "---------------------------------------------\n";
+    msg << "     Per-occurrence statistics:\n";
+    msg << "---------------------------------------------\n";
+    for (unsigned int i = 0; i < TimerList.size(); ++i) {
+        TimerInfo* tptr = TimerList[i].get();
+        const std::vector<double>& m = tptr->measurements;
+        size_t count = m.size();
+        if (count == 0) continue;
+
+        double sum = 0.0, minVal = m[0], maxVal = m[0];
+        for (size_t j = 0; j < count; ++j) {
+            sum += m[j];
+            if (m[j] < minVal) minVal = m[j];
+            if (m[j] > maxVal) maxVal = m[j];
+        }
+        double mean = sum / count;
+
+        double varSum = 0.0;
+        for (size_t j = 0; j < count; ++j) {
+            double diff = m[j] - mean;
+            varSum += diff * diff;
+        }
+        double stddev = (count > 1) ? std::sqrt(varSum / (count - 1)) : 0.0;
+
+        size_t lengthName = std::min(tptr->name.length(), 19lu);
+        msg << tptr->name.substr(0, lengthName) << std::string().assign(20 - lengthName, '.')
+            << " n=" << std::setw(5) << count
+            << "  mean=" << std::setw(10) << mean
+            << "  std=" << std::setw(10) << stddev
+            << "  min=" << std::setw(10) << minVal
+            << "  max=" << std::setw(10) << maxVal
+            << "\n";
+    }
+    msg << "---------------------------------------------" << endl;
 }
 
 // save the timing results into a file
@@ -234,4 +391,4 @@ void IpplTimings::pop() {
     delete instance;
     instance = stashedInstance.top();
     stashedInstance.pop();
-}
+}
\ No newline at end of file
diff --git a/src/Utility/IpplTimings.h b/src/Utility/IpplTimings.h
index aea1332cd..82230faea 100644
--- a/src/Utility/IpplTimings.h
+++ b/src/Utility/IpplTimings.h
@@ -21,6 +21,14 @@
 //    4) print out the results:
 //       IpplTimings::print();
 //
+//    5) reset all timers (for warmup):
+//       IpplTimings::resetAllTimers();
+//    Clears all accumulated times and individual measurements.
+//
+//    6) export to CSV for violin plots:
+//       IpplTimings::dumpToCSV("output.csv");
+//    Exports all individual timing measurements for statistical analysis.
+//
 #ifndef IPPL_TIMINGS_H
 #define IPPL_TIMINGS_H
 
@@ -45,7 +53,8 @@ class IpplTimerInfo {
     IpplTimerInfo()
         : name("")
         , wallTime(0.0)
-        , indx(std::numeric_limits<TimerRef>::max()) {
+        , indx(std::numeric_limits<TimerRef>::max())
+        , measurements() {
         clear();
     }
 
@@ -55,18 +64,19 @@ class IpplTimerInfo {
     // timer operations
     void start() {
         if (!running) {
-            running = true;
-            t.stop();
-            t.clear();
+            t.clear();  // running was false, no need to stop first
             t.start();
+            running = true;
         }
     }
 
     void stop() {
         if (running) {
             t.stop();
-            running = false;
-            wallTime += t.elapsed();
+            running              = false;
+            const double elapsed = t.elapsed();
+            wallTime += elapsed;
+            measurements.push_back(elapsed);
         }
     }
 
@@ -76,6 +86,12 @@ class IpplTimerInfo {
         running = false;
     }
 
+    void clearAll() {
+        clear();
+        wallTime = 0.0;
+        measurements.clear();
+    }
+
     // the IPPL timer that this object manages
     Timer t;
 
@@ -90,6 +106,8 @@ class IpplTimerInfo {
 
     // an index value for this timer
     TimerRef indx;
+
+    std::vector<double> measurements;
 };
 
 struct Timing {
@@ -126,6 +144,20 @@ struct Timing {
     // print the results to a file
     void print(const std::string& fn, const std::map<std::string, unsigned int>& problemSize);
 
+    void resetAllTimers();
+
+    // Format: timer_name,rank,measurement_id,duration
+    void dumpToCSV(const std::string& filename);
+
+    void dumpToCSV(const std::string& filename, const std::string& delimiter, bool includeHeader);
+
+    const std::vector<double>& getMeasurements(TimerRef t) const;
+    const std::vector<double>& getMeasurements(const std::string& name) const;
+
+    size_t getMeasurementCount(TimerRef t) const;
+
+    std::vector<std::string> getTimerNames() const;
+
     // type of storage for list of TimerInfo
     typedef std::vector<my_auto_ptr<TimerInfo> > TimerList_t;
     typedef std::map<std::string, TimerInfo*> TimerMap_t;
@@ -136,6 +168,9 @@ struct Timing {
 
     // a map of timers, keyed by string
     TimerMap_t TimerMap;
+
+    // Returned by reference when a requested timer is not found.
+    static const std::vector<double> emptyMeasurements;
 };
 
 class IpplTimings {
@@ -173,6 +208,36 @@ class IpplTimings {
     static void stash();
     static void pop();
 
+    // Use this after warmup iterations to get clean timing data
+    static void resetAllTimers() { instance->resetAllTimers(); }
+
+    // Format: timer_name,rank,measurement_id,duration_seconds
+    // Perfect for creating violin plots in Python/R
+    static void dumpToCSV(const std::string& filename) {
+        instance->dumpToCSV(filename);
+    }
+
+    static void dumpToCSV(const std::string& filename,
+                         const std::string& delimiter,
+                         bool includeHeader = true) {
+        instance->dumpToCSV(filename, delimiter, includeHeader);
+    }
+
+    static const std::vector<double>& getMeasurements(TimerRef t) {
+        return instance->getMeasurements(t);
+    }
+    static const std::vector<double>& getMeasurements(const std::string& name) {
+        return instance->getMeasurements(name);
+    }
+
+    static size_t getMeasurementCount(TimerRef t) {
+        return instance->getMeasurementCount(t);
+    }
+
+    static std::vector<std::string> getTimerNames() {
+        return instance->getTimerNames();
+    }
+
 private:
     // type of storage for list of TimerInfo
     typedef Timing::TimerList_t TimerList_t;
@@ -188,4 +253,4 @@ class IpplTimings {
     static std::stack<Timing*> stashedInstance;
 };
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/Utility/ParallelDispatch.h b/src/Utility/ParallelDispatch.h
index 437c56c22..4e342aa02 100644
--- a/src/Utility/ParallelDispatch.h
+++ b/src/Utility/ParallelDispatch.h
@@ -7,6 +7,7 @@
 #define IPPL_PARALLEL_DISPATCH_H
 
 #include <Kokkos_Core.hpp>
+#include "Ippl.h"
 
 #include <tuple>
 
@@ -68,8 +69,8 @@ namespace ippl {
             }
             return policy_type(begin, end);
         }
-        // Silences incorrect nvcc warning: missing return statement at end of non-void function
-        throw IpplException("detail::getRangePolicy", "Unreachable state");
+        // Silences nvcc "missing return" at end of exhaustive if constexpr.
+        __builtin_unreachable();
     }
 
     /*!
@@ -94,8 +95,58 @@ namespace ippl {
         } else {
             return policy_type(begin, end);
         }
-        // Silences incorrect nvcc warning: missing return statement at end of non-void function
-        throw IpplException("detail::createRangePolicy", "Unreachable state");
+        // Silences nvcc "missing return" at end of exhaustive if constexpr.
+        __builtin_unreachable();
+    }
+
+    template <int... Is, typename F>
+    KOKKOS_FORCEINLINE_FUNCTION void for_constexpr(std::integer_sequence<int, Is...>, F&& f) {
+        (f.template operator()<Is>(), ...);
+    }
+
+    template <int... Is, typename F>
+    KOKKOS_FORCEINLINE_FUNCTION auto product_over(std::integer_sequence<int, Is...>, F&& f) {
+        return (f.template operator()<Is>() * ...);
+    }
+
+    template <int N, typename F>
+    KOKKOS_FORCEINLINE_FUNCTION auto product_over(F&& f) {
+        return product_over(std::make_integer_sequence<int, N>{}, std::forward<F>(f));
+    }
+
+    template <int Dim, typename Team, typename Extents, typename F>
+    KOKKOS_FORCEINLINE_FUNCTION void thread_vector_md_for(const Team& team, const Extents& extents,
+                                                          F&& f) {
+        [&]<int... Is>(std::integer_sequence<int, Is...>) {
+            Kokkos::parallel_for(
+                Kokkos::ThreadVectorMDRange<Kokkos::Rank<Dim>, Team>(team, extents[Is]...),
+                std::forward<F>(f));
+        }(std::make_integer_sequence<int, Dim>{});
+    }
+
+    template <int Dim, int W, typename Team, typename F>
+    KOKKOS_FORCEINLINE_FUNCTION void thread_vector_stencil_for(const Team& team, F&& f) {
+        constexpr int TotalCells = [] {
+            int r = 1;
+            for (int i = 0; i < Dim; ++i)
+                r *= W;
+            return r;
+        }();
+
+        Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, TotalCells), [&](const int flat_idx) {
+            // Compile-time unrollable index decomposition
+            Kokkos::Array<int, Dim> idx;
+            int tmp = flat_idx;
+            for (int d = Dim - 1; d >= 0; --d) {
+                idx[d] = tmp % W;
+                tmp /= W;
+            }
+
+            // Call with index pack
+            [&]<int... Is>(std::integer_sequence<int, Is...>) {
+                f(idx[Is]...);
+            }(std::make_integer_sequence<int, Dim>{});
+        });
     }
 
     namespace detail {
@@ -121,13 +172,50 @@ namespace ippl {
 
         enum e_functor_type {
             FOR,
-            REDUCE,
-            SCAN
+            REDUCE
         };
 
         template <e_functor_type, typename, typename, typename, typename...>
         struct FunctorWrapper;
 
+        template <typename ExecSpace>
+        constexpr bool inline isGPUSpace = false;
+
+#ifdef KOKKOS_ENABLE_CUDA
+        template <>
+        constexpr bool inline isGPUSpace<Kokkos::Cuda> = true;
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+        template <>
+        constexpr bool inline isGPUSpace<Kokkos::HIP> = true;
+#endif
+
+        // Dispatches F(i) for i in [0, n) either in parallel (OpenMP host)
+        // when MPI_THREAD_MULTIPLE is available, or serially otherwise.
+        // `f(i)` must be synchronous w.r.t. its own work — this dispatcher
+        // fences after the parallel_for so the caller's next operation sees
+        // a consistent state.
+        template <typename F>
+        void parallelForMPI(size_t n, F&& f) {
+            constexpr bool useGPU = isGPUSpace<Kokkos::DefaultExecutionSpace>;
+            const bool threadSafe = Env->threadMultiple();
+
+            if constexpr (useGPU) {
+                if (threadSafe) {
+                    Kokkos::parallel_for(
+                        "Parallel dispatch",
+                        Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n), [&](int i) {
+                            f(i);
+                        });
+                    Kokkos::fence();
+                    return;
+                }
+            }
+            for (size_t i = 0; i < n; ++i) {
+                f(i);
+            }
+        }
+
         /*!
          * Wrapper struct for reduction kernels
          * Source:
@@ -179,9 +267,7 @@ namespace ippl {
             static constexpr int rank = Kokkos::MDRangePolicy<T...>::rank;
         };
         template <typename T>
-        concept HasMemberValueType = requires() {
-                                         { typename T::value_type() };
-                                     };
+        concept HasMemberValueType = requires { typename T::value_type; };
         template <typename T>
         struct ExtractReducerReturnType {
             using type = T;
@@ -227,6 +313,31 @@ namespace ippl {
                 functor),
             std::forward<ReducerArgument>(reducer)...);
     }
+
+    template <std::size_t I, typename T, typename... Rest>
+    KOKKOS_FORCEINLINE_FUNCTION auto get_arg(T first, Rest... rest) {
+        if constexpr (I == 0) {
+            return first;
+        } else {
+            return get_arg<I - 1>(rest...);
+        }
+    }
+
+    template <typename Grid, typename Base, typename... Stencils>
+    KOKKOS_FORCEINLINE_FUNCTION decltype(auto) grid_at(Grid& grid, const Base& base, int team_rank,
+                                                       Stencils... stencil_idx) {
+        return [&]<std::size_t... Is>(std::index_sequence<Is...>) -> decltype(auto) {
+            return grid((base(team_rank, Is) + get_arg<Is>(stencil_idx...))...);
+        }(std::index_sequence_for<Stencils...>{});
+    }
+
+    template <int D, typename Grid, typename Base, typename Stencils>
+    KOKKOS_FORCEINLINE_FUNCTION decltype(auto) grid_at_t(Grid& grid, const Base& base,
+                                                         int team_rank, Stencils& stencil_idx) {
+        return [&]<std::size_t... Is>(std::index_sequence<Is...>) -> decltype(auto) {
+            return grid((base(team_rank, Is) + stencil_idx[Is])...);
+        }(std::make_index_sequence<D>{});
+    }
 }  // namespace ippl
 
 #endif
diff --git a/src/Utility/ParameterList.h b/src/Utility/ParameterList.h
index d31f46d5d..ad9a2da11 100644
--- a/src/Utility/ParameterList.h
+++ b/src/Utility/ParameterList.h
@@ -64,6 +64,8 @@ namespace ippl {
             return std::get<T>(params_m.at(key));
         }
 
+        bool contains(const std::string& key) const { return params_m.contains(key); }
+
         /*!
          * Obtain the value of a parameter. If the key is
          * not contained, the default value is returned.
diff --git a/src/Utility/Timer.cpp b/src/Utility/Timer.cpp
index d6da687bd..de2b4335e 100644
--- a/src/Utility/Timer.cpp
+++ b/src/Utility/Timer.cpp
@@ -8,8 +8,17 @@
 
 #include "Timer.h"
 
+#include <chrono>
+
 bool Timer::enableFences = IPPL_ENABLE_TIMER_FENCES;
 
+namespace {
+    using clock_type = std::chrono::high_resolution_clock;
+    inline std::int64_t now_ticks() {
+        return clock_type::now().time_since_epoch().count();
+    }
+}  // namespace
+
 Timer::Timer() {
     this->clear();
 }
@@ -19,16 +28,22 @@ void Timer::clear() {
 }
 
 void Timer::start() {
-    start_m = std::chrono::high_resolution_clock::now();
+    // Fence on start as well: any in-flight kernel from before the timer was
+    // armed otherwise leaks its tail latency into the measured interval.
+    if (enableFences) {
+        Kokkos::fence("Timer Fence (start)");
+    }
+    start_m = now_ticks();
 }
 
 void Timer::stop() {
     if (enableFences) {
-        Kokkos::fence();
+        Kokkos::fence("Timer Fence (stop)");
     }
-    stop_m = std::chrono::high_resolution_clock::now();
+    stop_m = now_ticks();
 
-    duration_type elapsed = stop_m - start_m;
+    std::chrono::duration<double> elapsed =
+        clock_type::duration(stop_m) - clock_type::duration(start_m);
 
     elapsed_m += elapsed.count();
 }
diff --git a/src/Utility/Timer.h b/src/Utility/Timer.h
index a23d26f28..653eae588 100644
--- a/src/Utility/Timer.h
+++ b/src/Utility/Timer.h
@@ -8,17 +8,14 @@
 #define IPPL_TIMER_H
 
 #ifndef IPPL_ENABLE_TIMER_FENCES
-#warning "IPPL timer fences were not set via CMake! Defaulting to no fences."
+#pragma message("IPPL_ENABLE_TIMER_FENCES not set by CMake; defaulting to false.")
 #define IPPL_ENABLE_TIMER_FENCES false
 #endif
 
-#include <chrono>
+#include <cstdint>
 
 class Timer {
 public:
-    using timer_type    = std::chrono::time_point<std::chrono::high_resolution_clock>;
-    using duration_type = std::chrono::duration<double>;
-
     static bool enableFences;
 
     Timer();
@@ -31,7 +28,7 @@ class Timer {
 
 private:
     double elapsed_m;
-    timer_type start_m, stop_m;
+    std::int64_t start_m = 0, stop_m = 0;
 };
 
 #endif
diff --git a/src/Utility/Tuning.h b/src/Utility/Tuning.h
new file mode 100644
index 000000000..8f5177b69
--- /dev/null
+++ b/src/Utility/Tuning.h
@@ -0,0 +1,213 @@
+#ifndef IPPL_TUNING_H
+#define IPPL_TUNING_H
+
+#include <Kokkos_Core.hpp>
+#include <algorithm>
+#include <functional>
+#include <limits>
+#include <vector>
+
+// Kokkos's tuning interface is still exposed only through this internal
+// header in 5.0.x. When a public header (Kokkos_Tools.hpp) becomes
+// available across the supported version range, switch to it.
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+namespace ippl {
+
+template <unsigned Dim, typename TileType>
+class TileSizeTuner {
+public:
+    using TileConfig = TileType;
+
+private:
+    // Tuning parameters (following Kokkos convention)
+    static constexpr double tuning_min  = 0.0;
+    static constexpr double tuning_max  = 1.0;
+    static constexpr double tuning_step = 0.05;  // 20 steps per dimension
+
+    std::array<size_t, Dim> variable_ids_{};
+    size_t context_id_   = 0;
+    bool initialized_    = false;
+    bool context_active_ = false;
+
+    std::vector<int> candidates_;
+    TileConfig default_tile_;
+    size_t max_scratch_ = 0;
+    std::function<size_t(const TileConfig&)> scratch_calc_;
+
+public:
+    TileSizeTuner() = default;
+
+    template <typename ScratchCalculator>
+    void initialize(const std::string& kernel_name,
+                    const std::vector<int>& candidates,
+                    size_t max_scratch,
+                    ScratchCalculator&& calc,
+                    const TileConfig& default_tile) {
+        if (initialized_) return;
+
+        candidates_    = candidates;
+        max_scratch_   = max_scratch;
+        scratch_calc_  = std::forward<ScratchCalculator>(calc);
+        default_tile_  = default_tile;
+
+        // Sort candidates for consistent mapping
+        std::sort(candidates_.begin(), candidates_.end());
+
+        if (Kokkos::Tools::Experimental::have_tuning_tool()) {
+            using namespace Kokkos::Tools::Experimental;
+
+            for (unsigned d = 0; d < Dim; ++d) {
+                VariableInfo info;
+                info.type          = ValueType::kokkos_value_double;
+                info.category      = StatisticalCategory::kokkos_value_interval;
+                info.valueQuantity = CandidateValueType::kokkos_value_range;
+                info.candidates    = make_candidate_range(
+                    tuning_min, tuning_max, tuning_step, false, false);
+
+                variable_ids_[d] = declare_output_type(
+                    kernel_name + "_tile_size_" + std::to_string(d), info);
+            }
+        }
+
+        initialized_ = true;
+    }
+
+    bool is_initialized() const { return initialized_; }
+
+    // Map normalized value [0,1] to candidate index. With no candidates the
+    // tuner has nothing to choose from, so return 1 as a defensive default.
+    int map_to_candidate(double normalized) const {
+        if (candidates_.empty()) return 1;
+
+        normalized = std::clamp(normalized, 0.0, 1.0);
+
+        size_t idx = static_cast<size_t>(normalized * (candidates_.size() - 1) + 0.5);
+        idx        = std::min(idx, candidates_.size() - 1);
+
+        return candidates_[idx];
+    }
+
+    // Map candidate value back to normalized [0,1]. Returns 0.0 (matching the
+    // behaviour of an empty/single-element candidate set) when the candidate
+    // list cannot meaningfully be mapped.
+    double map_to_normalized(int candidate) const {
+        if (candidates_.size() <= 1) return 0.0;
+
+        auto it    = std::lower_bound(candidates_.begin(), candidates_.end(), candidate);
+        size_t idx = (it != candidates_.end()) ? std::distance(candidates_.begin(), it)
+                                               : candidates_.size() - 1;
+
+        return static_cast<double>(idx) / (candidates_.size() - 1);
+    }
+
+    TileConfig begin() {
+        if (!initialized_) {
+            return default_tile_;
+        }
+
+        TileConfig result = default_tile_;
+
+        if (Kokkos::Tools::Experimental::have_tuning_tool()) {
+            using namespace Kokkos::Tools::Experimental;
+
+            context_id_ = get_new_context_id();
+            begin_context(context_id_);
+            context_active_ = true;
+
+            // Request tuned values for each dimension
+            std::array<VariableValue, Dim> values;
+            for (unsigned d = 0; d < Dim; ++d) {
+                double default_norm = map_to_normalized(default_tile_[d]);
+                values[d] = make_variable_value(variable_ids_[d], default_norm);
+            }
+
+            request_output_values(context_id_, Dim, values.data());
+
+            // Map normalized values back to tile sizes
+            for (unsigned d = 0; d < Dim; ++d) {
+                result[d] = map_to_candidate(values[d].value.double_value);
+            }
+
+            // Validate scratch constraint - scale down if needed
+            result = fit_to_scratch(result);
+        }
+
+        return result;
+    }
+
+    void end() {
+        if (context_active_ && Kokkos::Tools::Experimental::have_tuning_tool()) {
+            Kokkos::Tools::Experimental::end_context(context_id_);
+            context_active_ = false;
+        }
+    }
+
+private:
+    // Scale down proportionally if tile doesn't fit in scratch.
+    TileConfig fit_to_scratch(const TileConfig& tile) const {
+        if (scratch_calc_(tile) <= max_scratch_) {
+            return tile;
+        }
+
+        // Binary search for the largest scale factor that fits. The result
+        // space is integer-valued (we snap to a discrete candidate after
+        // scaling), so the number of meaningful iterations is bounded by
+        // log2(max tile dim), capped at 20 for robustness.
+        int max_tile = 1;
+        for (unsigned d = 0; d < Dim; ++d) {
+            max_tile = std::max(max_tile, tile[d]);
+        }
+        int max_iter = std::min(20, static_cast<int>(std::ceil(std::log2(max_tile + 1)) + 1));
+
+        TileConfig result = tile;
+        double lo = 0.0, hi = 1.0;
+        for (int iter = 0; iter < max_iter; ++iter) {
+            double mid = (lo + hi) / 2.0;
+
+            TileConfig test;
+            for (unsigned d = 0; d < Dim; ++d) {
+                int scaled = static_cast<int>(tile[d] * mid);
+                test[d]    = snap_to_candidate(std::max(1, scaled));
+            }
+
+            if (scratch_calc_(test) <= max_scratch_) {
+                lo     = mid;
+                result = test;
+            } else {
+                hi = mid;
+            }
+        }
+
+        // Final fallback - smallest candidate. If even that overflows the
+        // scratch budget the kernel cannot run; throw so the caller doesn't
+        // get a confusing Kokkos::abort deeper in the dispatch.
+        if (scratch_calc_(result) > max_scratch_) {
+            for (unsigned d = 0; d < Dim; ++d) {
+                result[d] = candidates_.front();
+            }
+            if (scratch_calc_(result) > max_scratch_) {
+                throw std::runtime_error(
+                    "TileSizeTuner::fit_to_scratch: even the smallest tile "
+                    "exceeds the scratch budget");
+            }
+        }
+
+        return result;
+    }
+
+    int snap_to_candidate(int value) const {
+        if (candidates_.empty()) return value;
+
+        // Find largest candidate <= value
+        auto it = std::upper_bound(candidates_.begin(), candidates_.end(), value);
+        if (it == candidates_.begin()) {
+            return candidates_.front();
+        }
+        return *std::prev(it);
+    }
+};
+
+}  // namespace ippl
+
+#endif  // IPPL_TUNING_H
\ No newline at end of file
diff --git a/src/Utility/TypeUtils.h b/src/Utility/TypeUtils.h
index 9b47a3729..bc888e6d8 100644
--- a/src/Utility/TypeUtils.h
+++ b/src/Utility/TypeUtils.h
@@ -295,10 +295,9 @@ namespace ippl {
                 } else {
                     return spaceToIndex<Space, Idx + 1>();
                 }
-                // Silences incorrect nvcc warning: missing return statement at end of non-void
-                // function
-                throw IpplException("detail::MultispaceContainer::spaceToIndex",
-                                    "Unreachable state");
+                // Suppress nvcc "missing return" warning at the end of an
+                // exhaustive if constexpr chain.
+                __builtin_unreachable();
             }
 
             /*!
@@ -311,18 +310,16 @@ namespace ippl {
 
             /*!
              * Determine whether the element for a space should be initialized,
-             * possibly based on a predicate functor
+             * possibly based on a predicate functor. A nullptr_t Filter
+             * unconditionally enables the copy.
              */
-            template <typename MemorySpace, typename Filter,
-                      std::enable_if_t<std::is_null_pointer_v<std::decay_t<Filter>>, int> = 0>
-            constexpr bool copyToSpace(Filter&&) {
-                return true;
-            }
-
-            template <typename MemorySpace, typename Filter,
-                      std::enable_if_t<!std::is_null_pointer_v<std::decay_t<Filter>>, int> = 0>
+            template <typename MemorySpace, typename Filter>
             bool copyToSpace(Filter&& predicate) {
-                return predicate.template operator()<MemorySpace>();
+                if constexpr (std::is_null_pointer_v<std::decay_t<Filter>>) {
+                    return true;
+                } else {
+                    return predicate.template operator()<MemorySpace>();
+                }
             }
 
         public:
@@ -341,10 +338,22 @@ namespace ippl {
             template <typename DataType, typename Filter = std::nullptr_t>
             MultispaceContainer(const DataType& data, Filter&& predicate = nullptr)
                 : MultispaceContainer() {
-                using space = typename DataType::memory_space;
-                static_assert(std::is_same_v<DataType, Type<space>>);
-
-                elements_m[spaceToIndex<space>()] = data;
+                static_assert(Kokkos::is_view<DataType>::value, "DataType must be a Kokkos::View");
+
+                using space    = typename DataType::memory_space;
+                using expected = Type<space>;
+
+                static_assert(std::is_same_v<typename DataType::memory_space,
+                                             typename expected::memory_space>,
+                              "Hash view must live in the same memory_space as expected.");
+                static_assert(std::is_same_v<typename DataType::non_const_value_type,
+                                             typename expected::non_const_value_type>,
+                              "Hash view must have the same value_type as expected.");
+                static_assert(DataType::rank == expected::rank,
+                              "Hash view must have the same rank as expected.");
+
+                expected normalized = data;
+                elements_m[spaceToIndex<space>()] = normalized;
                 copyToOtherSpaces<space>(predicate);
             }
 
@@ -433,6 +442,40 @@ namespace ippl {
             runner(all_spaces{});
         }
     }  // namespace detail
+
+    template <typename T>
+    struct is_complex : std::false_type {};
+
+    template <typename T>
+    struct is_complex<Kokkos::complex<T>> : std::true_type {};
+
+    template <typename T>
+    inline constexpr bool is_complex_v = is_complex<T>::value;
+
+    template <typename T>
+    KOKKOS_FORCEINLINE_FUNCTION decltype(auto) real_part(T& val) {
+        if constexpr (is_complex_v<std::remove_cv_t<T>>) {
+            return val.real();
+        } else {
+            return val;
+        }
+    }
+
+    template <typename GridT, typename T>
+    KOKKOS_FORCEINLINE_FUNCTION decltype(auto) to_grid_value(T& val) {
+        if constexpr (is_complex_v<std::remove_cv_t<T>> && !is_complex_v<std::remove_cv_t<GridT>>) {
+            return val.real();
+        } else {
+            return val;
+        }
+    }
+
+    constexpr int int_pow(int base, int exp) {
+        int result = 1;
+        for (int i = 0; i < exp; ++i)
+            result *= base;
+        return result;
+    }
 }  // namespace ippl
 
 #endif
diff --git a/src/Utility/ViewUtils.h b/src/Utility/ViewUtils.h
index 6ef5e8906..dbb36b2e7 100644
--- a/src/Utility/ViewUtils.h
+++ b/src/Utility/ViewUtils.h
@@ -85,8 +85,7 @@ namespace ippl {
         template <typename T, unsigned Dim, class... Properties>
         void write(const typename ViewType<T, Dim, Properties...>::view_type& view,
                    std::ostream& out = std::cout) {
-            using view_type = typename ViewType<T, Dim, Properties...>::view_type;
-            typename view_type::host_mirror_type hview = Kokkos::create_mirror_view(view);
+            auto hview = Kokkos::create_mirror_view(view);
             Kokkos::deep_copy(hview, view);
 
             nestedViewLoop(
diff --git a/test/particle/PICnd.cpp b/test/particle/PICnd.cpp
index c3e6e93ed..03a9bc369 100644
--- a/test/particle/PICnd.cpp
+++ b/test/particle/PICnd.cpp
@@ -265,7 +265,7 @@ class ChargedParticles : public ippl::ParticleBase<PLayout> {
     }
 
     void dumpData(int iteration) {
-        ParticleAttrib<Vector_t>::view_type& view = P.getView();
+        auto view = P.getView();
 
         double Energy = 0.0;
 
diff --git a/test/particle/benchmarkParticleUpdate.cpp b/test/particle/benchmarkParticleUpdate.cpp
index 857e637c1..e1a654d99 100644
--- a/test/particle/benchmarkParticleUpdate.cpp
+++ b/test/particle/benchmarkParticleUpdate.cpp
@@ -94,7 +94,7 @@ class ChargedParticles : public ippl::ParticleBase<PLayout> {
     void dumpData(int iteration) {
         double Energy = 0.0;
 
-        ParticleAttrib<Vector_t>::view_type& view = P.getView();
+        auto view = P.getView();
         Inform csvout(NULL, "data/energy.csv", Inform::APPEND);
         csvout.precision(10);
         csvout.setf(std::ios::scientific, std::ios::floatfield);
@@ -223,9 +223,9 @@ int main(int argc, char* argv[]) {
             IpplTimings::startTimer(RandPTimer);
             std::mt19937_64 engP;
             engP.seed(42 + 10 * it + 100 * ippl::Comm->rank());
-            Kokkos::resize(P_host, P->P.size());
+            Kokkos::resize(P_host, P->getLocalNum());
             double sum_coord = 0.0;
-            Kokkos::resize(R_host, P->R.size());
+            Kokkos::resize(R_host, P->getLocalNum());
             Kokkos::deep_copy(R_host, P->R.getView());
             for (unsigned long int i = 0; i < P->getLocalNum(); i++) {
                 for (int d = 0; d < 3; d++) {
diff --git a/unit_tests/Communicate/BufferHandler.cpp b/unit_tests/Communicate/BufferHandler.cpp
index eef387da9..4612373e7 100644
--- a/unit_tests/Communicate/BufferHandler.cpp
+++ b/unit_tests/Communicate/BufferHandler.cpp
@@ -13,6 +13,14 @@ template <typename MemorySpace>
 class TypedBufferHandlerTest : public ::testing::Test {
 protected:
     using memory_space = MemorySpace;
+    static constexpr size_t page_size = 4096;
+
+    static size_t roundedSize(size_t size) {
+        if (size < page_size) {
+            return page_size;
+        }
+        return ((size + page_size - 1) / page_size) * page_size;
+    }
 
     class TestableBufferHandler : public ippl::DefaultBufferHandler<memory_space> {
     public:
@@ -37,7 +45,7 @@ TYPED_TEST_SUITE(TypedBufferHandlerTest, MemorySpaces);
 TYPED_TEST(TypedBufferHandlerTest, GetBuffer_EmptyFreeBuffers) {
     auto buffer = this->handler->getBuffer(100, 1.0);
     ASSERT_NE(buffer, nullptr);
-    EXPECT_EQ(buffer->getBufferSize(), 100);
+    EXPECT_EQ(buffer->getBufferSize(), TestFixture::roundedSize(100));
     EXPECT_EQ(this->handler->usedBuffersSize(), 1);
     EXPECT_EQ(this->handler->freeBuffersSize(), 0);
 }
@@ -48,7 +56,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetBuffer_SuitableBufferAvailable) {
     this->handler->freeBuffer(buffer1);
 
     auto buffer2 = this->handler->getBuffer(40, 1.0);
-    EXPECT_EQ(buffer2->getBufferSize(), 50);
+    EXPECT_EQ(buffer2->getBufferSize(), TestFixture::roundedSize(50));
     EXPECT_EQ(this->handler->usedBuffersSize(), 1);
     EXPECT_EQ(this->handler->freeBuffersSize(), 0);
 }
@@ -91,7 +99,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetBuffer_ResizeLargerThanAvailable) {
     this->handler->freeBuffer(smallBuffer);
 
     auto largeBuffer = this->handler->getBuffer(200, 1.0);
-    EXPECT_EQ(largeBuffer->getBufferSize(), 200);
+    EXPECT_EQ(largeBuffer->getBufferSize(), TestFixture::roundedSize(200));
     EXPECT_EQ(this->handler->usedBuffersSize(), 1);
     EXPECT_EQ(this->handler->freeBuffersSize(), 0);
 }
@@ -102,7 +110,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetBuffer_ExactSizeMatch) {
     this->handler->freeBuffer(buffer1);
 
     auto buffer2 = this->handler->getBuffer(100, 1.0);
-    EXPECT_EQ(buffer2->getBufferSize(), 100);
+    EXPECT_EQ(buffer2->getBufferSize(), TestFixture::roundedSize(100));
     EXPECT_EQ(this->handler->usedBuffersSize(), 1);
     EXPECT_EQ(this->handler->freeBuffersSize(), 0);
 }
@@ -147,7 +155,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_EmptyHandler) {
 // Test: Allocating increases used buffer size correctly
 TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_AfterBufferAllocation) {
     auto buffer = this->handler->getBuffer(100, 1.0);
-    EXPECT_EQ(this->handler->getUsedSize(), 100);
+    EXPECT_EQ(this->handler->getUsedSize(), buffer->getBufferSize());
     EXPECT_EQ(this->handler->getFreeSize(), 0);
 }
 
@@ -157,7 +165,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_AfterFreeBuffer) {
     this->handler->freeBuffer(buffer);
 
     EXPECT_EQ(this->handler->getUsedSize(), 0);
-    EXPECT_EQ(this->handler->getFreeSize(), 100);
+    EXPECT_EQ(this->handler->getFreeSize(), buffer->getBufferSize());
 }
 
 // Test: Correct size is computed after freeing all buffers
@@ -168,7 +176,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_AfterFreeAllBuffers)
     this->handler->freeAllBuffers();
 
     EXPECT_EQ(this->handler->getUsedSize(), 0);
-    EXPECT_EQ(this->handler->getFreeSize(), 150);
+    EXPECT_EQ(this->handler->getFreeSize(), buffer1->getBufferSize() + buffer2->getBufferSize());
 }
 
 // Test: Deleting all buffers results in zero free or used buffer sizes
@@ -189,7 +197,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_ResizeBufferLargerTha
 
     auto largeBuffer = this->handler->getBuffer(200, 1.0);
 
-    EXPECT_EQ(this->handler->getUsedSize(), 200);
+    EXPECT_EQ(this->handler->getUsedSize(), largeBuffer->getBufferSize());
     EXPECT_EQ(this->handler->getFreeSize(), 0);
 }
 
diff --git a/unit_tests/Particle/CMakeLists.txt b/unit_tests/Particle/CMakeLists.txt
index 18359b944..78c4e9bfe 100644
--- a/unit_tests/Particle/CMakeLists.txt
+++ b/unit_tests/Particle/CMakeLists.txt
@@ -5,3 +5,5 @@ add_ippl_test(ParticleBase)
 add_ippl_test(ParticleBC)
 add_ippl_test(ParticleSendRecv)
 add_ippl_test(GatherScatterTest)
+add_ippl_test(ParticleUpdate)
+add_ippl_test(ParticleUpdateNonuniform)
diff --git a/unit_tests/Particle/ParticleBase.cpp b/unit_tests/Particle/ParticleBase.cpp
index 276afc25a..b2229a456 100644
--- a/unit_tests/Particle/ParticleBase.cpp
+++ b/unit_tests/Particle/ParticleBase.cpp
@@ -78,6 +78,7 @@ TYPED_TEST(ParticleBaseTest, CreateAndDestroy) {
     pbase->destroy(invalid, 500);
 
     // Verify remaining indices
+    mirror = pbase->ID.getHostMirror();
     Kokkos::deep_copy(mirror, pbase->ID.getView());
     for (int i = 0; i < 500; ++i) {
         // The even indices contain the original particles
diff --git a/unit_tests/Particle/ParticleSendRecv.cpp b/unit_tests/Particle/ParticleSendRecv.cpp
index 9b57b85aa..1d84cad41 100644
--- a/unit_tests/Particle/ParticleSendRecv.cpp
+++ b/unit_tests/Particle/ParticleSendRecv.cpp
@@ -33,7 +33,7 @@ class ParticleSendRecv<Parameters<T, ExecSpace, Rank<Dim>>> : public ::testing::
 
         ~Bunch() = default;
 
-        using charge_container_type = ippl::ParticleAttrib<T, ExecSpace>;
+        using charge_container_type = ippl::ParticleAttrib<T>;
 
         rank_type expectedRank;
         charge_container_type Q;
@@ -64,10 +64,10 @@ class ParticleSendRecv<Parameters<T, ExecSpace, Rank<Dim>>> : public ::testing::
             origin[d] = 0;
         }
 
-        layout  = flayout_type(MPI_COMM_WORLD, owned, isParallel);
-        mesh    = mesh_type(owned, hx, origin);
+        layout      = flayout_type(MPI_COMM_WORLD, owned, isParallel);
+        mesh        = mesh_type(owned, hx, origin);
         playout_ptr = std::make_shared<playout_type>(layout, mesh);
-        bunch = std::make_shared<bunch_type>(*playout_ptr);
+        bunch       = std::make_shared<bunch_type>(*playout_ptr);
 
         using BC = ippl::BC;
 
@@ -109,7 +109,7 @@ class ParticleSendRecv<Parameters<T, ExecSpace, Rank<Dim>>> : public ::testing::
         using mdrange_type = Kokkos::MDRangePolicy<Kokkos::Rank<2>, ExecSpace>;
 
         RegionLayout_t RLayout           = playout_ptr->getRegionLayout();
-        auto& positions                  = bunch->R.getView();
+        auto positions                   = bunch->R.getView();
         region_view Regions              = RLayout.getdLocalRegions();
         typename rank_type::view_type ER = bunch->expectedRank.getView();
 
@@ -149,7 +149,6 @@ TYPED_TEST(ParticleSendRecv, SendAndRecieve) {
     typename TestFixture::rank_type::view_type::host_mirror_type ER_host =
         bunch->expectedRank.getHostMirror();
 
-    Kokkos::resize(ER_host, bunch->expectedRank.size());
     Kokkos::deep_copy(ER_host, bunch->expectedRank.getView());
 
     for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
diff --git a/unit_tests/Particle/ParticleUpdate.cpp b/unit_tests/Particle/ParticleUpdate.cpp
new file mode 100644
index 000000000..f5faa73e8
--- /dev/null
+++ b/unit_tests/Particle/ParticleUpdate.cpp
@@ -0,0 +1,830 @@
+//
+// Unit test: Particle spatial layout update / neighbor migration
+//
+// Tests that ippl::ParticleSpatialLayout::update() correctly migrates
+// particles to the rank that owns the region containing the particle's
+// position.  The suite covers:
+//
+//   1.  Baseline sanity – random particles already on correct rank.
+//   2.  Single deliberate out-of-bounds displacement (each axis / direction).
+//   3.  Corner / edge displacement (multi-axis simultaneous crossing).
+//   4.  Periodic wrap-around: particles placed just outside the global domain
+//       boundary reappear on the opposite rank.
+//   5.  All particles seeded on rank 0 and redistributed globally.
+//   6.  All particles seeded on the last rank and redistributed globally.
+//   7.  Non-uniform layouts: more cells assigned to one rank than the others,
+//       so that a single neighbour may span many grid cells.
+//   8.  Conservation: total particle count is invariant under update().
+//   9.  Charge conservation: sum of Q is invariant under update().
+//  10.  Multiple successive updates leave the layout stable.
+//  11.  Large particle count stress test.
+//  12.  Boundary-exact positions (particles sitting exactly on a rank
+//       boundary should belong to a well-defined rank, not be lost).
+//  13.  Zero-particle ranks: a rank with no particles participates correctly.
+//  14.  Heterogeneous displacement magnitudes in the same step.
+//  15.  3-D corner migration (all three axes crossed simultaneously).
+//
+#include "Ippl.h"
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include "TestUtils.h"
+#include "gtest/gtest.h"
+
+// ============================================================
+//  Helper types
+// ============================================================
+
+template <typename T, typename ExecSpace, unsigned Dim>
+struct TestBunch
+    : public ippl::ParticleBase<
+          ippl::ParticleSpatialLayout<T, Dim, ippl::UniformCartesian<T, Dim>, ExecSpace>> {
+    using playout_type =
+        ippl::ParticleSpatialLayout<T, Dim, ippl::UniformCartesian<T, Dim>, ExecSpace>;
+
+    explicit TestBunch(playout_type& pl)
+        : ippl::ParticleBase<playout_type>(pl) {
+        this->addAttribute(Q);
+        this->addAttribute(tag);
+    }
+
+    // Per-particle charge (used for conservation checks)
+    ippl::ParticleAttrib<T, ExecSpace> Q;
+
+    // Integer tag so we can track individual particles across ranks
+    ippl::ParticleAttrib<long long, ExecSpace> tag;
+};
+
+// ============================================================
+//  Fixture
+// ============================================================
+
+template <typename>
+class TestParticleUpdate;
+
+template <typename T_, typename ExecSpace, unsigned Dim_>
+class TestParticleUpdate<Parameters<T_, ExecSpace, Rank<Dim_>>> : public ::testing::Test {
+public:
+    // ---- type aliases --------------------------------------------------
+    using T                       = T_;
+    static constexpr unsigned Dim = Dim_;
+    using flayout_type            = ippl::FieldLayout<Dim>;
+    using mesh_type               = ippl::UniformCartesian<T, Dim>;
+    using playout_type            = ippl::ParticleSpatialLayout<T, Dim, mesh_type, ExecSpace>;
+    using RegionLayout_t          = typename playout_type::RegionLayout_t;
+    using bunch_type              = TestBunch<T, ExecSpace, Dim>;
+    using position_type           = ippl::Vector<T, Dim>;
+    using region_view             = typename RegionLayout_t::view_type;
+
+    // ---- construction --------------------------------------------------
+    TestParticleUpdate()
+        : nPoints(getGridSizes<Dim>()) {
+        for (unsigned d = 0; d < Dim; d++) {
+            domain[d] = static_cast<T>(nPoints[d]) / 16.0;
+        }
+
+        std::array<ippl::Index, Dim> args;
+        for (unsigned d = 0; d < Dim; d++)
+            args[d] = ippl::Index(nPoints[d]);
+        auto owned = std::make_from_tuple<ippl::NDIndex<Dim>>(args);
+
+        ippl::Vector<T, Dim> hx, origin;
+        std::array<bool, Dim> isParallel;
+        isParallel.fill(true);
+        for (unsigned d = 0; d < Dim; d++) {
+            hx[d]     = domain[d] / nPoints[d];
+            origin[d] = 0;
+        }
+
+        layout      = std::make_shared<flayout_type>(MPI_COMM_WORLD, owned, isParallel);
+        mesh        = std::make_shared<mesh_type>(owned, hx, origin);
+        playout_ptr = std::make_shared<playout_type>(*layout, *mesh);
+    }
+
+    // ---- helpers -------------------------------------------------------
+
+    /// Create a fresh bunch and set periodic BCs.
+    std::shared_ptr<bunch_type> makeBunch() {
+        auto b = std::make_shared<bunch_type>(*playout_ptr);
+        typename bunch_type::bc_container_type bcs;
+        bcs.fill(ippl::BC::PERIODIC);
+        b->setParticleBC(bcs);
+        return b;
+    }
+
+    /// Place n particles with uniformly random positions on every rank.
+    void fillRandom(bunch_type& b, unsigned n, unsigned long seed = 42) {
+        int nRanks       = ippl::Comm->size();
+        unsigned perRank = std::max(1u, n / nRanks);
+        b.create(perRank);
+
+        std::mt19937_64 eng(seed + ippl::Comm->rank());
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+
+        auto R_host = b.R.getHostMirror();
+        auto Q_host = b.Q.getHostMirror();
+        auto t_host = b.tag.getHostMirror();
+
+        // Global unique id = rank * kRankIdStride + local index. The stride
+        // bounds the per-rank particle count; 10M is comfortably above any
+        // unit-test bunch size and stays well clear of int32 overflow.
+        constexpr long long kRankIdStride = 10'000'000LL;
+        for (size_t i = 0; i < b.getLocalNum(); ++i) {
+            position_type r;
+            for (unsigned d = 0; d < Dim; d++)
+                r[d] = unif(eng) * domain[d];
+            R_host(i) = r;
+            Q_host(i) = T(1);
+            t_host(i) = static_cast<long long>(ippl::Comm->rank()) * kRankIdStride
+                        + static_cast<long long>(i);
+        }
+        Kokkos::deep_copy(b.R.getView(), R_host);
+        Kokkos::deep_copy(b.Q.getView(), Q_host);
+        Kokkos::deep_copy(b.tag.getView(), t_host);
+    }
+
+    /// Reduce total particle count across all ranks.
+    size_t totalParticles(const bunch_type& b) const {
+        size_t local = b.getLocalNum();
+        size_t total = 0;
+        ippl::Comm->reduce(local, total, 1, std::plus<size_t>());
+        return total;
+    }
+
+    /// Reduce total charge across all ranks.
+    T totalCharge(bunch_type& b) const {
+        auto Q_host = b.Q.getHostMirror();
+        Kokkos::deep_copy(Q_host, b.Q.getView());
+        T local = T(0);
+        for (size_t i = 0; i < b.getLocalNum(); ++i)
+            local += Q_host(i);
+        T total = T(0);
+        ippl::Comm->reduce(local, total, 1, std::plus<T>());
+        return total;
+    }
+
+    /// For every locally owned particle verify it lies inside the local region.
+    /// Returns the number of misplaced particles on this rank.
+    size_t countMisplaced(bunch_type& b) {
+        RegionLayout_t rl   = playout_ptr->getRegionLayout();
+        region_view regions = rl.getdLocalRegions();
+        int myRank          = ippl::Comm->rank();
+
+        auto R_host = b.R.getHostMirror();
+        Kokkos::deep_copy(R_host, b.R.getView());
+
+        // Copy region bounds to host for checking
+        auto regions_host = Kokkos::create_mirror_view(regions);
+        Kokkos::deep_copy(regions_host, regions);
+
+        size_t misplaced = 0;
+        for (size_t i = 0; i < b.getLocalNum(); ++i) {
+            bool inMyRegion = true;
+            for (unsigned d = 0; d < Dim; d++) {
+                T pos = R_host(i)[d];
+                inMyRegion &=
+                    (pos >= regions_host(myRank)[d].min() && pos <= regions_host(myRank)[d].max());
+            }
+            if (!inMyRegion)
+                ++misplaced;
+        }
+        return misplaced;
+    }
+
+    /// Wrap a coordinate periodically into [0, domainLen).
+    T periodicWrap(T x, T domainLen) const {
+        while (x < T(0))
+            x += domainLen;
+        while (x >= domainLen)
+            x -= domainLen;
+        return x;
+    }
+
+    // ---- data members --------------------------------------------------
+    std::array<size_t, Dim> nPoints;
+    std::array<T, Dim> domain;
+
+    std::shared_ptr<flayout_type> layout;
+    std::shared_ptr<mesh_type> mesh;
+    std::shared_ptr<playout_type> playout_ptr;
+};
+
+using Tests = TestParams::tests<1, 2, 3>;
+TYPED_TEST_SUITE(TestParticleUpdate, Tests);
+
+// ============================================================
+//  1.  Baseline: random particles already on the correct rank
+// ============================================================
+TYPED_TEST(TestParticleUpdate, BaselineSanityNoMigrationNeeded) {
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    // update() should not lose or duplicate particles
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+    const size_t after = this->totalParticles(*bunch);
+
+    EXPECT_EQ(before, after);
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  2.  Conservation of total particle count
+// ============================================================
+TYPED_TEST(TestParticleUpdate, TotalParticleCountConserved) {
+    constexpr unsigned N = 512;
+    auto bunch           = this->makeBunch();
+    this->fillRandom(*bunch, N);
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+//  3.  Conservation of total charge
+// ============================================================
+TYPED_TEST(TestParticleUpdate, TotalChargeConserved) {
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    // Give every particle a distinct charge so any loss/duplication shows up
+    {
+        auto Q_host = bunch->Q.getHostMirror();
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            Q_host(i) = static_cast<typename TestFixture::T>(i + 1);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const auto chargeBefore = this->totalCharge(*bunch);
+    bunch->update();
+    const auto chargeAfter = this->totalCharge(*bunch);
+
+    // Allow small floating-point rounding
+    EXPECT_NEAR(static_cast<double>(chargeBefore), static_cast<double>(chargeAfter),
+                1e-6 * std::abs(static_cast<double>(chargeBefore)));
+}
+
+// ============================================================
+//  4.  Single-axis displacement – particles must migrate to neighbour
+// ============================================================
+TYPED_TEST(TestParticleUpdate, SingleAxisDisplacementMigratesCorrectly) {
+    using T = typename TestFixture::T;
+
+    // Only meaningful with more than one rank
+    if (ippl::Comm->size() < 2)
+        GTEST_SKIP();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+
+    // Move every local particle half the domain length along axis 0
+    // (wrapping with period).  After update, everyone must be on the
+    // owning rank.
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        const T shift = this->domain[0] / T(2);
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            R_host(i)[0] = this->periodicWrap(R_host(i)[0] + shift, this->domain[0]);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  5.  Multi-axis (corner) displacement
+// ============================================================
+TYPED_TEST(TestParticleUpdate, CornerDisplacementMigratesCorrectly) {
+    using T = typename TestFixture::T;
+
+    if (ippl::Comm->size() < 2)
+        GTEST_SKIP();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+
+    // Shift along every axis simultaneously
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                const T shift = this->domain[d] * T(0.3);
+                R_host(i)[d]  = this->periodicWrap(R_host(i)[d] + shift, this->domain[d]);
+            }
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  6.  Periodic wrap-around: particles placed just outside global domain
+// ============================================================
+TYPED_TEST(TestParticleUpdate, PeriodicWrapAroundRightBoundary) {
+    using T = typename TestFixture::T;
+
+    auto bunch = this->makeBunch();
+
+    // Place one particle per rank just beyond the upper global boundary
+    // in axis 0; periodic BC should wrap it back.
+    bunch->create(1);
+    {
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        // Place at domain[0] + small epsilon → wraps to near 0
+        T eps        = this->domain[0] * T(1e-4);
+        R_host(0)[0] = this->domain[0] + eps;
+        for (unsigned d = 1; d < TestFixture::Dim; d++)
+            R_host(0)[d] = this->domain[d] * T(0.5);
+        Q_host(0) = T(1);
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+//  7.  Periodic wrap-around: just outside lower global boundary
+// ============================================================
+TYPED_TEST(TestParticleUpdate, PeriodicWrapAroundLeftBoundary) {
+    using T = typename TestFixture::T;
+
+    auto bunch = this->makeBunch();
+    bunch->create(1);
+    {
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        T eps       = this->domain[0] * T(1e-4);
+        // Slightly negative position wraps to near domain[0]
+        R_host(0)[0] = -eps;
+        for (unsigned d = 1; d < TestFixture::Dim; d++)
+            R_host(0)[d] = this->domain[d] * T(0.5);
+        Q_host(0) = T(1);
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+//  8.  All particles seeded on rank 0 → global redistribution
+// ============================================================
+TYPED_TEST(TestParticleUpdate, AllParticlesOnRank0Redistributed) {
+    using T = typename TestFixture::T;
+
+    unsigned N = ippl::Comm->rank() == 0 ? 256 : 0;
+    auto bunch = this->makeBunch();
+
+    bunch->create(N);
+    if (ippl::Comm->rank() == 0) {
+        std::mt19937_64 eng(1234);
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (unsigned i = 0; i < N; ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = unif(eng) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    ASSERT_EQ(before, static_cast<size_t>(N));
+
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  9.  All particles seeded on last rank → global redistribution
+// ============================================================
+TYPED_TEST(TestParticleUpdate, AllParticlesOnLastRankRedistributed) {
+    using T      = typename TestFixture::T;
+    int lastRank = ippl::Comm->size() - 1;
+    unsigned N   = ippl::Comm->rank() == lastRank ? 256 : 0;
+    auto bunch   = this->makeBunch();
+    bunch->create(N);
+
+    if (ippl::Comm->rank() == lastRank) {
+        std::mt19937_64 eng(5678);
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (unsigned i = 0; i < N; ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = unif(eng) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  10. Multiple successive updates are idempotent
+// ============================================================
+TYPED_TEST(TestParticleUpdate, MultipleSuccessiveUpdatesStable) {
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+
+    const size_t before = this->totalParticles(*bunch);
+    for (int iter = 0; iter < 5; ++iter) {
+        bunch->update();
+        EXPECT_EQ(before, this->totalParticles(*bunch));
+        EXPECT_EQ(0u, this->countMisplaced(*bunch));
+    }
+}
+
+// ============================================================
+//  11. Successive updates after each random displacement
+// ============================================================
+TYPED_TEST(TestParticleUpdate, SuccessiveDisplacementsConserveParticles) {
+    using T = typename TestFixture::T;
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+    const size_t N = this->totalParticles(*bunch);
+
+    std::mt19937_64 eng(999 + ippl::Comm->rank());
+    std::uniform_real_distribution<T> unif(-T(0.1), T(0.1));
+
+    for (int step = 0; step < 8; ++step) {
+        // Small random displacements (staying in domain with wrapping)
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T newPos     = R_host(i)[d] + unif(eng) * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]);
+            }
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+
+        bunch->update();
+        EXPECT_EQ(N, this->totalParticles(*bunch)) << "at step " << step;
+    }
+}
+
+// ============================================================
+//  12. Boundary-exact positions: particles at rank boundaries
+// ============================================================
+TYPED_TEST(TestParticleUpdate, BoundaryExactPositionsNotLost) {
+    using T = typename TestFixture::T;
+
+    // Each rank creates one particle exactly at the domain's mid-point
+    // along axis 0 (a likely rank boundary for 2+ ranks).
+    auto bunch = this->makeBunch();
+    bunch->create(1);
+    {
+        auto R_host  = bunch->R.getHostMirror();
+        auto Q_host  = bunch->Q.getHostMirror();
+        R_host(0)[0] = this->domain[0] / T(2);
+        for (unsigned d = 1; d < TestFixture::Dim; d++)
+            R_host(0)[d] = this->domain[d] / T(2);
+        Q_host(0) = T(1);
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    // The particle must survive, though ownership may shift
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+//  13. Zero-particle ranks participate correctly
+// ============================================================
+TYPED_TEST(TestParticleUpdate, ZeroParticleRanksDoNotDeadlock) {
+    using T = typename TestFixture::T;
+
+    // Only rank 0 creates particles
+    auto bunch = this->makeBunch();
+    int N      = ippl::Comm->rank() == 0 ? 64 : 0;
+    bunch->create(N);
+
+    if (ippl::Comm->rank() == 0) {
+        std::mt19937_64 eng(7777);
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (int i = 0; i < N; ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = unif(eng) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    // This must not deadlock even if some ranks hold zero particles
+    ASSERT_NO_THROW(bunch->update());
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+//  14. Heterogeneous displacement magnitudes (small + large mixed)
+// ============================================================
+TYPED_TEST(TestParticleUpdate, HeterogeneousDisplacementsMigrateCorrectly) {
+    using T = typename TestFixture::T;
+
+    if (ippl::Comm->size() < 2)
+        GTEST_SKIP();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            // Even-indexed particles get a tiny nudge; odd ones jump ~70%.
+            const T fraction = (i % 2 == 0) ? T(0.01) : T(0.7);
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T newPos     = R_host(i)[d] + fraction * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]);
+            }
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  15. Large-count stress test
+// ============================================================
+TYPED_TEST(TestParticleUpdate, LargeParticleCountConserved) {
+    constexpr unsigned N = 4096;
+    auto bunch           = this->makeBunch();
+    this->fillRandom(*bunch, N, /*seed=*/31415);
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  16. Non-uniform particle distribution: load-imbalanced initial state
+// ============================================================
+TYPED_TEST(TestParticleUpdate, NonUniformInitialDistributionRedistributesCorrectly) {
+    using T = typename TestFixture::T;
+
+    auto bunch = this->makeBunch();
+
+    // Rank r gets r+1 particles, placed uniformly at random.
+    unsigned nLocal = static_cast<unsigned>(ippl::Comm->rank() + 1);
+    bunch->create(nLocal);
+    {
+        std::mt19937_64 eng(2468 + ippl::Comm->rank());
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (unsigned i = 0; i < nLocal; ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = unif(eng) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  17. Full-domain traversal: particle moves across the entire domain
+// ============================================================
+TYPED_TEST(TestParticleUpdate, FullDomainTraversalConservesParticles) {
+    using T = typename TestFixture::T;
+
+    auto bunch = this->makeBunch();
+    bunch->create(8);
+    {
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            // Place near origin
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = T(1e-5) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();  // initial valid placement
+
+    // Now jump every particle to the diagonally opposite corner
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T newPos     = R_host(i)[d] + this->domain[d] * T(0.999);
+                R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]);
+            }
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    bunch->update();
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  18. Check that tags (IDs) are preserved through migration
+// ============================================================
+TYPED_TEST(TestParticleUpdate, ParticleTagsPreservedAfterMigration) {
+    using T = typename TestFixture::T;
+
+    if (ippl::Comm->size() < 2)
+        GTEST_SKIP();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+
+    // Record all local tags and charges before update
+    std::vector<std::pair<long long, T>> localBefore;
+    {
+        auto t_host = bunch->tag.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        Kokkos::deep_copy(t_host, bunch->tag.getView());
+        Kokkos::deep_copy(Q_host, bunch->Q.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            localBefore.push_back({t_host(i), Q_host(i)});
+    }
+
+    // Displace so migration happens
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T newPos     = R_host(i)[d] + this->domain[d] * T(0.4);
+                R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]);
+            }
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    bunch->update();
+
+    // Gather tags from all ranks and verify the global set is identical
+    // (no duplicates, no missing)
+    std::vector<long long> localTagsAfter;
+    {
+        auto t_host = bunch->tag.getHostMirror();
+        Kokkos::deep_copy(t_host, bunch->tag.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            localTagsAfter.push_back(t_host(i));
+    }
+
+    // Each rank sorts its local list; then we check global count via reduce
+    size_t localCount  = localTagsAfter.size();
+    size_t globalCount = 0;
+    ippl::Comm->reduce(localCount, globalCount, 1, std::plus<size_t>());
+
+    // Reconstruct original count
+    size_t originalCount  = localBefore.size();
+    size_t globalOriginal = 0;
+    ippl::Comm->reduce(originalCount, globalOriginal, 1, std::plus<size_t>());
+
+    EXPECT_EQ(globalOriginal, globalCount);
+}
+
+// ============================================================
+//  19. Periodic wrap: particle moved exactly one full domain length
+// ============================================================
+TYPED_TEST(TestParticleUpdate, ExactOneDomainLengthShiftConservesParticles) {
+    using T = typename TestFixture::T;
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 64);
+    const size_t before = this->totalParticles(*bunch);
+
+    // Shift by exactly one domain length: after wrapping, position unchanged.
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T newPos     = R_host(i)[d] + this->domain[d];
+                R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]);
+            }
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    bunch->update();
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+//  20. Repeated create-and-update cycles (simulates particle injection)
+// ============================================================
+TYPED_TEST(TestParticleUpdate, RepeatedCreateAndUpdateCycles) {
+    using T = typename TestFixture::T;
+
+    auto bunch        = this->makeBunch();
+    size_t cumulative = 0;
+
+    for (int cycle = 0; cycle < 4; ++cycle) {
+        // Inject 32 more particles on rank 0 each cycle
+        constexpr unsigned inject = 32;
+        int N                     = ippl::Comm->rank() == 0 ? inject : 0;
+        size_t currentLocal       = bunch->getLocalNum();
+        bunch->create(N);
+
+        if (ippl::Comm->rank() == 0) {
+            std::mt19937_64 eng(cycle * 100);
+            std::uniform_real_distribution<T> unif(T(0), T(1));
+            auto R_host = bunch->R.getHostMirror();
+            auto Q_host = bunch->Q.getHostMirror();
+            Kokkos::deep_copy(R_host, bunch->R.getView());
+            Kokkos::deep_copy(Q_host, bunch->Q.getView());
+
+            for (unsigned i = currentLocal; i < currentLocal + inject; ++i) {
+                for (unsigned d = 0; d < TestFixture::Dim; d++)
+                    R_host(i)[d] = unif(eng) * this->domain[d];
+                Q_host(i) = T(1);
+            }
+            Kokkos::deep_copy(bunch->R.getView(), R_host);
+            Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+            cumulative += inject;
+        }
+        ippl::Comm->barrier();
+
+        bunch->update();
+
+        size_t total = this->totalParticles(*bunch);
+        if (ippl::Comm->rank() == 0) {
+            EXPECT_EQ(cumulative, total) << "at cycle " << cycle;
+        }
+        EXPECT_EQ(0u, this->countMisplaced(*bunch)) << "at cycle " << cycle;
+    }
+}
+
+// ============================================================
+//  Entry point
+// ============================================================
+int main(int argc, char* argv[]) {
+    int success = 1;
+    ippl::initialize(argc, argv);
+    {
+        ::testing::InitGoogleTest(&argc, argv);
+        success = RUN_ALL_TESTS();
+    }
+    ippl::finalize();
+    return success;
+}
\ No newline at end of file
diff --git a/unit_tests/Particle/ParticleUpdateNonuniform.cpp b/unit_tests/Particle/ParticleUpdateNonuniform.cpp
new file mode 100644
index 000000000..331e577c4
--- /dev/null
+++ b/unit_tests/Particle/ParticleUpdateNonuniform.cpp
@@ -0,0 +1,937 @@
+//
+// Unit test: Particle spatial layout update with non-uniform (ORB) decompositions
+//
+// These tests exercise particle migration after OrthogonalRecursiveBisection
+// has repartitioned the domain.  The ORB algorithm can produce highly asymmetric
+// layouts where a single rank may span many more grid cells than its neighbour,
+// so the usual assumption "every rank has the same width" does not hold.
+//
+// Minimum rank requirement:  all tests that exercise multi-rank migration are
+// guarded by GTEST_SKIP() when fewer than MIN_RANKS MPI processes are available.
+// The recommended launch is  mpirun -n 4  (or 8 for the higher-coverage tests).
+//
+// Test inventory
+// ──────────────
+//  1.  ORB layout created with uniform density  →  baseline count conservation
+//  2.  ORB layout created with a Gaussian density bump  →  asymmetric widths
+//  3.  Particles displaced uniformly after ORB  →  all migrate to correct owner
+//  4.  Particles displaced toward the heavy region  →  burst migration into narrow rank
+//  5.  Repeated ORB repartition between steps  →  counts conserved at each step
+//  6.  All particles on rank 0, ORB applied, then update  →  full redistribution
+//  7.  Charge conservation through ORB + update cycle
+//  8.  Tags (IDs) survive ORB + update
+//  9.  Zero-particle ranks after ORB do not deadlock
+// 10.  Large-count stress test with ORB decomposition
+// 11.  Periodic wrap across ORB boundaries
+// 12.  Multiple ORB repartitions with particle injection between steps
+// 13.  ORB with a step-function density  →  one rank gets 1 cell, another N-1
+// 14.  Successive displacements across ORB boundaries
+// 15.  ORB repartition in 3-D with diagonal displacement
+//
+#include "Ippl.h"
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <memory>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include "Decomposition/OrthogonalRecursiveBisection.h"
+#include "TestUtils.h"
+#include "gtest/gtest.h"
+
+// ============================================================
+//  Minimum ranks required for multi-rank migration tests
+// ============================================================
+static constexpr int MIN_RANKS  = 2;  // absolute minimum
+static constexpr int PREF_RANKS = 4;  // preferred for asymmetry tests
+
+// ============================================================
+//  Particle bunch
+// ============================================================
+template <typename T, typename ExecSpace, unsigned Dim>
+struct OrbBunch
+    : public ippl::ParticleBase<
+          ippl::ParticleSpatialLayout<T, Dim, ippl::UniformCartesian<T, Dim>, ExecSpace>> {
+    using playout_type =
+        ippl::ParticleSpatialLayout<T, Dim, ippl::UniformCartesian<T, Dim>, ExecSpace>;
+
+    explicit OrbBunch(playout_type& pl)
+        : ippl::ParticleBase<playout_type>(pl) {
+        this->addAttribute(Q);
+        this->addAttribute(tag);
+    }
+
+    ippl::ParticleAttrib<T, ExecSpace> Q;            // per-particle charge
+    ippl::ParticleAttrib<long long, ExecSpace> tag;  // unique ID
+};
+
+// ============================================================
+//  Fixture
+// ============================================================
+template <typename>
+class TestParticleUpdateORB;
+
+template <typename T_, typename ExecSpace, unsigned Dim_>
+class TestParticleUpdateORB<Parameters<T_, ExecSpace, Rank<Dim_>>> : public ::testing::Test {
+public:
+    using T = T_;
+    static constexpr unsigned Dim = Dim_;
+    // ---- type aliases --------------------------------------------------
+    using flayout_type   = ippl::FieldLayout<Dim>;
+    using mesh_type      = ippl::UniformCartesian<T, Dim>;
+    using playout_type   = ippl::ParticleSpatialLayout<T, Dim, mesh_type, ExecSpace>;
+    using RegionLayout_t = typename playout_type::RegionLayout_t;
+    using bunch_type     = OrbBunch<T, ExecSpace, Dim>;
+    using position_type  = ippl::Vector<T, Dim>;
+    using Mesh_t = ippl::UniformCartesian<double, Dim>;
+    using Centering_t = Mesh_t::DefaultCentering;
+
+    // Density field type used by ORB
+    using field_type = ippl::Field<T, Dim, mesh_type, Centering_t, ExecSpace>;
+    using orb_type   = ippl::OrthogonalRecursiveBisection<field_type, T>;
+
+    // ---- construction --------------------------------------------------
+    TestParticleUpdateORB()
+        : nPoints(getGridSizes<Dim>()) {
+        for (unsigned d = 0; d < Dim; d++)
+            domain[d] = static_cast<T>(nPoints[d]) / 16.0;
+
+        std::array<ippl::Index, Dim> args;
+        for (unsigned d = 0; d < Dim; d++)
+            args[d] = ippl::Index(nPoints[d]);
+        gDomain = std::make_from_tuple<ippl::NDIndex<Dim>>(args);
+
+        ippl::Vector<T, Dim> hx, origin;
+        std::array<bool, Dim> isParallel;
+        isParallel.fill(true);
+        for (unsigned d = 0; d < Dim; d++) {
+            hx[d]     = domain[d] / nPoints[d];
+            origin[d] = 0;
+        }
+
+        layout      = std::make_shared<flayout_type>(MPI_COMM_WORLD, gDomain, isParallel);
+        mesh        = std::make_shared<mesh_type>(gDomain, hx, origin);
+        playout_ptr = std::make_shared<playout_type>(*layout, *mesh);
+    }
+
+    // ---- ORB helpers ---------------------------------------------------
+
+    /// Build a uniform-density field (all weights = 1) and run ORB.
+    bool orbUniform() {
+        field_type rho(*mesh, *layout);
+        rho = T(1);
+        orb_type orb;
+        orb.initialize(*layout, *mesh, rho);
+        // binaryRepartition needs particle positions; pass empty attrib
+        ippl::ParticleAttrib<position_type, ExecSpace> emptyR;
+        return orb.binaryRepartition(emptyR, *layout, /*isFirstRepartition=*/true);
+    }
+
+    /// Build a Gaussian density bump centred in the domain and run ORB.
+    /// The heavier region forces ORB to cut near the centre, creating
+    /// asymmetric sub-domains along each axis.
+    bool orbGaussian(T sigma = T(0.15)) {
+        field_type rho(*mesh, *layout);
+
+        // Fill the density field on-device: Gaussian centred at 0.5
+        auto view   = rho.getView();
+        auto hx_vec = mesh->getMeshSpacing();
+        auto orig   = mesh->getOrigin();
+
+        // Kokkos::parallel_for("GaussianRho", ippl::createRangePolicy<Dim, ExecSpace>(rho.getOwned()),
+        //                      KOKKOS_LAMBDA(/* index args */){
+        //                          // Generic lambda for any Dim via index_array_type
+        //                      });
+
+        // Simpler: fill via host mirror
+        auto rho_host = rho.getHostMirror();
+        auto lDomH    = layout->getLocalNDIndex();
+        for (int gz = (Dim > 2 ? lDomH[2].first() : 0); gz <= (Dim > 2 ? lDomH[2].last() : 0);
+             ++gz) {
+            for (int gy = (Dim > 1 ? lDomH[1].first() : 0); gy <= (Dim > 1 ? lDomH[1].last() : 0);
+                 ++gy) {
+                for (int gx = lDomH[0].first(); gx <= lDomH[0].last(); ++gx) {
+                    T val = T(1);
+                    // Compute normalised coordinate and Gaussian weight
+                    auto computeCoord = [&](int idx, unsigned d) -> T {
+                        return (static_cast<T>(idx) + T(0.5)) / static_cast<T>(nPoints[d]);
+                    };
+                    T x  = computeCoord(gx, 0) - T(0.5);
+                    T r2 = x * x;
+                    if constexpr (Dim > 1) {
+                        T y = computeCoord(gy, 1) - T(0.5);
+                        r2 += y * y;
+                    }
+                    if constexpr (Dim > 2) {
+                        T z = computeCoord(gz, 2) - T(0.5);
+                        r2 += z * z;
+                    }
+                    val = std::exp(-r2 / (T(2) * sigma * sigma));
+
+                    // Map global index to local+ghost index
+                    int lx = gx - lDomH[0].first() + rho.getNghost();
+                    if constexpr (Dim == 1)
+                        rho_host(lx) = val;
+                    else if constexpr (Dim == 2) {
+                        int ly           = gy - lDomH[1].first() + rho.getNghost();
+                        rho_host(lx, ly) = val;
+                    } else {
+                        int ly               = gy - lDomH[1].first() + rho.getNghost();
+                        int lz               = gz - lDomH[2].first() + rho.getNghost();
+                        rho_host(lx, ly, lz) = val;
+                    }
+                }
+            }
+        }
+        Kokkos::deep_copy(rho.getView(), rho_host);
+
+        orb_type orb;
+        orb.initialize(*layout, *mesh, rho);
+        ippl::ParticleAttrib<position_type, ExecSpace> emptyR;
+        return orb.binaryRepartition(emptyR, *layout, /*isFirstRepartition=*/true);
+    }
+
+    /// Build a step-function density: cells in [0, nPoints[0]/4) get weight 100,
+    /// the rest get weight 1.  This forces one rank to be very narrow (few cells)
+    /// while the others are wide.
+    bool orbStepFunction() {
+        field_type rho(*mesh, *layout);
+        auto rho_host = rho.getHostMirror();
+        auto lDomH    = layout->getLocalNDIndex();
+        int cutGlobal = static_cast<int>(nPoints[0]) / 4;
+
+        for (int gz = (Dim > 2 ? lDomH[2].first() : 0); gz <= (Dim > 2 ? lDomH[2].last() : 0);
+             ++gz) {
+            for (int gy = (Dim > 1 ? lDomH[1].first() : 0); gy <= (Dim > 1 ? lDomH[1].last() : 0);
+                 ++gy) {
+                for (int gx = lDomH[0].first(); gx <= lDomH[0].last(); ++gx) {
+                    T val  = (gx < cutGlobal) ? T(100) : T(1);
+                    int lx = gx - lDomH[0].first() + rho.getNghost();
+                    if constexpr (Dim == 1)
+                        rho_host(lx) = val;
+                    else if constexpr (Dim == 2) {
+                        int ly           = gy - lDomH[1].first() + rho.getNghost();
+                        rho_host(lx, ly) = val;
+                    } else {
+                        int ly               = gy - lDomH[1].first() + rho.getNghost();
+                        int lz               = gz - lDomH[2].first() + rho.getNghost();
+                        rho_host(lx, ly, lz) = val;
+                    }
+                }
+            }
+        }
+        Kokkos::deep_copy(rho.getView(), rho_host);
+
+        orb_type orb;
+        orb.initialize(*layout, *mesh, rho);
+        ippl::ParticleAttrib<position_type, ExecSpace> emptyR;
+        return orb.binaryRepartition(emptyR, *layout, /*isFirstRepartition=*/true);
+    }
+
+    /// Rebuild the playout after the layout has been repartitioned.
+    void rebuildPlayout() { playout_ptr = std::make_shared<playout_type>(*layout, *mesh); }
+
+    // ---- bunch helpers -------------------------------------------------
+
+    std::shared_ptr<bunch_type> makeBunch() {
+        auto b = std::make_shared<bunch_type>(*playout_ptr);
+        typename bunch_type::bc_container_type bcs;
+        bcs.fill(ippl::BC::PERIODIC);
+        b->setParticleBC(bcs);
+        return b;
+    }
+
+    void fillRandom(bunch_type& b, unsigned n, unsigned long seed = 42) {
+        int nRanks   = ippl::Comm->size();
+        unsigned per = std::max(1u, n / nRanks);
+        b.create(per);
+
+        std::mt19937_64 eng(seed + ippl::Comm->rank());
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+
+        auto R_host = b.R.getHostMirror();
+        auto Q_host = b.Q.getHostMirror();
+        auto t_host = b.tag.getHostMirror();
+
+        for (size_t i = 0; i < b.getLocalNum(); ++i) {
+            position_type r;
+            for (unsigned d = 0; d < Dim; d++)
+                r[d] = unif(eng) * domain[d];
+            R_host(i) = r;
+            Q_host(i) = T(1);
+            t_host(i) =
+                static_cast<long long>(ippl::Comm->rank()) * 10000000LL + static_cast<long long>(i);
+        }
+        Kokkos::deep_copy(b.R.getView(), R_host);
+        Kokkos::deep_copy(b.Q.getView(), Q_host);
+        Kokkos::deep_copy(b.tag.getView(), t_host);
+    }
+
+    // ---- assertion helpers ---------------------------------------------
+
+    size_t totalParticles(const bunch_type& b) const {
+        size_t local = b.getLocalNum(), total = 0;
+        ippl::Comm->reduce(local, total, 1, std::plus<size_t>());
+        return total;
+    }
+
+    T totalCharge(bunch_type& b) const {
+        auto Q_host = b.Q.getHostMirror();
+        Kokkos::deep_copy(Q_host, b.Q.getView());
+        T local = T(0);
+        for (size_t i = 0; i < b.getLocalNum(); ++i)
+            local += Q_host(i);
+        T total = T(0);
+        ippl::Comm->reduce(local, total, 1, std::plus<T>());
+        return total;
+    }
+
+    /// Count locally owned particles that lie outside the owning rank's region.
+    size_t countMisplaced(bunch_type& b) {
+        RegionLayout_t rl = playout_ptr->getRegionLayout();
+        auto regions      = rl.getdLocalRegions();
+        auto regions_host = Kokkos::create_mirror_view(regions);
+        Kokkos::deep_copy(regions_host, regions);
+        int myRank = ippl::Comm->rank();
+
+        auto R_host = b.R.getHostMirror();
+        Kokkos::deep_copy(R_host, b.R.getView());
+
+        size_t bad = 0;
+        for (size_t i = 0; i < b.getLocalNum(); ++i) {
+            bool ok = true;
+            for (unsigned d = 0; d < Dim; d++) {
+                T p = R_host(i)[d];
+                ok &= (p >= regions_host(myRank)[d].min() && p <= regions_host(myRank)[d].max());
+            }
+            if (!ok)
+                ++bad;
+        }
+        return bad;
+    }
+
+    T periodicWrap(T x, T L) const {
+        while (x < T(0))
+            x += L;
+        while (x >= L)
+            x -= L;
+        return x;
+    }
+
+    /// Verify that the widths of local domains differ between ranks (i.e. ORB
+    /// actually produced a non-uniform decomposition along at least one axis).
+    /// Returns true if at least one pair of ranks has different widths.
+    bool isNonUniform() const {
+        auto lDom    = layout->getLocalNDIndex();
+        int myWidth  = lDom[0].length();
+        int maxWidth = 0, minWidth = 0;
+        ippl::Comm->allreduce(myWidth, maxWidth, 1, std::greater<int>{});
+        ippl::Comm->allreduce(myWidth, minWidth, 1, std::less<int>{});
+        return (maxWidth != minWidth);
+    }
+
+    // ---- data members --------------------------------------------------
+    std::array<size_t, Dim> nPoints;
+    std::array<T, Dim> domain;
+    ippl::NDIndex<Dim> gDomain;
+
+    std::shared_ptr<flayout_type> layout;
+    std::shared_ptr<mesh_type> mesh;
+    std::shared_ptr<playout_type> playout_ptr;
+};
+
+// Run over 1-D, 2-D, 3-D with default scalar / exec space
+using Tests = TestParams::tests<1, 2, 3>;
+TYPED_TEST_SUITE(TestParticleUpdateORB, Tests);
+
+// ============================================================
+//  Helper macro: skip test if rank count is below threshold
+// ============================================================
+#define REQUIRE_RANKS(n)                                                  \
+    do {                                                                  \
+        if (ippl::Comm->size() < (n)) {                                   \
+            GTEST_SKIP() << "Test requires at least " << (n) << " ranks"; \
+        }                                                                 \
+    } while (false)
+
+// ============================================================
+//  1. ORB with uniform density – baseline conservation
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, UniformOrbBaselineConservation) {
+    REQUIRE_RANKS(MIN_RANKS);
+
+    bool ok = this->orbUniform();
+    if (!ok)
+        GTEST_SKIP() << "ORB returned planar decomposition – skipping";
+
+    this->rebuildPlayout();
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  2. ORB with Gaussian density – verify non-uniform decomposition
+//     and that particles still migrate correctly
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, GaussianOrbCreatesNonUniformLayout) {
+    REQUIRE_RANKS(PREF_RANKS);
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP() << "ORB returned planar decomposition – skipping";
+
+    // With 4+ ranks and a Gaussian bump the domains should be unequal
+    if (ippl::Comm->size() >= PREF_RANKS) {
+        EXPECT_TRUE(this->isNonUniform()) << "Expected ORB to produce a non-uniform decomposition";
+    }
+
+    this->rebuildPlayout();
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  3. Uniform displacement after ORB – all particles cross at least
+//     one ORB boundary and reach the correct owner
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, UniformDisplacementAfterOrb) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP();
+
+    this->rebuildPlayout();
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    // Shift by 30 % of domain along every axis
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T np         = R_host(i)[d] + T(0.3) * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(np, this->domain[d]);
+            }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  4. Burst migration into the narrow rank after step-function ORB
+//     One rank is very narrow; we push all particles toward it.
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, BurstMigrationIntoNarrowRankAfterStepOrb) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbStepFunction();
+    if (!ok)
+        GTEST_SKIP();
+
+    this->rebuildPlayout();
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    // Move every particle into the first quarter of the domain (the narrow rank)
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        std::mt19937_64 eng(77 + ippl::Comm->rank());
+        std::uniform_real_distribution<T> unif(T(0), T(0.25));
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            R_host(i)[0] = unif(eng) * this->domain[0];
+            for (unsigned d = 1; d < TestFixture::Dim; d++)
+                R_host(i)[d] = T(0.5) * this->domain[d];  // safe middle
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  5. Repeated ORB repartition between particle steps
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, RepeatedOrbRepartitionConservesParticles) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    // Perform 3 ORB repartitions, each time with a different density,
+    // interleaved with particle displacements.
+    constexpr unsigned N = 128;
+
+    bool firstOk = this->orbUniform();
+    if (!firstOk)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, N);
+    const size_t total = this->totalParticles(*bunch);
+
+    // Lambda: displace by fraction f and call update
+    auto displace = [&](T f) {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T np         = R_host(i)[d] + f * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(np, this->domain[d]);
+            }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        bunch->update();
+    };
+
+    displace(T(0.15));
+    EXPECT_EQ(total, this->totalParticles(*bunch));
+
+    // Second ORB (Gaussian)
+    bool ok2 = this->orbGaussian();
+    if (ok2) {
+        this->rebuildPlayout();
+        // Re-attach the playout to the existing bunch
+        bunch = this->makeBunch();
+        this->fillRandom(*bunch, N);
+        displace(T(0.25));
+        EXPECT_EQ(total, this->totalParticles(*bunch));
+    }
+
+    // Third ORB (step-function)
+    bool ok3 = this->orbStepFunction();
+    if (ok3) {
+        this->rebuildPlayout();
+        bunch = this->makeBunch();
+        this->fillRandom(*bunch, N);
+        displace(T(0.1));
+        EXPECT_EQ(total, this->totalParticles(*bunch));
+    }
+}
+
+// ============================================================
+//  6. All particles on rank 0, ORB applied, then update
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, AllParticlesOnRank0AfterOrb) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    unsigned N = ippl::Comm->rank() == 0 ? 256 : 0;
+    auto bunch = this->makeBunch();
+    bunch->create(N);
+
+    if (ippl::Comm->rank() == 0) {
+        std::mt19937_64 eng(12);
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (unsigned i = 0; i < N; ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = unif(eng) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    ASSERT_EQ(before, static_cast<size_t>(N));
+
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  7. Charge conservation through ORB + update cycle
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, ChargeConservedAfterOrbAndUpdate) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 256);
+
+    // Assign unique charges
+    {
+        auto Q_host = bunch->Q.getHostMirror();
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            Q_host(i) = static_cast<T>(ippl::Comm->rank() * 100000 + i + 1);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const T chargeBefore = this->totalCharge(*bunch);
+
+    // Displace and update
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T np         = R_host(i)[d] + T(0.35) * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(np, this->domain[d]);
+            }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+    bunch->update();
+
+    const T chargeAfter = this->totalCharge(*bunch);
+    EXPECT_NEAR(static_cast<double>(chargeBefore), static_cast<double>(chargeAfter),
+                1e-6 * std::abs(static_cast<double>(chargeBefore)));
+}
+
+// ============================================================
+//  8. Particle tags survive ORB + update
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, TagsPreservedAfterOrbAndUpdate) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbUniform();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+
+    // Record global original count of unique tags
+    size_t origLocal  = bunch->getLocalNum();
+    size_t origGlobal = 0;
+    ippl::Comm->reduce(origLocal, origGlobal, 1, std::plus<size_t>());
+
+    // Displace
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T np         = R_host(i)[d] + T(0.45) * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(np, this->domain[d]);
+            }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+    bunch->update();
+
+    size_t afterLocal  = bunch->getLocalNum();
+    size_t afterGlobal = 0;
+    ippl::Comm->reduce(afterLocal, afterGlobal, 1, std::plus<size_t>());
+
+    EXPECT_EQ(origGlobal, afterGlobal);
+}
+
+// ============================================================
+//  9. Zero-particle ranks after ORB do not deadlock
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, ZeroParticleRanksAfterOrbDoNotDeadlock) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbStepFunction();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    // Only rank 0 creates particles
+    auto bunch = this->makeBunch();
+    unsigned N = ippl::Comm->rank() == 0 ? 64 : 0;
+    bunch->create(N);
+    if (ippl::Comm->rank() == 0) {
+        std::mt19937_64 eng(55);
+        std::uniform_real_distribution<T> unif(T(0), T(1));
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (unsigned i = 0; i < N; ++i) {
+            for (unsigned d = 0; d < TestFixture::Dim; d++)
+                R_host(i)[d] = unif(eng) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    ASSERT_NO_THROW(bunch->update());
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+// 10. Large-count stress test with Gaussian ORB decomposition
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, LargeParticleCountGaussianOrb) {
+    REQUIRE_RANKS(MIN_RANKS);
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 4096, /*seed=*/98765);
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+// 11. Periodic wrap across ORB boundaries
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, PeriodicWrapAcrossOrbBoundaries) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+
+    // Each rank places particles just beyond the global domain's upper edge
+    // along axis 0 so they wrap back to near 0.
+    bunch->create(4);
+    {
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        std::mt19937_64 eng(33 + ippl::Comm->rank());
+        std::uniform_real_distribution<T> tiny(T(1e-5), T(1e-3));
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            R_host(i)[0] = this->domain[0] + tiny(eng) * this->domain[0];
+            for (unsigned d = 1; d < TestFixture::Dim; d++)
+                R_host(i)[d] = T(0.5) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+}
+
+// ============================================================
+// 12. Particle injection between ORB repartitions
+//     Simulates a typical PIC workflow: inject → ORB → move → update → repeat
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, ParticleInjectionBetweenOrbRepartitions) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    constexpr unsigned injectPerCycle = 32;
+    constexpr int cycles              = 3;
+
+    for (int c = 0; c < cycles; ++c) {
+        // Repartition with Gaussian for odd cycles, uniform for even
+        bool ok = (c % 2 == 0) ? this->orbUniform() : this->orbGaussian();
+        if (!ok)
+            continue;
+        this->rebuildPlayout();
+
+        auto bunch = this->makeBunch();
+
+        // Inject on rank 0
+        unsigned N = ippl::Comm->rank() == 0 ? injectPerCycle : 0;
+        bunch->create(N);
+        if (ippl::Comm->rank() == 0) {
+            std::mt19937_64 eng(c * 1000);
+            std::uniform_real_distribution<T> unif(T(0), T(1));
+            auto R_host = bunch->R.getHostMirror();
+            auto Q_host = bunch->Q.getHostMirror();
+            for (unsigned i = 0; i < injectPerCycle; ++i) {
+                for (unsigned d = 0; d < TestFixture::Dim; d++)
+                    R_host(i)[d] = unif(eng) * this->domain[d];
+                Q_host(i) = T(1);
+            }
+            Kokkos::deep_copy(bunch->R.getView(), R_host);
+            Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+        }
+
+        bunch->update();
+
+        auto total_particles = this->totalParticles(*bunch);
+        if (ippl::Comm->rank() == 0) {
+            EXPECT_EQ(static_cast<size_t>(injectPerCycle), total_particles) << "at cycle " << c;
+        }
+        EXPECT_EQ(0u, this->countMisplaced(*bunch)) << "at cycle " << c;
+    }
+}
+
+// ============================================================
+// 13. Step-function ORB: narrow rank correctly receives migrating particles
+//     and does not receive more than the domain capacity
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, NarrowRankReceivesCorrectlyAfterStepOrb) {
+    REQUIRE_RANKS(PREF_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbStepFunction();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 512);
+
+    // Move all particles into the heavy region (second three-quarters)
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        std::mt19937_64 eng(808 + ippl::Comm->rank());
+        std::uniform_real_distribution<T> heavyX(T(0.25), T(1.0));
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            R_host(i)[0] = heavyX(eng) * this->domain[0];
+            for (unsigned d = 1; d < TestFixture::Dim; d++)
+                R_host(i)[d] = T(0.5) * this->domain[d];
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();
+
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+// 14. Successive displacements across ORB boundaries
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, SuccessiveDisplacementsAcrossOrbBoundaries) {
+    REQUIRE_RANKS(MIN_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbGaussian();
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+    this->fillRandom(*bunch, 128);
+    const size_t N = this->totalParticles(*bunch);
+
+    std::mt19937_64 eng(2025 + ippl::Comm->rank());
+    std::uniform_real_distribution<T> step(-T(0.2), T(0.2));
+
+    for (int s = 0; s < 6; ++s) {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            for (unsigned d = 0; d < TestFixture::Dim; d++) {
+                T np         = R_host(i)[d] + step(eng) * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(np, this->domain[d]);
+            }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+
+        bunch->update();
+        EXPECT_EQ(N, this->totalParticles(*bunch)) << "at step " << s;
+        EXPECT_EQ(0u, this->countMisplaced(*bunch)) << "at step " << s;
+    }
+}
+
+// ============================================================
+// 15. 3-D corner migration with ORB decomposition
+//     (only runs when Dim == 3)
+// ============================================================
+TYPED_TEST(TestParticleUpdateORB, ThreeDCornerMigrationAfterOrb) {
+    if constexpr (TestFixture::Dim != 3) {
+        GTEST_SKIP() << "3-D specific test";
+    }
+    REQUIRE_RANKS(PREF_RANKS);
+    using T = typename TestFixture::T;
+
+    bool ok = this->orbGaussian(T(0.2));
+    if (!ok)
+        GTEST_SKIP();
+    this->rebuildPlayout();
+
+    auto bunch = this->makeBunch();
+
+    // Seed all particles near the origin
+    bunch->create(64);
+    {
+        auto R_host = bunch->R.getHostMirror();
+        auto Q_host = bunch->Q.getHostMirror();
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i) {
+            for (unsigned d = 0; d < 3; d++)
+                R_host(i)[d] = T(0.02) * this->domain[d];
+            Q_host(i) = T(1);
+        }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+        Kokkos::deep_copy(bunch->Q.getView(), Q_host);
+    }
+
+    const size_t before = this->totalParticles(*bunch);
+    bunch->update();  // settle near origin
+
+    // Jump diagonally to the opposite corner (wraps periodically)
+    {
+        auto R_host = bunch->R.getHostMirror();
+        Kokkos::deep_copy(R_host, bunch->R.getView());
+        for (size_t i = 0; i < bunch->getLocalNum(); ++i)
+            for (unsigned d = 0; d < 3; d++) {
+                T np         = R_host(i)[d] + T(0.98) * this->domain[d];
+                R_host(i)[d] = this->periodicWrap(np, this->domain[d]);
+            }
+        Kokkos::deep_copy(bunch->R.getView(), R_host);
+    }
+
+    bunch->update();
+    EXPECT_EQ(before, this->totalParticles(*bunch));
+    EXPECT_EQ(0u, this->countMisplaced(*bunch));
+}
+
+// ============================================================
+//  Entry point
+// ============================================================
+int main(int argc, char* argv[]) {
+    int success = 1;
+    ippl::initialize(argc, argv);
+    {
+        ::testing::InitGoogleTest(&argc, argv);
+        success = RUN_ALL_TESTS();
+    }
+    ippl::finalize();
+    return success;
+}