From ac07afa89a93fc2d5b4f33c9e47d0a1329f45b9b Mon Sep 17 00:00:00 2001 From: Fischill Paul Date: Sun, 31 May 2026 22:39:50 +0200 Subject: [PATCH 01/14] Extract particle communication update infrastructure Split the communication and particle-update infrastructure from PR501 on top of the PCG split. This brings in reusable communication buffers, page-granular archive allocation, particle attribute serialization hooks, packed particle send IDs, particle sorting buffers, and the rewritten ParticleSpatialLayout update path. Keep this branch independent from the later interpolation, FFT, NUFFT, and PIF splits by dropping those APIs from the extracted ParticleAttrib changes. Add particle update regression coverage and update existing tests for live-view and page-sized buffer semantics. Validated with a Debug Serial Kokkos 5.0.0 build: full 1-rank ctest passes, plus ParticleSendRecv, ParticleUpdate, and ParticleUpdateNonuniform pass under mpiexec -n 2. --- PR501_SPLIT_MAP.md | 51 + src/Communicate/Archive.h | 57 +- src/Communicate/Archive.hpp | 373 +++++-- src/Communicate/BufferHandler.hpp | 30 +- src/Communicate/Buffers.cpp | 4 +- src/Communicate/Buffers.hpp | 10 +- src/Communicate/Communicator.cpp | 36 +- src/Communicate/Communicator.h | 54 +- src/Communicate/CommunicatorLogging.cpp | 35 +- src/Communicate/Environment.cpp | 18 +- src/Communicate/Environment.h | 3 + src/Communicate/LogEntry.cpp | 25 +- src/Communicate/LogEntry.h | 5 + src/Communicate/LoggingBufferHandler.h | 1 - src/Field/BareField.hpp | 4 +- src/Field/HaloCells.h | 7 +- src/Field/HaloCells.hpp | 17 +- src/FieldLayout/FieldLayout.h | 44 +- src/FieldLayout/FieldLayout.hpp | 28 +- src/FieldLayout/SubFieldLayout.hpp | 2 +- src/Ippl.h | 3 +- src/Particle/ParticleAttrib.h | 77 +- src/Particle/ParticleAttrib.hpp | 100 +- src/Particle/ParticleAttribBase.h | 11 +- src/Particle/ParticleBase.h | 17 +- src/Particle/ParticleBase.hpp | 103 +- src/Particle/ParticleSort.h | 331 +++++++ src/Particle/ParticleSpatialLayout.h | 115 ++- src/Particle/ParticleSpatialLayout.hpp | 623 ++++++------ src/Particle/SortBuffer.h | 185 ++++ src/Utility/BufferView.h | 229 +++++ src/Utility/IpplTimings.cpp | 159 ++- src/Utility/IpplTimings.h | 79 +- src/Utility/ParallelDispatch.h | 129 ++- src/Utility/ParameterList.h | 2 + src/Utility/Timer.cpp | 23 +- src/Utility/Timer.h | 9 +- src/Utility/Tuning.h | 213 ++++ src/Utility/TypeUtils.h | 79 +- src/Utility/ViewUtils.h | 3 +- unit_tests/Communicate/BufferHandler.cpp | 24 +- unit_tests/Particle/CMakeLists.txt | 2 + unit_tests/Particle/ParticleBase.cpp | 1 + unit_tests/Particle/ParticleSendRecv.cpp | 10 +- unit_tests/Particle/ParticleUpdate.cpp | 830 ++++++++++++++++ .../Particle/ParticleUpdateNonuniform.cpp | 937 ++++++++++++++++++ 46 files changed, 4400 insertions(+), 698 deletions(-) create mode 100644 src/Particle/ParticleSort.h create mode 100644 src/Particle/SortBuffer.h create mode 100644 src/Utility/BufferView.h create mode 100644 src/Utility/Tuning.h create mode 100644 unit_tests/Particle/ParticleUpdate.cpp create mode 100644 unit_tests/Particle/ParticleUpdateNonuniform.cpp diff --git a/PR501_SPLIT_MAP.md b/PR501_SPLIT_MAP.md index ba0e88300..90dd59c21 100644 --- a/PR501_SPLIT_MAP.md +++ b/PR501_SPLIT_MAP.md @@ -363,3 +363,54 @@ Assessment: - This is a good standalone first PR. - It has a narrow file set and clear motivation: remove repeated allocation work from the PCG/preconditioner loop. - Recommended next validation before opening: run the relevant solver integration tests or CSCS GPU tests that originally showed PCG allocation/slowdown symptoms. + +### 2026-05-31: PR 2 Prototype - Communication And Particle Update Infrastructure + +Status: extracted on top of `pr501-pcg`. + +Branch: + +```text +pr501-communication-particle-update +``` + +Included PR501 areas: + +- Communication buffer/archive updates from `9507a796` and `f2820064` +- Particle layout rewrite and sort buffers from `2878b90b` +- Particle update regression tests from the particle portion of `f0560f76` +- Local integration fixes needed to keep this split independent from the later interpolation/FFT/NUFFT PRs + +Deliberately excluded: + +- Higher-order interpolation/gather/scatter APIs +- FFT backend and NUFFT transform integration +- PIF example/application changes + +Validation: + +```text +cmake -S . -B build-pr501-communication-debug -DCMAKE_BUILD_TYPE=Debug -DIPPL_ENABLE_UNIT_TESTS=ON -DIPPL_ENABLE_FFT=ON -DIPPL_DEFAULT_TEST_PROCS=1 +cmake --build build-pr501-communication-debug --parallel 8 +ctest --test-dir build-pr501-communication-debug --output-on-failure +/opt/homebrew/Cellar/open-mpi/5.0.8/bin/mpiexec -n 2 build-pr501-communication-debug/unit_tests/Particle/ParticleSendRecv +/opt/homebrew/Cellar/open-mpi/5.0.8/bin/mpiexec -n 2 build-pr501-communication-debug/unit_tests/Particle/ParticleUpdate +/opt/homebrew/Cellar/open-mpi/5.0.8/bin/mpiexec -n 2 build-pr501-communication-debug/unit_tests/Particle/ParticleUpdateNonuniform +``` + +Result: + +```text +100% tests passed, 0 tests failed out of 36 +Total Test time (real) = 38.64 sec + +2-rank ParticleSendRecv: passed +2-rank ParticleUpdate: passed +2-rank ParticleUpdateNonuniform: passed +``` + +Assessment: + +- This is a coherent second PR after `pr501-pcg`. +- It is still larger than the PCG split, but it avoids the major algorithmic dependencies from interpolation, FFT, NUFFT, and PIF. +- Recommended next validation before opening: run the same particle tests on CUDA/HIP builds and larger MPI rank counts, especially the original `ParticleSendRecv` failure configurations. diff --git a/src/Communicate/Archive.h b/src/Communicate/Archive.h index 6ec74f946..2e8561e36 100644 --- a/src/Communicate/Archive.h +++ b/src/Communicate/Archive.h @@ -9,6 +9,9 @@ // that they have type char and thus contain raw bytes, unlike other typed buffers // such as detail::FieldBufferData used by HaloCells. // +// On CUDA/HIP the internal buffer is allocated directly via cudaMalloc/hipMalloc +// so that the pointer is page-aligned (4K) and compatible with MPI IPC. +// #ifndef IPPL_ARCHIVE_H #define IPPL_ARCHIVE_H @@ -32,6 +35,7 @@ namespace ippl { using pointer_type = typename buffer_type::pointer_type; Archive(size_type size = 0); + ~Archive(); /*! * Serialize. @@ -40,6 +44,10 @@ namespace ippl { template void serialize(const Kokkos::View& view, size_type nsends); + template + void serialize(const Kokkos::View& view, const HashView& hash, + size_type nsends); + /*! * Serialize vector attributes * @@ -52,6 +60,10 @@ namespace ippl { void serialize(const Kokkos::View*, ViewArgs...>& view, size_type nsends); + template + void serialize(const Kokkos::View*, ViewArgs...>& view, + const HashView& hash, size_type nsends); + /*! * Deserialize. * @param view to put data to @@ -59,6 +71,14 @@ namespace ippl { template void deserialize(Kokkos::View& view, size_type nrecvs); + template + void deserialize(Kokkos::View& view, size_type offset, + size_type nrecvs); + + template + void deserialize(Kokkos::View*, ViewArgs...>& view, size_type offset, + size_type nrecvs); + /*! * Deserialize vector attributes * @@ -71,33 +91,50 @@ namespace ippl { void deserialize(Kokkos::View*, ViewArgs...>& view, size_type nrecvs); /*! - * @returns a pointer to the data of the buffer + * @returns a pointer to the data of the buffer. + * On GPU this is a page-aligned device pointer from cudaMalloc/hipMalloc. */ - pointer_type getBuffer() { return buffer_m.data(); } + pointer_type getBuffer() { return bufferData(); } /*! - * @returns the size of the buffer + * @returns the number of bytes written so far */ size_type getSize() const { return writepos_m; } - size_type getBufferSize() const { return buffer_m.size(); } - - void resizeBuffer(size_type size) { Kokkos::resize(buffer_m, size); } + /*! + * @returns the total capacity of the buffer in bytes + */ + size_type getBufferSize() const { return bufferSize(); } - void reallocBuffer(size_type size) { Kokkos::realloc(buffer_m, size); } + void resizeBuffer(size_type size); + void reallocBuffer(size_type size); void resetWritePos() { writepos_m = 0; } void resetReadPos() { readpos_m = 0; } - ~Archive() = default; - private: //! write position for serialization size_type writepos_m; //! read position for deserialization size_type readpos_m; - //! serialized data + +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + //! Raw device pointer from cudaMalloc/hipMalloc (page-aligned, IPC-safe) + pointer_type buffer_ptr_m = nullptr; + size_type buffer_size_m = 0; + + pointer_type bufferData() const { return buffer_ptr_m; } + size_type bufferSize() const { return buffer_size_m; } + + void gpuAlloc(size_type size); + void gpuFree(); +#else + //! serialized data (standard Kokkos view on CPU) buffer_type buffer_m; + + pointer_type bufferData() const { return buffer_m.data(); } + size_type bufferSize() const { return buffer_m.size(); } +#endif }; } // namespace detail } // namespace ippl diff --git a/src/Communicate/Archive.hpp b/src/Communicate/Archive.hpp index a55c33c7d..19e7b2423 100644 --- a/src/Communicate/Archive.hpp +++ b/src/Communicate/Archive.hpp @@ -4,6 +4,12 @@ // #include "Archive.h" +#if defined(KOKKOS_ENABLE_CUDA) +#include +#elif defined(KOKKOS_ENABLE_HIP) +#include +#endif + namespace ippl { namespace detail { KOKKOS_INLINE_FUNCTION void copyBytes(char* dst, const char* src, size_t size) { @@ -12,117 +18,364 @@ namespace ippl { } } + template + struct SerializeHashFunctor { + const T* view_data; + HashView hash; + BufferPtr buf; + size_t elem_size; + size_t wpos; + + KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const { + const char* src = reinterpret_cast(view_data + hash(i)); + char* dst = buf + i * elem_size + wpos; + copyBytes(dst, src, elem_size); + } + }; + + template + struct SerializeHashVectorFunctor { + const Vector* view_data; + HashView hash; + BufferPtr buf; + size_t elem_size; + size_t wpos; + + KOKKOS_INLINE_FUNCTION void operator()(const size_t i, const size_t d) const { + const Vector* vec = view_data + hash(i); + const T value = (*vec)[d]; + const char* src = reinterpret_cast(&value); + char* dst = buf + (Dim * i + d) * elem_size + wpos; + copyBytes(dst, src, elem_size); + } + }; + + // ================================================================= + // GPU buffer management + // ================================================================= + +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + + template + void Archive::gpuAlloc(size_type size) { + if (size == 0) return; +#if defined(KOKKOS_ENABLE_HIP) + // HSA IPC (used by Cray MPICH for large-message GPU transfers) + // requires allocation sizes to be multiples of the GPU page + // granularity (64 KB on MI250X / MI300X). Without this, + // hsa_amd_ipc_memory_attach fails with INVALID_ARGUMENT. + static constexpr size_type kGranularity = 65536; + size = ((size + kGranularity - 1) / kGranularity) * kGranularity; +#endif + void* ptr = nullptr; +#if defined(KOKKOS_ENABLE_CUDA) + cudaMalloc(&ptr, size); +#else + hipMalloc(&ptr, size); +#endif + buffer_ptr_m = static_cast(ptr); + buffer_size_m = size; + } + + template + void Archive::gpuFree() { + if (buffer_ptr_m) { +#if defined(KOKKOS_ENABLE_CUDA) + cudaFree(buffer_ptr_m); +#else + hipFree(buffer_ptr_m); +#endif + buffer_ptr_m = nullptr; + buffer_size_m = 0; + } + } + + template + Archive::Archive(size_type size) + : writepos_m(0) + , readpos_m(0) { + gpuAlloc(size); + } + + template + Archive::~Archive() { + gpuFree(); + } + + template + void Archive::resizeBuffer(size_type size) { + if (size <= buffer_size_m) return; + +#if defined(KOKKOS_ENABLE_HIP) + static constexpr size_type kGranularity = 65536; + size = ((size + kGranularity - 1) / kGranularity) * kGranularity; +#endif + pointer_type new_ptr = nullptr; + void* vptr = nullptr; +#if defined(KOKKOS_ENABLE_CUDA) + cudaMalloc(&vptr, size); +#else + hipMalloc(&vptr, size); +#endif + new_ptr = static_cast(vptr); + + if (buffer_ptr_m && buffer_size_m > 0) { +#if defined(KOKKOS_ENABLE_CUDA) + cudaMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, cudaMemcpyDeviceToDevice); + cudaFree(buffer_ptr_m); +#else + hipMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, hipMemcpyDeviceToDevice); + hipFree(buffer_ptr_m); +#endif + } + + buffer_ptr_m = new_ptr; + buffer_size_m = size; + } + + template + void Archive::reallocBuffer(size_type size) { + gpuFree(); + gpuAlloc(size); + } + +#else // CPU path + template Archive::Archive(size_type size) : writepos_m(0) , readpos_m(0) , buffer_m("buffer", size) {} - // ----------------------------------- - // Scalar serialize + template + Archive::~Archive() = default; + + template + void Archive::resizeBuffer(size_type size) { + Kokkos::resize(buffer_m, size); + } + + template + void Archive::reallocBuffer(size_type size) { + Kokkos::realloc(buffer_m, size); + } + +#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP + + // ================================================================= + // Serialize — scalar + // ================================================================= + template template void Archive::serialize(const Kokkos::View& view, size_type nsends) { - constexpr size_t size = sizeof(T); - char* dst_ptr = (char*)(buffer_m.data()) + writepos_m; - char* src_ptr = (char*)(view.data()); - assert(writepos_m + (nsends * size) <= buffer_m.size()); - // construct temp views of the src/dst buffers of the correct size (bytes) - using src_view_type = - Kokkos::View::memory_space, - Kokkos::MemoryTraits>; - using dst_view_type = - Kokkos::View>; - src_view_type src_view(src_ptr, size * nsends); - dst_view_type dst_view(dst_ptr, size * nsends); - Kokkos::deep_copy(dst_view, src_view); + using exec_space = typename Kokkos::View::execution_space; + using policy_type = Kokkos::RangePolicy; + + size_t size = sizeof(T); + auto base = bufferData(); + auto writepos = writepos_m; + Kokkos::parallel_for( + "Archive::serialize()", policy_type(0, nsends), KOKKOS_LAMBDA(const size_type i) { + const char* src = reinterpret_cast(view.data() + i); + char* dst = base + i * size + writepos; + copyBytes(dst, src, size); + }); Kokkos::fence(); - writepos_m += (nsends * size); + writepos_m += size * nsends; } - // ----------------------------------- - // Vector serialize + // ================================================================= + // Serialize — scalar with hash + // ================================================================= + + template + template + void Archive::serialize(const Kokkos::View& view, + const HashView& hash, size_type nsends) { + using exec_space = HashView::execution_space; + using policy_type = Kokkos::RangePolicy; + using BufferPtr = pointer_type; + + SerializeHashFunctor f{view.data(), hash, bufferData(), + sizeof(T), writepos_m}; + + Kokkos::parallel_for("Archive::serialize(hash)", policy_type(0, nsends), f); + Kokkos::fence(); + writepos_m += sizeof(T) * nsends; + } + + // ================================================================= + // Serialize — vector + // ================================================================= + template template void Archive::serialize( const Kokkos::View*, ViewArgs...>& view, size_type nsends) { - constexpr size_t size = sizeof(T); - char* dst_ptr = (char*)(buffer_m.data()); - ippl::Vector* src_ptr = view.data(); - auto wp = writepos_m; - // The Kokkos range policies expect int64 - // so we have to explicitly specify size_type (uint64) using exec_space = typename Kokkos::View::execution_space; + + size_t size = sizeof(T); + auto base = bufferData(); + auto writepos = writepos_m; + // Default index type for range policies is int64, + // so we have to explicitly specify size_type (uint64) using mdrange_t = Kokkos::MDRangePolicy, Kokkos::IndexType, exec_space>; Kokkos::parallel_for( - "Archive::serialize()", mdrange_t({0, 0}, {(long int)nsends, Dim}), + "Archive::serialize()", + // The constructor for Kokkos range policies always + // expects int64 regardless of index type provided + // by template parameters, so the typecast is necessary + // to avoid compiler warnings + mdrange_t({0, 0}, {(long int)nsends, Dim}), KOKKOS_LAMBDA(const size_type i, const size_t d) { - const char* src = reinterpret_cast(&src_ptr[i][d]); - char* dst = dst_ptr + (Dim * i + d) * size + wp; + const T value = view.data()[i][d]; + const char* src = reinterpret_cast(&value); + char* dst = base + (Dim * i + d) * size + writepos; copyBytes(dst, src, size); }); + Kokkos::fence(); + writepos_m += Dim * size * nsends; + } + + // ================================================================= + // Serialize — vector with hash + // ================================================================= + template + template + void Archive::serialize( + const Kokkos::View*, ViewArgs...>& view, const HashView& hash, + size_type nsends) { + using exec_space = typename HashView::execution_space; + size_t size = sizeof(T); + using BufferPtr = pointer_type; + using mdrange_t = + Kokkos::MDRangePolicy, Kokkos::IndexType, exec_space>; + + SerializeHashVectorFunctor f{ + view.data(), hash, bufferData(), size, writepos_m}; + + Kokkos::parallel_for("Archive::serialize(hash, vector)", + mdrange_t({0, 0}, {(long int)nsends, Dim}), f); Kokkos::fence(); writepos_m += Dim * size * nsends; } - // ----------------------------------- - // Scalar Deserialize + // ================================================================= + // Deserialize — scalar + // ================================================================= + template template void Archive::deserialize(Kokkos::View& view, size_type nrecvs) { - // if we have to enlarge the destination view + using exec_space = typename Kokkos::View::execution_space; + using policy_type = Kokkos::RangePolicy; + + size_t size = sizeof(T); if (nrecvs > view.extent(0)) { Kokkos::realloc(view, nrecvs); } - // - constexpr size_t size = sizeof(T); - char* src_ptr = (char*)(buffer_m.data()) + readpos_m; - char* dst_ptr = (char*)(view.data()); - assert(readpos_m + (nrecvs * size) <= buffer_m.size()); - // construct temp views of the src/dst buffers of the correct size (bytes) - using src_view_type = - Kokkos::View>; - using dst_view_type = - Kokkos::View::memory_space, - Kokkos::MemoryTraits>; - src_view_type src_view(src_ptr, size * nrecvs); - dst_view_type dst_view(dst_ptr, size * nrecvs); - Kokkos::deep_copy(dst_view, src_view); + auto base = bufferData(); + auto readpos = readpos_m; + Kokkos::parallel_for( + "Archive::deserialize()", policy_type(0, nrecvs), KOKKOS_LAMBDA(const size_type i) { + const char* src = base + i * size + readpos; + char* dst = reinterpret_cast(view.data() + i); + copyBytes(dst, src, size); + }); + // Wait for deserialization kernel to complete + // (as with serialization kernels) Kokkos::fence(); - readpos_m += (nrecvs * size); + readpos_m += size * nrecvs; } - // ----------------------------------- - // Vector Deserialize + // ================================================================= + // Deserialize — vector + // ================================================================= + template template void Archive::deserialize(Kokkos::View*, ViewArgs...>& view, - size_type nrecvs) - { - // if we have to enlarge the destination view + size_type nrecvs) { + using exec_space = typename Kokkos::View::execution_space; + + size_t size = sizeof(T); if (nrecvs > view.extent(0)) { Kokkos::realloc(view, nrecvs); } - // - constexpr size_t size = sizeof(T); - char* src_ptr = (char*)(buffer_m.data()); - ippl::Vector* dst_ptr = view.data(); - auto rp = readpos_m; - using exec_space = typename Kokkos::View::execution_space; using mdrange_t = Kokkos::MDRangePolicy, Kokkos::IndexType, exec_space>; + auto base = bufferData(); + auto readpos = readpos_m; Kokkos::parallel_for( "Archive::deserialize()", mdrange_t({0, 0}, {(long int)nrecvs, Dim}), KOKKOS_LAMBDA(const size_type i, const size_t d) { - const char* src = src_ptr + (Dim * i + d) * size + rp; - char* dst = reinterpret_cast(&dst_ptr[i][d]); + const char* src = base + (Dim * i + d) * size + readpos; + T value; + char* dst = reinterpret_cast(&value); + copyBytes(dst, src, size); + view.data()[i](d) = value; + }); + Kokkos::fence(); + readpos_m += Dim * size * nrecvs; + } + + // ================================================================= + // Deserialize — scalar with offset + // ================================================================= + + template + template + void Archive::deserialize(Kokkos::View& view, + size_type offset, size_type nrecvs) { + using exec_space = typename Kokkos::View::execution_space; + using policy_type = Kokkos::RangePolicy; + size_t size = sizeof(T); + if (offset + nrecvs > view.extent(0)) { + Kokkos::resize(view, offset + nrecvs); + } + auto base = bufferData(); + auto readpos = readpos_m; + Kokkos::parallel_for( + "Archive::deserialize(offset)", policy_type(0, nrecvs), + KOKKOS_LAMBDA(const size_type i) { + const char* src = base + i * size + readpos; + char* dst = reinterpret_cast(view.data() + offset + i); + copyBytes(dst, src, size); + }); + Kokkos::fence(); + readpos_m += size * nrecvs; + } + + // ================================================================= + // Deserialize — vector with offset + // ================================================================= + + template + template + void Archive::deserialize(Kokkos::View*, ViewArgs...>& view, + size_type offset, size_type nrecvs) { + using exec_space = typename Kokkos::View::execution_space; + size_t size = sizeof(T); + if (offset + nrecvs > view.extent(0)) { + Kokkos::resize(view, offset + nrecvs); + } + using mdrange_t = + Kokkos::MDRangePolicy, Kokkos::IndexType, exec_space>; + auto base = bufferData(); + auto readpos = readpos_m; + Kokkos::parallel_for( + "Archive::deserialize(offset, vector)", mdrange_t({0, 0}, {(long int)nrecvs, Dim}), + KOKKOS_LAMBDA(const size_type i, const size_t d) { + const char* src = base + (Dim * i + d) * size + readpos; + T value; + char* dst = reinterpret_cast(&value); copyBytes(dst, src, size); + view.data()[offset + i](d) = value; }); Kokkos::fence(); readpos_m += Dim * size * nrecvs; diff --git a/src/Communicate/BufferHandler.hpp b/src/Communicate/BufferHandler.hpp index c6d57f0a3..e27369c06 100644 --- a/src/Communicate/BufferHandler.hpp +++ b/src/Communicate/BufferHandler.hpp @@ -1,6 +1,8 @@ #ifndef IPPL_BUFFER_HANDLER_HPP #define IPPL_BUFFER_HANDLER_HPP +#include + namespace ippl { template @@ -11,6 +13,15 @@ namespace ippl { DefaultBufferHandler::getBuffer(size_type size, double overallocation) { size_type requiredSize = static_cast(size * overallocation); + // Round up to page granularity to avoid 0-byte or tiny allocations + // and to align with the GPU-aware MPI registration cache. + constexpr size_type PAGE_SIZE = 4096; + if (requiredSize < PAGE_SIZE) { + requiredSize = PAGE_SIZE; + } else { + requiredSize = ((requiredSize + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE; + } + auto freeBuffer = findFreeBuffer(requiredSize); if (freeBuffer != nullptr) { return getFreeBuffer(freeBuffer); @@ -120,17 +131,12 @@ namespace ippl { template typename DefaultBufferHandler::buffer_type DefaultBufferHandler::reallocateLargestFreeBuffer(size_type requiredSize) { - auto largest_it = std::prev(free_buffers.end()); - buffer_type buffer = *largest_it; - - freeSize_m -= buffer->getBufferSize(); - usedSize_m += requiredSize; - - free_buffers.erase(buffer); - buffer->reallocBuffer(requiredSize); - - used_buffers.insert(buffer); - return buffer; + // Always allocate a + // fresh buffer instead of reallocating the largest free one: a + // free + alloc cycle can release the device pointer that a GPU-aware + // MPI's registration cache still holds. Keeping the old buffers in the free + // pool and returning a new buffer at a new address sidesteps this. + return allocateNewBuffer(requiredSize); } template @@ -145,4 +151,4 @@ namespace ippl { } // namespace ippl -#endif +#endif \ No newline at end of file diff --git a/src/Communicate/Buffers.cpp b/src/Communicate/Buffers.cpp index 026633fec..e99100bc2 100644 --- a/src/Communicate/Buffers.cpp +++ b/src/Communicate/Buffers.cpp @@ -32,13 +32,13 @@ namespace ippl { } void Communicator::deleteAllBuffers() { - buffer_handlers_m->forAll([](BufferHandler&& bh) { + getBufferHandler().forAll([](BufferHandler&& bh) { bh.deleteAllBuffers(); }); } void Communicator::freeAllBuffers() { - buffer_handlers_m->forAll([](BufferHandler&& bh) { + getBufferHandler().forAll([](BufferHandler&& bh) { bh.freeAllBuffers(); }); } diff --git a/src/Communicate/Buffers.hpp b/src/Communicate/Buffers.hpp index 98c0b513f..597461b4d 100644 --- a/src/Communicate/Buffers.hpp +++ b/src/Communicate/Buffers.hpp @@ -20,21 +20,23 @@ // exchanging particle data between ranks. // +#include "Utility/IpplTimings.h" namespace ippl { namespace mpi { template Communicator::buffer_type Communicator::getBuffer(size_type size, double overallocation) { - auto& buffer_handler = buffer_handlers_m->get(); + auto& buffer_handler = getBufferHandler().get(); - return buffer_handler.getBuffer(size * sizeof(T), - std::max(overallocation, defaultOveralloc_m)); + auto buf = buffer_handler.getBuffer(size * sizeof(T), + std::max(overallocation, defaultOveralloc_m)); + return buf; } template void Communicator::freeBuffer(Communicator::buffer_type buffer) { - auto& buffer_handler = buffer_handlers_m->get(); + auto& buffer_handler = getBufferHandler().get(); buffer_handler.freeBuffer(buffer); } diff --git a/src/Communicate/Communicator.cpp b/src/Communicate/Communicator.cpp index 2c33f939f..e1a8b777b 100644 --- a/src/Communicate/Communicator.cpp +++ b/src/Communicate/Communicator.cpp @@ -3,25 +3,27 @@ namespace ippl::mpi { + namespace { + // Populate rank_m / size_m from the live communicator. + void cacheRankAndSize(const MPI_Comm& comm, int& rank, int& size) { + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &size); + } + } // namespace + Communicator::Communicator() - : buffer_handlers_m(get_buffer_handler_instance()) - , comm_m(new MPI_Comm(MPI_COMM_WORLD)) { - MPI_Comm_rank(*comm_m, &rank_m); - MPI_Comm_size(*comm_m, &size_m); + : comm_m(new MPI_Comm(MPI_COMM_WORLD)) { + cacheRankAndSize(*comm_m, rank_m, size_m); } Communicator::Communicator(MPI_Comm comm) { - buffer_handlers_m = get_buffer_handler_instance(); - comm_m = std::make_shared(comm); - MPI_Comm_rank(*comm_m, &rank_m); - MPI_Comm_size(*comm_m, &size_m); + comm_m = std::make_shared(comm); + cacheRankAndSize(*comm_m, rank_m, size_m); } Communicator& Communicator::operator=(MPI_Comm comm) { - buffer_handlers_m = get_buffer_handler_instance(); - comm_m = std::make_shared(comm); - MPI_Comm_rank(*comm_m, &rank_m); - MPI_Comm_size(*comm_m, &size_m); + comm_m = std::make_shared(comm); + cacheRankAndSize(*comm_m, rank_m, size_m); return *this; } @@ -41,14 +43,4 @@ namespace ippl::mpi { return (flag != 0); } - // --------------------------------------- - // singleton access to buffer manager - // --------------------------------------- - std::shared_ptr Communicator::get_buffer_handler_instance() { - static std::shared_ptr comm_buff_handler_ptr{nullptr}; - if (comm_buff_handler_ptr == nullptr) { - comm_buff_handler_ptr = std::make_shared(); - } - return comm_buff_handler_ptr; - } } // namespace ippl::mpi diff --git a/src/Communicate/Communicator.h b/src/Communicate/Communicator.h index 9819ab3bd..7685d20d5 100644 --- a/src/Communicate/Communicator.h +++ b/src/Communicate/Communicator.h @@ -15,6 +15,7 @@ //////////////////////////////////////////////// // For message size check; see below + #include #include @@ -27,7 +28,6 @@ namespace ippl { namespace mpi { - class Communicator : public TagMaker { public: Communicator(); @@ -157,16 +157,21 @@ namespace ippl { const MPI_Comm& getCommunicator() const noexcept { return *comm_m; } + // MPI uses int for byte counts; messages exceeding INT_MAX must + // be split. This shared check aborts the run with a single + // diagnostic instead of silently truncating. + void assertMessageSize(size_type msize) const { + if (msize > static_cast(INT_MAX)) { + std::cerr << "Communicator: message size " << msize + << " bytes exceeds INT_MAX (" << INT_MAX << ")\n"; + MPI_Abort(*comm_m, -1); + } + } + template void recv(int src, int tag, Buffer& buffer, Archive& ar, size_type msize, size_type nrecvs) { - // Temporary fix. MPI communication seems to have problems when the - // count argument exceeds the range of int, so large messages should - // be split into smaller messages - if (msize > INT_MAX) { - std::cerr << "Message size exceeds range of int" << std::endl; - this->abort(); - } + assertMessageSize(msize); MPI_Status status; MPI_Recv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &status); @@ -176,20 +181,27 @@ namespace ippl { template void isend(int dest, int tag, Buffer& buffer, Archive& ar, MPI_Request& request, size_type nsends) { - if (ar.getSize() > INT_MAX) { - std::cerr << "Message size exceeds range of int" << std::endl; - this->abort(); - } + assertMessageSize(ar.getSize()); buffer.serialize(ar, nsends); MPI_Isend(ar.getBuffer(), ar.getSize(), MPI_BYTE, dest, tag, *comm_m, &request); } + template + void isend(int dest, int tag, Archive& ar, MPI_Request& request) { + assertMessageSize(ar.getSize()); + MPI_Isend(ar.getBuffer(), ar.getSize(), MPI_BYTE, dest, tag, *comm_m, &request); + } + + template + void recv(int src, int tag, Archive& ar, size_type msize) { + assertMessageSize(msize); + MPI_Status status; + MPI_Recv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &status); + } + template void irecv(int src, int tag, Archive& ar, MPI_Request& request, size_type msize) { - if (msize > INT_MAX) { - std::cerr << "Message size exceeds range of int" << std::endl; - this->abort(); - } + assertMessageSize(msize); MPI_Irecv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &request); } @@ -201,7 +213,11 @@ namespace ippl { std::vector gatherLogsFromAllRanks(const std::vector& localLogs); void writeLogsToFile(const std::vector& allLogs, const std::string& filename); - std::shared_ptr buffer_handlers_m; + static buffer_handler_type& getBufferHandler() { + static buffer_handler_type handler; + return handler; + } + double defaultOveralloc_m = 1.0; ///////////////////////////////////////////////////////////////////////////////////// @@ -210,12 +226,10 @@ namespace ippl { std::shared_ptr comm_m; int size_m; int rank_m; - - public: - std::shared_ptr get_buffer_handler_instance(); }; } // namespace mpi + } // namespace ippl #include "Communicate/Collectives.hpp" diff --git a/src/Communicate/CommunicatorLogging.cpp b/src/Communicate/CommunicatorLogging.cpp index 541ff4074..39c0197e8 100644 --- a/src/Communicate/CommunicatorLogging.cpp +++ b/src/Communicate/CommunicatorLogging.cpp @@ -7,8 +7,16 @@ #include "Communicate/Communicator.h" #include "Communicate/LogEntry.h" +#include "Communicate/LoggingBufferHandler.h" namespace ippl::mpi { + + template + struct is_a_logger : std::false_type {}; + + template + struct is_a_logger> : std::true_type {}; + void Communicator::printLogs(const std::string& filename) { std::vector localLogs = gatherLocalLogs(); @@ -24,20 +32,17 @@ namespace ippl::mpi { } } - template - struct is_a_logger : std::false_type {}; - - template - struct is_a_logger > : std::true_type {}; - std::vector Communicator::gatherLocalLogs() { std::vector localLogs; - if constexpr (is_a_logger::value) { - buffer_handlers_m->forAll([&](auto& loggingHandler) { - const auto& logs = loggingHandler.getLogs(); + + getBufferHandler().forAll([&](auto& handler) { + using handler_t = std::decay_t; + if constexpr (is_a_logger::value) { + const auto& logs = handler.getLogs(); localLogs.insert(localLogs.end(), logs.begin(), logs.end()); - }); - } + } + }); + return localLogs; } @@ -88,11 +93,9 @@ namespace ippl::mpi { size_t offset = 0; while (offset < buffer.size()) { - LogEntry logEntry = LogEntry::deserialize(buffer, offset); - - logs.push_back(logEntry); - - offset += logEntry.serialize().size(); + // deserializeAdvance walks offset past the consumed bytes - + // avoids the O(N^2) blowup of re-serializing each entry + logs.push_back(LogEntry::deserializeAdvance(buffer, offset)); } return logs; } diff --git a/src/Communicate/Environment.cpp b/src/Communicate/Environment.cpp index 8e19df365..706c821f1 100644 --- a/src/Communicate/Environment.cpp +++ b/src/Communicate/Environment.cpp @@ -9,9 +9,23 @@ namespace ippl { namespace mpi { Environment::Environment(int& argc, char**& argv, const MPI_Comm& comm) - : comm_m(comm) { + : comm_m(comm) + , threadMultiple_m(false) { if (!initialized()) { - MPI_Init(&argc, &argv); + int provided = MPI_THREAD_SINGLE; + int rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); + if (rc != MPI_SUCCESS) { + std::cerr << "MPI_Init_thread failed (rc=" << rc << ")" << std::endl; + std::exit(EXIT_FAILURE); + } + threadMultiple_m = (provided >= MPI_THREAD_MULTIPLE); + if (!threadMultiple_m) { + int rank = 0; + MPI_Comm_rank(comm_m, &rank); + if (rank == 0) { + std::cerr << "MPI doesn't support MPI_THREAD_MULTIPLE!" << std::endl; + } + } } } diff --git a/src/Communicate/Environment.h b/src/Communicate/Environment.h index fd03a9ee4..bad195df4 100644 --- a/src/Communicate/Environment.h +++ b/src/Communicate/Environment.h @@ -23,10 +23,13 @@ namespace ippl { static bool finalized(); + bool threadMultiple() noexcept { return threadMultiple_m; } + void abort(int errorcode = -1) noexcept { MPI_Abort(comm_m, errorcode); } private: MPI_Comm comm_m; + bool threadMultiple_m; }; } // namespace mpi } // namespace ippl diff --git a/src/Communicate/LogEntry.cpp b/src/Communicate/LogEntry.cpp index 444221d6d..66539e879 100644 --- a/src/Communicate/LogEntry.cpp +++ b/src/Communicate/LogEntry.cpp @@ -40,28 +40,31 @@ namespace ippl { return buffer; } - LogEntry LogEntry::deserialize(const std::vector& buffer, size_t offset) { + LogEntry LogEntry::deserializeAdvance(const std::vector& buffer, size_t& offset) { LogEntry entry; - size_t current_pos = offset; - entry.methodName = deserializeString(buffer, current_pos); - entry.usedSize = deserializeBasicType(buffer, current_pos); - entry.freeSize = deserializeBasicType(buffer, current_pos); - entry.memorySpace = deserializeString(buffer, current_pos); - entry.rank = deserializeBasicType(buffer, current_pos); + entry.methodName = deserializeString(buffer, offset); + entry.usedSize = deserializeBasicType(buffer, offset); + entry.freeSize = deserializeBasicType(buffer, offset); + entry.memorySpace = deserializeString(buffer, offset); + entry.rank = deserializeBasicType(buffer, offset); - auto duration = deserializeBasicType(buffer, current_pos); + auto duration = deserializeBasicType(buffer, offset); entry.timestamp = std::chrono::time_point( std::chrono::high_resolution_clock::duration(duration)); - size_t mapSize = deserializeBasicType(buffer, current_pos); + size_t mapSize = deserializeBasicType(buffer, offset); for (size_t i = 0; i < mapSize; ++i) { - std::string key = deserializeString(buffer, current_pos); - std::string value = deserializeString(buffer, current_pos); + std::string key = deserializeString(buffer, offset); + std::string value = deserializeString(buffer, offset); entry.parameters[key] = value; } return entry; } + LogEntry LogEntry::deserialize(const std::vector& buffer, size_t offset) { + return deserializeAdvance(buffer, offset); + } + } // namespace ippl diff --git a/src/Communicate/LogEntry.h b/src/Communicate/LogEntry.h index a853c3431..1929915c3 100644 --- a/src/Communicate/LogEntry.h +++ b/src/Communicate/LogEntry.h @@ -20,6 +20,11 @@ namespace ippl { std::vector serialize() const; static LogEntry deserialize(const std::vector& buffer, size_t offset = 0); + + /// Variant of deserialize() that advances the caller's offset past + /// the just-read entry. Lets callers walk a byte-stream of entries + /// without re-serializing each one to compute its size. + static LogEntry deserializeAdvance(const std::vector& buffer, size_t& offset); }; template diff --git a/src/Communicate/LoggingBufferHandler.h b/src/Communicate/LoggingBufferHandler.h index f6a0bd5d6..95fb418cd 100644 --- a/src/Communicate/LoggingBufferHandler.h +++ b/src/Communicate/LoggingBufferHandler.h @@ -1,7 +1,6 @@ #ifndef IPPL_LOGGING_BUFFER_HANDLER_H #define IPPL_LOGGING_BUFFER_HANDLER_H -#include #include #include #include diff --git a/src/Field/BareField.hpp b/src/Field/BareField.hpp index 695576af3..14a36eb45 100644 --- a/src/Field/BareField.hpp +++ b/src/Field/BareField.hpp @@ -146,7 +146,7 @@ namespace ippl { template void BareField::fillHalo() { if (layout_m->comm.size() > 1) { - halo_m.fillHalo(dview_m, layout_m); + halo_m.fillHalo(dview_m, layout_m, nghost_m); } if (layout_m->isAllPeriodic_m) { using Op = typename detail::HaloCells::assign; @@ -157,7 +157,7 @@ namespace ippl { template void BareField::accumulateHalo() { if (layout_m->comm.size() > 1) { - halo_m.accumulateHalo(dview_m, layout_m); + halo_m.accumulateHalo(dview_m, layout_m, nghost_m); } if (layout_m->isAllPeriodic_m) { using Op = typename detail::HaloCells::rhs_plus_assign; diff --git a/src/Field/HaloCells.h b/src/Field/HaloCells.h index c4d87b9bd..6e7606083 100644 --- a/src/Field/HaloCells.h +++ b/src/Field/HaloCells.h @@ -57,7 +57,7 @@ namespace ippl { * @param view the original field data * @param layout the field layout storing the domain decomposition */ - void accumulateHalo(view_type& view, Layout_t* layout); + void accumulateHalo(view_type& view, Layout_t* layout, int nghost); /*! * Send halo data to internal cells for only the physical cells @@ -75,7 +75,7 @@ namespace ippl { * @param view the original field data * @param layout the field layout storing the domain decomposition */ - void fillHalo(view_type&, Layout_t* layout); + void fillHalo(view_type&, Layout_t* layout, int nghost); /*! * Pack the field data to be sent into a contiguous array. @@ -139,7 +139,8 @@ namespace ippl { * unpack function call */ template - void exchangeBoundaries(view_type& view, Layout_t* layout, SendOrder order, int nghost = 1); + void exchangeBoundaries(view_type& view, Layout_t* layout, SendOrder order, + int nghost); /*! * Extract the subview of the original data. This does not copy. diff --git a/src/Field/HaloCells.hpp b/src/Field/HaloCells.hpp index eb56642ce..d99ea638f 100644 --- a/src/Field/HaloCells.hpp +++ b/src/Field/HaloCells.hpp @@ -7,6 +7,7 @@ #include #include "Utility/IpplException.h" +#include "Utility/ParallelDispatch.h" #include "Communicate/Communicator.h" @@ -16,8 +17,8 @@ namespace ippl { HaloCells::HaloCells() {} template - void HaloCells::accumulateHalo(view_type& view, Layout_t* layout) { - exchangeBoundaries(view, layout, HALO_TO_INTERNAL); + void HaloCells::accumulateHalo(view_type& view, Layout_t* layout, int nghost) { + exchangeBoundaries(view, layout, HALO_TO_INTERNAL, nghost); } template @@ -25,8 +26,8 @@ namespace ippl { exchangeBoundaries(view, layout, HALO_TO_INTERNAL_NOGHOST, nghost); } template - void HaloCells::fillHalo(view_type& view, Layout_t* layout) { - exchangeBoundaries(view, layout, INTERNAL_TO_HALO); + void HaloCells::fillHalo(view_type& view, Layout_t* layout, int nghost) { + exchangeBoundaries(view, layout, INTERNAL_TO_HALO, nghost); } template @@ -45,9 +46,9 @@ namespace ippl { auto ldom = layout->getLocalNDIndex(); for (const auto& axis : ldom) { if ((axis.length() == 1) && (Dim != 1)) { - throw std::runtime_error( - "HaloCells: Cannot do neighbour exchange when domain decomposition " - "contains planes!"); + throw IpplException( + "HaloCells::exchangeBoundaries", + "Cannot do neighbour exchange when domain decomposition contains planes."); } } @@ -62,7 +63,7 @@ namespace ippl { totalRequests += componentNeighbors.size(); } - int me=Comm->rank(); + int me = Comm->rank(); using memory_space = typename view_type::memory_space; using buffer_type = mpi::Communicator::buffer_type; diff --git a/src/FieldLayout/FieldLayout.h b/src/FieldLayout/FieldLayout.h index 780040e8b..c8f1c1e2b 100644 --- a/src/FieldLayout/FieldLayout.h +++ b/src/FieldLayout/FieldLayout.h @@ -201,7 +201,7 @@ namespace ippl { FieldLayout(const mpi::Communicator& = MPI_COMM_WORLD); FieldLayout(mpi::Communicator, const NDIndex& domain, std::array decomp, - bool isAllPeriodic = false); + bool isAllPeriodic = false, int nghost = 1); // Destructor: Everything deletes itself automatically ... the base // class destructors inform all the FieldLayoutUser's we're going away. @@ -213,38 +213,29 @@ namespace ippl { // FieldLayout constructors: void initialize(const NDIndex& domain, std::array decomp, - bool isAllPeriodic = false); + bool isAllPeriodic = false, int nghost = 1); // Return the domain. const NDIndex& getDomain() const { return gDomain_m; } - // Compare FieldLayouts to see if they represent the same domain; if - // dimensionalities are different, the NDIndex operator==() will return - // false: + // Compare FieldLayouts. Different dimensionalities or different global + // domains are not equal; same global domain but different per-rank + // local-domain decompositions are also not equal. template bool operator==(const FieldLayout& x) const { - - // Throw exception if the domains are not the same - if (gDomain_m != x.getDomain()) { - throw std::runtime_error("FieldLayout: only FieldLayouts with the same global domain should be compared"); - } - - return gDomain_m == x.getDomain(); - } - - bool operator==(const FieldLayout& x) const { - - // Throw exception if the domains are not the same - if (gDomain_m != x.getDomain()) { - throw std::runtime_error("FieldLayout: only FieldLayouts with the same global domain should be compared"); - } - - for (unsigned int i = 0; i < Dim; ++i) { - if (hLocalDomains_m(comm.rank())[i] != x.getLocalNDIndex()[i]) { + if constexpr (Dim != Dim2) { + return false; + } else { + if (gDomain_m != x.getDomain()) { return false; } + for (unsigned int i = 0; i < Dim; ++i) { + if (hLocalDomains_m(comm.rank())[i] != x.getLocalNDIndex()[i]) { + return false; + } + } + return true; } - return true; } // for the requested dimension, report if the distribution is @@ -330,7 +321,7 @@ namespace ippl { * Finds all neighboring ranks based on the field layout * @param nghost number of ghost cells (default 1) */ - void findNeighbors(int nghost = 1); + void findNeighbors(int nghost); /*! * Adds a neighbor to the neighbor list @@ -389,6 +380,9 @@ namespace ippl { // Minimum width of all the local domains for each dimension unsigned int minWidth_m[Dim]; + // Nghost needed for computing send/receive ranges + int nghost_m; + void calcWidths(); }; diff --git a/src/FieldLayout/FieldLayout.hpp b/src/FieldLayout/FieldLayout.hpp index 2bcb1bcd1..ad6ae5a53 100644 --- a/src/FieldLayout/FieldLayout.hpp +++ b/src/FieldLayout/FieldLayout.hpp @@ -37,7 +37,8 @@ namespace ippl { FieldLayout::FieldLayout(const mpi::Communicator& communicator) : comm(communicator) , dLocalDomains_m("local domains (device)", 0) - , hLocalDomains_m(Kokkos::create_mirror_view(dLocalDomains_m)) { + , hLocalDomains_m(Kokkos::create_mirror_view(dLocalDomains_m)) + , nghost_m(1) { for (unsigned int d = 0; d < Dim; ++d) { minWidth_m[d] = 0; } @@ -45,9 +46,9 @@ namespace ippl { template FieldLayout::FieldLayout(mpi::Communicator communicator, const NDIndex& domain, - std::array isParallel, bool isAllPeriodic) + std::array isParallel, bool isAllPeriodic, int nghost) : FieldLayout(communicator) { - initialize(domain, isParallel, isAllPeriodic); + initialize(domain, isParallel, isAllPeriodic, nghost); } template @@ -60,7 +61,7 @@ namespace ippl { hLocalDomains_m(i) = domains[i]; } - findNeighbors(); + findNeighbors(nghost_m); Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m); @@ -69,7 +70,7 @@ namespace ippl { template void FieldLayout::initialize(const NDIndex& domain, std::array isParallel, - bool isAllPeriodic) { + bool isAllPeriodic, int nghost) { int nRanks = comm.size(); gDomain_m = domain; @@ -78,11 +79,17 @@ namespace ippl { isParallelDim_m = isParallel; + nghost_m = nghost; + if (nRanks < 2) { Kokkos::resize(dLocalDomains_m, nRanks); Kokkos::resize(hLocalDomains_m, nRanks); hLocalDomains_m(0) = domain; Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m); + // Even on a single rank we must populate minWidth_m so that + // getDistribution(d) returns the right answer; without this the + // serial-build path silently reports every dim as parallel. + calcWidths(); return; } @@ -95,8 +102,11 @@ namespace ippl { } if (totparelems < nRanks) { - throw std::runtime_error("FieldLayout:initialize: domain can only be partitioned in to " - + std::to_string(totparelems) + " local domains, but there are " + std::to_string(nRanks) + " ranks, decrease the number of ranks or increase the domain."); + throw std::runtime_error( + "FieldLayout:initialize: domain can only be partitioned in to " + + std::to_string(totparelems) + " local domains, but there are " + + std::to_string(nRanks) + + " ranks, decrease the number of ranks or increase the domain."); } Kokkos::resize(dLocalDomains_m, nRanks); @@ -105,7 +115,7 @@ namespace ippl { detail::Partitioner partitioner; partitioner.split(domain, hLocalDomains_m, isParallel, nRanks); - findNeighbors(); + findNeighbors(nghost); Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m); @@ -288,7 +298,7 @@ namespace ippl { // 0 - touching the lower axis value // 1 - touching the upper axis value // 2 - parallel to the axis - if (intersect[d].length() == 1) { + if (intersect[d].length() == static_cast(nghost)) { if (gnd[d].first() != intersect[d].first()) { index += digit; } diff --git a/src/FieldLayout/SubFieldLayout.hpp b/src/FieldLayout/SubFieldLayout.hpp index 53ee24c11..50a83f9e2 100644 --- a/src/FieldLayout/SubFieldLayout.hpp +++ b/src/FieldLayout/SubFieldLayout.hpp @@ -102,7 +102,7 @@ namespace ippl { } } - this->findNeighbors(); + this->findNeighbors(this->nghost_m); Kokkos::deep_copy(this->dLocalDomains_m, this->hLocalDomains_m); diff --git a/src/Ippl.h b/src/Ippl.h index 174c72a29..a606c655c 100644 --- a/src/Ippl.h +++ b/src/Ippl.h @@ -10,7 +10,6 @@ #include "Types/IpplTypes.h" #include "Utility/Inform.h" -#include "Utility/ParallelDispatch.h" #include "Communicate/Communicator.h" #include "Communicate/Environment.h" @@ -46,6 +45,8 @@ namespace ippl { } // namespace detail } // namespace ippl +#include "Utility/ParallelDispatch.h" + // FIMXE remove (only for backwards compatibility) #include "IpplCore.h" diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h index 3d41b9029..7a720475f 100644 --- a/src/Particle/ParticleAttrib.h +++ b/src/Particle/ParticleAttrib.h @@ -20,8 +20,12 @@ #include "Expression/IpplExpressions.h" +#ifdef IPPL_ENABLE_FFT +#include "FFT/FFT.h" +#endif #include "Interpolation/CIC.h" #include "Particle/ParticleAttribBase.h" +#include "Particle/SortBuffer.h" namespace ippl { @@ -73,32 +77,46 @@ namespace ippl { void unpack(size_type) override; void serialize(detail::Archive& ar, size_type nsends) override { - ar.serialize(buf_m, nsends); + ar.serialize(dview_m, nsends); + } + + void serialize(detail::Archive& ar, const hash_type& hash, + size_type nsends) override { + ar.serialize(dview_m, hash, nsends); } void deserialize(detail::Archive& ar, size_type nrecvs) override { ar.deserialize(buf_m, nrecvs); } - virtual ~ParticleAttrib() = default; - + void deserialize(detail::Archive& ar, size_type offset, + size_type nrecvs) override { + this->resize(offset + nrecvs); + ar.deserialize(dview_m, offset, nrecvs); + } + + KOKKOS_INLINE_FUNCTION virtual ~ParticleAttrib() = default; + size_type size() const override { return dview_m.extent(0); } - + size_type packedSize(const size_type count) const override { return count * sizeof(value_type); } - - void resize(size_type n) { Kokkos::resize(dview_m, n); } /*! - * @brief Reallocate the underlying view with a new size. + * @brief Resize the underlying view, preserving existing entries on grow. * - * This function reallocates the device view to a new size. Existing data is - * discarded and should not be relied upon after this call. Note that this function does not - * apply overallocation. For use from outside, call `ParticleAttrib::alloc(size_type)` - * instead. + * Capacity-only operation; does not change the live particle count. + * Overallocation is the caller's responsibility (`alloc()` / `create()` + * apply it when sizing this view). + */ + void resize(size_type n) { Kokkos::resize(dview_m, n); } + + /*! + * @brief Reallocate the underlying view, discarding existing entries. * - * @param n The new size to allocate in the internal view. + * Capacity-only operation. Does not apply the default overallocation + * factor — call `alloc()` instead from the outside. */ void realloc(size_type n) { Kokkos::realloc(dview_m, n); } @@ -112,11 +130,22 @@ namespace ippl { KOKKOS_INLINE_FUNCTION T& operator()(const size_t i) const { return dview_m(i); } - view_type& getView() { return dview_m; } - - const view_type& getView() const { return dview_m; } + /*! + * Returns a subview of the underlying storage covering only the live + * particle range [0, size()). + */ + view_type getView() { + return Kokkos::subview(dview_m, + Kokkos::make_pair(size_type(0), + static_cast(*(this->localNum_mp)))); + } + const view_type getView() const { + return Kokkos::subview(dview_m, + Kokkos::make_pair(size_type(0), + static_cast(*(this->localNum_mp)))); + } - host_mirror_type getHostMirror() const { return Kokkos::create_mirror(dview_m); } + host_mirror_type getHostMirror() const { return Kokkos::create_mirror(getView()); } void set_name(const std::string& name_) override { size_t len = name_.size(); @@ -227,9 +256,21 @@ namespace ippl { void internalCopy(const hash_type& indices) override; private: - view_type dview_m; - view_type buf_m; + view_type dview_m{"ParticleAttrib::dview", 0}; + view_type buf_m{"ParticleAttrib::buf", 0}; }; + + namespace detail { + template + struct AttribTraits; + + template + struct AttribTraits> { + using value_type = T; + using view_type = + std::decay_t>().getView())>; + }; + } // namespace detail } // namespace ippl #include "Particle/ParticleAttrib.hpp" diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 1422bb9dd..955f83db5 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -17,8 +17,15 @@ #include "Communicate/DataTypes.h" +#include "Utility/BufferView.h" #include "Utility/IpplTimings.h" +#ifdef IPPL_ENABLE_FFT +#include "FFT/FFT.h" +#endif +#include "Particle/ParticleSort.h" +#include "Particle/SortBuffer.h" + namespace ippl { template @@ -71,17 +78,12 @@ namespace ippl { Kokkos::parallel_for( "ParticleAttrib::pack()", policy_type(0, size), KOKKOS_LAMBDA(const size_t i) { buf(i) = dview(hash(i)); }); - Kokkos::fence(); } template void ParticleAttrib::unpack(size_type nrecvs) { - auto size = dview_m.extent(0); size_type required = *(this->localNum_mp) + nrecvs; - if (size < required) { - int overalloc = Comm->getDefaultOverallocation(); - this->resize(required * overalloc); - } + this->resize(required); size_type count = *(this->localNum_mp); auto buf = buf_m; @@ -94,7 +96,6 @@ namespace ippl { } template - // KOKKOS_INLINE_FUNCTION ParticleAttrib& ParticleAttrib::operator=(T x) { auto dview = dview_m; using policy_type = Kokkos::RangePolicy; @@ -106,7 +107,6 @@ namespace ippl { template template - // KOKKOS_INLINE_FUNCTION ParticleAttrib& ParticleAttrib::operator=( detail::Expression const& expr) { using capture_type = detail::CapturedExpression; @@ -147,11 +147,11 @@ namespace ippl { const NDIndex& lDom = layout.getLocalNDIndex(); const int nghost = f.getNghost(); - //using policy_type = Kokkos::RangePolicy; const bool useHashView = hash_array.extent(0) > 0; if (useHashView && (iteration_policy.end() > hash_array.extent(0))) { Inform m("scatter"); - m << "Hash array was passed to scatter, but size does not match iteration policy." << endl; + m << "Hash array was passed to scatter, but size does not match iteration policy." + << endl; ippl::Comm->abort(); } auto dview = dview_m; @@ -159,10 +159,8 @@ namespace ippl { Kokkos::parallel_for( "ParticleAttrib::scatter", iteration_policy, KOKKOS_LAMBDA(const size_t idx) { - // map index to possible hash_map size_t mapped_idx = useHashView ? hash_array(idx) : idx; - // find nearest grid point vector_type l = (ppview(mapped_idx) - origin) * invdx + 0.5; Vector index = l; Vector whi = l - index; @@ -170,7 +168,6 @@ namespace ippl { Vector args = index - lDom.first() + nghost; - // scatter const value_type& val = dview(mapped_idx); detail::scatterToField(std::make_index_sequence<1 << Field::dim>{}, view, wlo, whi, args, val); @@ -219,7 +216,6 @@ namespace ippl { Kokkos::parallel_for( "ParticleAttrib::gather", policy_type(0, *(this->localNum_mp)), KOKKOS_LAMBDA(const size_t idx) { - // find nearest grid point vector_type l = (ppview(idx) - origin) * invdx + 0.5; Vector index = l; Vector whi = l - index; @@ -227,9 +223,8 @@ namespace ippl { Vector args = index - lDom.first() + nghost; - // gather - value_type gathered = detail::gatherFromField(std::make_index_sequence<1 << Field::dim>{}, - view, wlo, whi, args); + value_type gathered = detail::gatherFromField( + std::make_index_sequence<1 << Field::dim>{}, view, wlo, whi, args); if (addToAttribute) { dview(idx) += gathered; } else { @@ -240,10 +235,8 @@ namespace ippl { } template - void ParticleAttrib::applyPermutation( - const hash_type& permutation) { - - const auto view = this->getView(); + void ParticleAttrib::applyPermutation(const hash_type& permutation) { + const auto view = this->getView(); // trimmed to localNum_mp const auto size = this->getParticleCount(); view_type temp("copy", size); @@ -255,31 +248,28 @@ namespace ippl { Kokkos::fence(); - Kokkos::deep_copy(Kokkos::subview(view, Kokkos::make_pair(0, size)), temp); + Kokkos::deep_copy(view, temp); } - template - void ParticleAttrib::internalCopy( - const hash_type &indices) { + template + void ParticleAttrib::internalCopy(const hash_type& indices) { auto copySize = indices.size(); - create(copySize); - auto view = this->getView(); - const auto size = this->getParticleCount(); + // Snapshot the current count BEFORE create() increments localNum_mp. + const size_type oldSize = *(this->localNum_mp); - using policy_type = Kokkos::RangePolicy; + create(copySize); // localNum_mp becomes oldSize + copySize + + auto view = this->getView(); Kokkos::parallel_for( - "Copy to temp", policy_type(0, copySize), - KOKKOS_LAMBDA(const size_type &i) { - view(size + i) = view(i); - }); + "internalCopy", Kokkos::RangePolicy(0, copySize), + KOKKOS_LAMBDA(const size_type& i) { view(oldSize + i) = view(indices(i)); }); Kokkos::fence(); } /* - * Non-class function - * + * Non-class functions */ /** @@ -287,7 +277,7 @@ namespace ippl { * * This overload preserves legacy functionality by providing a default iteration policy. * It calls the member scatter() with a default Kokkos::RangePolicy. - * + * * @note The default behaviour is to scatter all particles without any custom index mapping. * * @tparam Attrib1 The type of the particle attribute. @@ -298,19 +288,21 @@ namespace ippl { * @param f The field onto which the data is scattered. * @param pp The ParticleAttrib representing particle positions. */ - template > + template > inline void scatter(const Attrib1& attrib, Field& f, const Attrib2& pp) { - attrib.scatter(f, pp, policy_type(0, attrib.getParticleCount())); + attrib.scatter(f, pp, policy_type(0, attrib.getParticleCount())); } /** - * @brief Non-class interface for scattering with a custom iteration policy and optional index array. + * @brief Non-class interface for scattering with a custom iteration policy and optional index + * array. * * This overload allows the caller to specify a custom `Kokkos::range_policy` and an optional * `ippl::hash_type` array. It forwards the parameters to the member scatter() function. - * - * @note See ParticleAttrib::scatter() for more information on the custom iteration functionality. + * + * @note See ParticleAttrib::scatter() for more information on the custom iteration + * functionality. * * @tparam Attrib1 The type of the particle attribute. * @tparam Field The type of the field. @@ -322,33 +314,34 @@ namespace ippl { * @param iteration_policy A custom `Kokkos::range_policy` defining the iteration range. * @param hash_array An optional `ippl::hash_type` array for index mapping. */ - template > - inline void scatter(const Attrib1& attrib, Field& f, const Attrib2& pp, - policy_type iteration_policy, typename Attrib1::hash_type hash_array = {}) { + template > + inline void scatter(const Attrib1& attrib, Field& f, const Attrib2& pp, + policy_type iteration_policy, + typename Attrib1::hash_type hash_array = {}) { attrib.scatter(f, pp, iteration_policy, hash_array); } /** * @brief Non-class interface for gathering field data into a particle attribute. - * + * * This interface calls the member ParticleAttrib::gather() function with the provided * parameters and preserving legacy behavior by assigning `addToAttribute` a default value. - * + * * @note See ParticleAttrib::gather() for more information on the behavior of `addToAttribute`. - * + * * @tparam Attrib1 The type of the particle attribute. * @tparam Field The type of the field. * @tparam Attrib2 The type of the particle position attribute. * @param attrib The particle attribute to gather data into. * @param f The field from which data is gathered. * @param pp The ParticleAttrib representing particle positions. - * @param addToAttribute If true, the gathered field value is added to the current attribute value; - * otherwise, the attribute value is overwritten. + * @param addToAttribute If true, the gathered field value is added to the current attribute + * value; otherwise, the attribute value is overwritten. */ template - inline void gather(Attrib1& attrib, Field& f, const Attrib2& pp, - const bool addToAttribute = false) { + inline void gather(Attrib1& attrib, Field& f, const Attrib2& pp, + const bool addToAttribute = false) { attrib.gather(f, pp, addToAttribute); } @@ -374,4 +367,5 @@ namespace ippl { DefineParticleReduction(Max, max, if (myVal > valL) valL = myVal, std::greater) DefineParticleReduction(Min, min, if (myVal < valL) valL = myVal, std::less) DefineParticleReduction(Prod, prod, valL *= myVal, std::multiplies) + } // namespace ippl diff --git a/src/Particle/ParticleAttribBase.h b/src/Particle/ParticleAttribBase.h index 4dea4ac8e..4526f5dbe 100644 --- a/src/Particle/ParticleAttribBase.h +++ b/src/Particle/ParticleAttribBase.h @@ -73,19 +73,22 @@ namespace ippl { virtual void unpack(size_type) = 0; - virtual void serialize(Archive& ar, size_type nsends) = 0; - + virtual void serialize(Archive& ar, size_type nsends) = 0; + virtual void serialize(detail::Archive& ar, const hash_type& hash, + size_type nsends) = 0; virtual void deserialize(Archive& ar, size_type nrecvs) = 0; + virtual void deserialize(detail::Archive& ar, size_type offset, + size_type nrecvs) = 0; virtual size_type size() const = 0; - virtual ~ParticleAttribBase() = default; + KOKKOS_INLINE_FUNCTION virtual ~ParticleAttribBase() = default; void setParticleCount(size_type& num) { localNum_mp = # } size_type getParticleCount() const { return *localNum_mp; } virtual void applyPermutation(const hash_type&) = 0; - virtual void internalCopy(const hash_type&) = 0; + virtual void internalCopy(const hash_type&) = 0; protected: const size_type* localNum_mp; diff --git a/src/Particle/ParticleBase.h b/src/Particle/ParticleBase.h index 9fbed2faa..9793da2a9 100644 --- a/src/Particle/ParticleBase.h +++ b/src/Particle/ParticleBase.h @@ -299,9 +299,18 @@ namespace ippl { /* This function does not alter the totalNum_m member function. It should only be called * during the update function where we know the number of particles remains the same. */ + template + void internalDestroy(const F& invalid_functor, const size_type destroyNum); template void internalDestroy(const Kokkos::View& invalid, - const size_type destroyNum); + const size_type destroyNum) { + using view_type = Kokkos::View; + using memory_space = typename view_type::memory_space; + using execution_space = typename view_type::execution_space; + internalDestroy( + KOKKOS_LAMBDA(size_t i) { return invalid(i); }, destroyNum); + } /*! * Sends particles to another rank @@ -316,6 +325,9 @@ namespace ippl { void sendToRank(int rank, int tag, std::vector& requests, const HashType& hash); + template + MPI_Request sendToRank(int rank, int tag, const HashType& hash); + /*! * Receives particles from another rank * @param rank the source rank @@ -324,6 +336,9 @@ namespace ippl { */ void recvFromRank(int rank, int tag, size_type nRecvs); + std::pair> postRecvFromRank(int rank, int tag, + size_type nRecvs); + /*! * Serialize to do MPI calls. * @param ar archive diff --git a/src/Particle/ParticleBase.hpp b/src/Particle/ParticleBase.hpp index a71a917d6..fbbe72638 100644 --- a/src/Particle/ParticleBase.hpp +++ b/src/Particle/ParticleBase.hpp @@ -62,8 +62,6 @@ namespace ippl { addAttribute(ID); } addAttribute(R); - ID.set_name("ID"); - R.set_name("position"); } template @@ -175,9 +173,9 @@ namespace ippl { } template - template - void ParticleBase::internalDestroy( - const Kokkos::View& invalid, const size_type destroyNum) { + template + void ParticleBase::internalDestroy(const F& invalid_functor, + const size_type destroyNum) { PAssert(destroyNum <= localNum_m); // If there aren't any particles to delete, do nothing @@ -193,19 +191,18 @@ namespace ippl { return; } - using view_type = Kokkos::View; - using memory_space = typename view_type::memory_space; - using execution_space = typename view_type::execution_space; - using policy_type = Kokkos::RangePolicy; - auto& locDeleteIndex = deleteIndex_m.get(); - auto& locKeepIndex = keepIndex_m.get(); + using policy_type = Kokkos::RangePolicy; + auto& locDeleteIndex = deleteIndex_m.get(); + auto& locKeepIndex = keepIndex_m.get(); - // Resize buffers, if necessary + // Resize buffers per-memory-space. The previous code mistakenly used + // the outer `memory_space` template parameter inside the per-space + // lambda, leaving every other space's del/keep buffers undersized. detail::runForAllSpaces([&]() { - if (attributes_m.template get().size() > 0) { + if (attributes_m.template get().size() > 0) { int overalloc = Comm->getDefaultOverallocation(); - auto& del = deleteIndex_m.get(); - auto& keep = keepIndex_m.get(); + auto& del = deleteIndex_m.template get(); + auto& keep = keepIndex_m.template get(); if (del.size() < destroyNum) { Kokkos::realloc(del, destroyNum * overalloc); Kokkos::realloc(keep, destroyNum * overalloc); @@ -220,10 +217,10 @@ namespace ippl { Kokkos::parallel_scan( "Scan in ParticleBase::destroy()", policy_type(0, localNum_m - destroyNum), KOKKOS_LAMBDA(const size_t i, int& idx, const bool final) { - if (final && invalid(i)) { + if (final && invalid_functor(i)) { locDeleteIndex(idx) = i; } - if (invalid(i)) { + if (invalid_functor(i)) { idx += 1; } }); @@ -245,10 +242,10 @@ namespace ippl { "Second scan in ParticleBase::destroy()", Kokkos::RangePolicy(localNum_m - destroyNum, localNum_m), KOKKOS_LAMBDA(const size_t i, int& idx, const bool final) { - if (final && !invalid(i)) { + if (final && !invalid_functor(i)) { locKeepIndex(idx) = i; } - if (!invalid(i)) { + if (!invalid_functor(i)) { idx += 1; } }); @@ -280,27 +277,39 @@ namespace ippl { template template - void ParticleBase::sendToRank(int rank, int tag, - std::vector& requests, - const HashType& hash) { - size_type nSends = hash.size(); - requests.resize(requests.size() + 1); + MPI_Request ParticleBase::sendToRank(int rank, int tag, const HashType& hash) { + size_type nSends = hash.size(); + MPI_Request request = MPI_REQUEST_NULL; auto hashes = hash_container_type(hash, [&]() { return attributes_m.template get().size() > 0; }); - pack(hashes); detail::runForAllSpaces([&]() { size_type bufSize = packedSize(nSends); if (bufSize == 0) { return; } - auto buf = Comm->getBuffer(bufSize); - Comm->isend(rank, tag++, *this, *buf, requests.back(), nSends); + forAllAttributes([&](Attribute& att) { + att->serialize(*buf, hashes.template get(), nSends); + }); + + Comm->isend(rank, tag++, *buf, request); buf->resetWritePos(); }); + return request; + } + + template + template + void ParticleBase::sendToRank(int rank, int tag, + std::vector& requests, + const HashType& hash) { + auto hashes = hash_container_type(hash, [&]() { + return attributes_m.template get().size() > 0; + }); + requests.push_back(sendToRank(rank, tag, hash)); } template @@ -313,10 +322,46 @@ namespace ippl { auto buf = Comm->getBuffer(bufSize); - Comm->recv(rank, tag++, *this, *buf, bufSize, nRecvs); + Comm->recv(rank, tag++, *buf, bufSize); + forAllAttributes([&](Attribute& att) { + att->deserialize(*buf, localNum_m, nRecvs); + }); + buf->resetReadPos(); }); - unpack(nRecvs); + localNum_m += nRecvs; + } + + template + std::pair> + ParticleBase::postRecvFromRank(int rank, int tag, size_type nRecvs) { + MPI_Request request = MPI_REQUEST_NULL; + + // Collect (buf, nRecvs) per memory space for deferred deserialization + auto deferred = std::make_shared>>(); + + detail::runForAllSpaces([&]() { + size_type bufSize = packedSize(nRecvs); + if (bufSize == 0) + return; + + auto buf = Comm->getBuffer(bufSize); + Comm->irecv(rank, tag++, *buf, request, bufSize); + + deferred->push_back([this, buf, nRecvs](size_type offset) { + forAllAttributes([&](Attribute& att) { + att->deserialize(*buf, offset, nRecvs); + }); + buf->resetReadPos(); + }); + }); + + // Finalize takes the offset (localNum_m after destruction) and advances it + return {request, [this, deferred, nRecvs](size_type offset) { + for (auto& fn : *deferred) + fn(offset); + localNum_m += nRecvs; + }}; } template diff --git a/src/Particle/ParticleSort.h b/src/Particle/ParticleSort.h new file mode 100644 index 000000000..f5eaa3336 --- /dev/null +++ b/src/Particle/ParticleSort.h @@ -0,0 +1,331 @@ +#ifndef IPPL_PARTICLE_SORT_H +#define IPPL_PARTICLE_SORT_H + +#include + +#include +#include +#include +#include +#include + +#include "Particle/SortBuffer.h" + +#ifdef KOKKOS_ENABLE_CUDA +#include +#endif + +#ifdef KOKKOS_ENABLE_HIP +#include +#endif + +namespace ippl { + namespace detail { + + /** + * @brief Validate that all permutation indices are in bounds + */ + template + void validatePermutation(const IndexView& permute, size_t n, + const char* label = "sortParticles") { + using size_type = size_t; + + size_type num_invalid = 0; + Kokkos::parallel_reduce( + label, Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(size_type i, size_type & count) { + if (permute(i) >= n) + count++; + }, + num_invalid); + Kokkos::fence(); + + if (num_invalid > 0) { + size_type first_invalid_idx = n; + Kokkos::parallel_reduce( + "find_first_invalid", Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(size_type i, size_type & first_idx) { + if (permute(i) >= n && i < first_idx) + first_idx = i; + }, + Kokkos::Min(first_invalid_idx)); + Kokkos::fence(); + + auto permute_host = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, permute); + + throw std::runtime_error(std::string(label) + ": Found " + + std::to_string(num_invalid) + + " invalid permutation indices. First invalid: permute[" + + std::to_string(first_invalid_idx) + + "] = " + std::to_string(permute_host(first_invalid_idx)) + + " (n = " + std::to_string(n) + ")"); + } + } + + /** + * @brief Compute Morton code (Z-order curve) for spatial sorting + */ + template + KOKKOS_INLINE_FUNCTION uint64_t computeMortonCode(const Vector& position, + const Vector& origin, + const Vector& invdx, + const Vector& ngrid) { + uint64_t morton = 0; + constexpr int bits_per_dim = 21; // 21*3 = 63 bits, safe for 3D + + uint32_t gridIndices[Dim]; + for (unsigned d = 0; d < Dim; ++d) { + T sx = (position[d] - origin[d]) * invdx[d]; + sx -= ngrid[d] * Kokkos::floor(sx / ngrid[d]); // wrap to [0, ngrid) + gridIndices[d] = static_cast(sx); + } + + for (int bit = 0; bit < bits_per_dim; ++bit) { + for (unsigned d = 0; d < Dim; ++d) { + if (gridIndices[d] & (1u << bit)) { + morton |= (uint64_t(1) << (bit * Dim + d)); + } + } + } + return morton; + } + + /** + * @brief Functor to compute Morton codes for all particles + */ + template + struct ComputeMortonCodesFunctor { + using size_type = size_t; + + PositionView positions; + KeyView keys; + Vector origin; + Vector invdx; + Vector ngrid; + + KOKKOS_INLINE_FUNCTION void operator()(size_type i) const { + keys(i) = computeMortonCode(positions(i), origin, invdx, ngrid); + } + }; + + // ------------------------------------------------------------------- + // Platform-specific sort implementations + // ------------------------------------------------------------------- + + /** + * @brief Sort particles on host using std::sort on (morton, index) pairs + * + * @return Subview of the buffered permute array, valid until next ensureCapacity + */ + template + Kokkos::View sortParticlesHost( + Kokkos::View*, Kokkos::HostSpace> positions, + const Vector& origin, const Vector& invdx, + const Vector& ngrid, size_t n) { + auto& bufs = ippl::detail::getDefaultBinSortBuffers(); + bufs.ensureCapacity(n, /*n_bins_p1=*/1); + + auto& permute = bufs.permute(); + + // Build (key, index) pairs, sort, then extract permutation + std::vector> pairs(n); + for (size_t i = 0; i < n; ++i) { + pairs[i] = {computeMortonCode(positions(i), origin, invdx, ngrid), + i}; + } + std::sort(pairs.begin(), pairs.end()); // pair has lexicographic < by .first + + for (size_t i = 0; i < n; ++i) { + permute(i) = pairs[i].second; + } + + return permute; + } + +#ifdef KOKKOS_ENABLE_CUDA + /** + * @brief Sort particles on CUDA using CUB DeviceRadixSort with buffered temporaries + * + * @return Subview of the buffered permOut array containing sorted particle indices + */ + template + Kokkos::View sortParticlesCuda( + Kokkos::View*, Kokkos::CudaSpace> positions, + const Vector& origin, const Vector& invdx, + const Vector& ngrid, size_t n) { + using size_type = size_t; + auto& bufs = ippl::detail::getDefaultBinSortBuffers(); + bufs.ensureCapacity(n, /*n_bins_p1=*/1); + + // Step 1: compute Morton codes into binKeys, init permute to identity + auto keys = bufs.binKeys(); + auto permute = bufs.permute(); + + Kokkos::parallel_for( + "MortonSort::ComputeKeys", Kokkos::RangePolicy(0, n), + ComputeMortonCodesFunctor{ + positions, keys, origin, invdx, ngrid}); + + Kokkos::parallel_for( + "MortonSort::InitPermute", Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(size_type i) { permute(i) = i; }); + Kokkos::fence(); + + // Step 2: query temp storage, grow buffer if needed, then sort + auto keys_out = bufs.keysOut(); + auto perm_out = bufs.permOut(); + + size_t temp_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, keys.data(), keys_out.data(), + permute.data(), perm_out.data(), static_cast(n)); + + bufs.ensureTempStorage(temp_bytes); + + cub::DeviceRadixSort::SortPairs(bufs.tempStorage().data(), temp_bytes, keys.data(), + keys_out.data(), permute.data(), perm_out.data(), + static_cast(n)); + Kokkos::fence(); + + return bufs.permOut(); + } +#endif + +#ifdef KOKKOS_ENABLE_HIP + /** + * @brief Sort particles on HIP using rocPRIM radix_sort_pairs with buffered temporaries + * + * @return Subview of the buffered permOut array containing sorted particle indices + */ + template + Kokkos::View sortParticlesHip( + Kokkos::View*, Kokkos::HIPSpace> positions, const Vector& origin, + const Vector& invdx, const Vector& ngrid, size_t n) { + using size_type = size_t; + auto& bufs = ippl::detail::getDefaultBinSortBuffers(); + bufs.ensureCapacity(n, /*n_bins_p1=*/1); + + auto keys = bufs.binKeys(); + auto permute = bufs.permute(); + + Kokkos::parallel_for( + "MortonSort::ComputeKeys", Kokkos::RangePolicy(0, n), + ComputeMortonCodesFunctor{ + positions, keys, origin, invdx, ngrid}); + + Kokkos::parallel_for( + "MortonSort::InitPermute", Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(size_type i) { permute(i) = i; }); + Kokkos::fence(); + + auto keys_out = bufs.keysOut(); + auto perm_out = bufs.permOut(); + + size_t temp_bytes = 0; + std::ignore = rocprim::radix_sort_pairs(nullptr, temp_bytes, keys.data(), keys_out.data(), + permute.data(), perm_out.data(), n, 0, sizeof(uint64_t) * 8, + Kokkos::HIP().hip_stream()); + + bufs.ensureTempStorage(temp_bytes); + + std::ignore = rocprim::radix_sort_pairs(bufs.tempStorage().data(), temp_bytes, keys.data(), + keys_out.data(), permute.data(), perm_out.data(), n, 0, + sizeof(uint64_t) * 8, Kokkos::HIP().hip_stream()); + Kokkos::fence(); + + return bufs.permOut(); + } +#endif + + /** + * @brief Generic sort dispatcher — selects CUDA/HIP/host implementation + */ + template + Kokkos::View sortParticles( + Kokkos::View*, typename ExecSpace::memory_space> positions, + const Vector& origin, const Vector& invdx, + const Vector& ngrid, size_t n) { + using memory_space = typename ExecSpace::memory_space; + +#ifdef KOKKOS_ENABLE_CUDA + if constexpr (std::is_same_v) { + return sortParticlesCuda(positions, origin, invdx, ngrid, n); + } +#endif +#ifdef KOKKOS_ENABLE_HIP + if constexpr (std::is_same_v) { + return sortParticlesHip(positions, origin, invdx, ngrid, n); + } +#endif + if constexpr (std::is_same_v) { + return sortParticlesHost(positions, origin, invdx, ngrid, n); + } else { + // Generic fallback: sort on host, copy result to device buffer + auto positions_host = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, positions); + + Vector ngrid_host; + for (unsigned d = 0; d < Dim; ++d) + ngrid_host[d] = ngrid[d]; + + auto permute_host = + sortParticlesHost(positions_host, origin, invdx, ngrid_host, n); + + auto& bufs = ippl::detail::getDefaultBinSortBuffers(); + bufs.ensureCapacity(n, /*n_bins_p1=*/1); + + Kokkos::deep_copy(Kokkos::subview(bufs.permute(), std::make_pair(size_t(0), n)), + Kokkos::subview(permute_host, std::make_pair(size_t(0), n))); + + return bufs.permute(); + } + } + + // ------------------------------------------------------------------- + // Permutation application helpers + // ------------------------------------------------------------------- + + template + struct ApplyPermutationFunctor { + SrcView src; + DstView dst; + IndexView permute; + KOKKOS_INLINE_FUNCTION void operator()(size_t i) const { dst(i) = src(permute(i)); } + }; + + template + struct ApplyInversePermutationFunctor { + SrcView src; + DstView dst; + IndexView permute; + KOKKOS_INLINE_FUNCTION void operator()(size_t i) const { dst(permute(i)) = src(i); } + }; + + /** + * @brief Reorder data in-place according to permutation + */ + template + void applyPermutation(DataView& data, const IndexView& permute, size_t n) { + DataView temp("temp", n); + Kokkos::parallel_for( + "apply_permutation", Kokkos::RangePolicy(0, n), + ApplyPermutationFunctor{data, temp, permute}); + Kokkos::deep_copy(Kokkos::subview(data, Kokkos::make_pair(size_t(0), n)), temp); + } + + /** + * @brief Scatter src → dst using inverse permutation (dst[permute[i]] = src[i]) + */ + template + void applyInversePermutation(const DataView& src, DataView& dst, const IndexView& permute, + size_t n) { + Kokkos::parallel_for( + "apply_inverse_permutation", Kokkos::RangePolicy(0, n), + ApplyInversePermutationFunctor{src, dst, permute}); + Kokkos::fence(); + } + + } // namespace detail +} // namespace ippl + +#endif // IPPL_PARTICLE_SORT_H \ No newline at end of file diff --git a/src/Particle/ParticleSpatialLayout.h b/src/Particle/ParticleSpatialLayout.h index 568b30753..ed596e180 100644 --- a/src/Particle/ParticleSpatialLayout.h +++ b/src/Particle/ParticleSpatialLayout.h @@ -35,6 +35,19 @@ namespace ippl { + // Controls how the send-count exchange is performed before the particle + // data transfer. All paths assume GPU-aware MPI (device pointers passed + // directly to Isend/Irecv/Alltoall). + enum class CountExchange { + // One-sided RMA: each sender writes its count directly into the + // receiver's window. + RMA, + // Two-sided point-to-point: per-rank Isend/Irecv over device pointers. + P2P, + // Collective Alltoall over device pointers. + Alltoall + }; + /*! * ParticleSpatialLayout class definition. * @tparam T value type @@ -59,9 +72,9 @@ namespace ippl { using size_type = detail::size_type; - public: // constructor: this one also takes a Mesh - ParticleSpatialLayout(FieldLayout&, Mesh&, bool fem = false); + ParticleSpatialLayout(FieldLayout&, Mesh&, bool fem = false, + CountExchange mode = CountExchange::RMA); ParticleSpatialLayout() : detail::ParticleLayout() {} @@ -73,21 +86,33 @@ namespace ippl { template void update(ParticleContainer& pc); - const RegionLayout_t& getRegionLayout() const { return rlayout_m; } + const RegionLayout_t& getRegionLayout() const { return *rlayout_m; } protected: //! The RegionLayout which determines where our particles go. - RegionLayout_t rlayout_m; + std::shared_ptr rlayout_m; //! The FieldLayout containing information on nearest neighbors FieldLayout_t& flayout_m; + //! How counts are exchanged + CountExchange countExchangeMode_; + + // + // RMA Path + // + // Vector keeping track of the recieves from all ranks std::vector nRecvs_m; // MPI RMA window for one-sided communication mpi::rma::Window window_m; + // + // P2P GPU Path + // + locate_type recvCounts_d_; // [nranks] + //! Type of the Kokkos view containing the local regions. using region_view_type = typename RegionLayout_t::view_type; //! Type of a single Region object. @@ -99,6 +124,10 @@ namespace ippl { KOKKOS_INLINE_FUNCTION constexpr static bool positionInRegion( const std::index_sequence&, const vector_type& pos, const region_type& region); + template + KOKKOS_INLINE_FUNCTION constexpr static bool positionInRegionInclusive( + const std::index_sequence&, const vector_type& pos, const region_type& region); + /*! * Evaluates the total number of MPI ranks sharing the spatial nearest neighbors. * @param neighbors structure containing, for every spatial direction, a list of @@ -109,35 +138,59 @@ namespace ippl { public: /*! - * For each particle in the bunch, determine the rank on which it should - * be stored based on its location - * @tparam ParticleContainer the particle container type - * @param pc the particle container - * @param ranks the integer view in which to store the destination ranks - * @param invalid the boolean view in which to store whether each particle - * is invalidated, i.e. needs to be sent to another rank - * @param nSends_dview the view storing per-rank send counts - * @param sends_dview the view storing per-rank send offsets - * @return The total number of invalidated particles + * For each local particle, determine its destination rank, write the + * indices of leaving particles into `sendIds_d_`, populate the + * per-rank send-count and offset buffers, build a compacted list of + * destination ranks, and mark each particle's "is leaving" status in + * `leaving_d_` so the destroy pass doesn't have to recompute it. + * @return number of leaving particles on this rank */ template - std::pair locateParticles(const ParticleContainer& pc, - locate_type& ranks, bool_type& invalid, - locate_type& nSends_dview, - locate_type& sends_dview) const; - - /*! - * @param rank we sent to - * @param ranks a container specifying where a particle at the i-th index should go. - * @param hash a mapping to fill the send buffer contiguously - */ - void fillHash(int rank, const locate_type& ranks, hash_type& hash); - - /*! - * @param rank we sent to - * @param ranks a container specifying where a particle at the i-th index should go. - */ - size_t numberOfSends(int rank, const locate_type& ranks); + size_t locateParticlesPacked(const ParticleContainer& pc); + + private: + // Fixed-size scratch + locate_type rankSendCount_d_; // [nRanks] + locate_type sendOffsets_d_; // [nRanks+1] + hash_type sendIds_d_; // [capacity >= max nInvalid seen] + locate_type cursor_d_; // [nRanks] per-rank insertion cursor + locate_type destRanks_d_; // [nRanks] (compacted list) + bool_type leaving_d_; // [capacity >= max nLocal seen] mask + + // Single scalar on device to count destinations + Kokkos::View nDest_d_; + + // Neigbour cache + locate_type neighbors_d_; // [neighborSize] cached device neighbors list + std::vector neighbors_host_; // flat host copy + bool neighbors_dirty_ = true; + size_t neighbors_capacity_ = 0; + size_type neighbors_used_ = 0; + + // Host mirror buffers + using host_mem_space = Kokkos::HostSpace; + using locate_host_type = typename detail::ViewType::view_type; + + locate_host_type rankSendCount_h_; // [nRanks] (mirror) + locate_host_type sendOffsets_h_; // [nRanks+1] (mirror) + locate_host_type destRanks_h_; // [nRanks] (mirror) + + // Host-side destination list + std::vector destinationRanks_host_; + + // capacities + size_t sendIds_capacity_ = 0; + size_t leaving_capacity_ = 0; + int nRanks_ = 0; + + void initScratch(int nRanks); + void ensureSendCapacity(size_t nInvalid); + void ensureLeavingCapacity(size_t nLocal); + void ensureNeighborsCached(); + + void countExchangeRMA(); + void countExchangeP2P(); + void countExchangeAlltoall(); }; } // namespace ippl diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp index b47529aae..c2bc0093c 100644 --- a/src/Particle/ParticleSpatialLayout.hpp +++ b/src/Particle/ParticleSpatialLayout.hpp @@ -24,55 +24,89 @@ #include #include "Utility/IpplTimings.h" +#include "Utility/ParallelDispatch.h" #include "Communicate/Window.h" namespace ippl { - /*! - * We need this struct since Kokkos parallel_scan only accepts - * one variable of type ReturnType where to perform the reduction operation. - * For more details, see - * https://kokkos.github.io/kokkos-core-wiki/API/core/parallel-dispatch/parallel_scan.html. - */ - struct increment_type { - size_t count[2]; + template + ParticleSpatialLayout::ParticleSpatialLayout(FieldLayout& fl, + Mesh& mesh, bool fem, + CountExchange mode) + : rlayout_m(std::make_shared(fl, mesh, fem)) + , flayout_m(fl) + , countExchangeMode_(mode) { + const int nRanks = Comm->size(); + nRanks_ = nRanks; + initScratch(nRanks); - KOKKOS_FUNCTION void init() { - count[0] = 0; - count[1] = 0; + if (mode == CountExchange::RMA && nRanks > 1) { + nRecvs_m.resize(Comm->size()); + window_m.create(*Comm, nRecvs_m.begin(), nRecvs_m.end()); } + } - KOKKOS_INLINE_FUNCTION increment_type& operator+=(bool* values) { - count[0] += values[0]; - count[1] += values[1]; - return *this; - } + template + void ParticleSpatialLayout::updateLayout(FieldLayout& fl, + Mesh& mesh) { + rlayout_m->changeDomain(fl, mesh); + neighbors_dirty_ = true; + } + + template + void ParticleSpatialLayout::countExchangeRMA() { + std::fill(nRecvs_m.begin(), nRecvs_m.end(), 0); + window_m.fence(0); - KOKKOS_INLINE_FUNCTION increment_type& operator+=(increment_type values) { - count[0] += values.count[0]; - count[1] += values.count[1]; - return *this; + for (int rank : destinationRanks_host_) { + if (rank == Comm->rank()) + continue; + const int* src = &rankSendCount_h_(rank); + window_m.put(src, rank, Comm->rank()); } - }; + + window_m.fence(0); + } template - ParticleSpatialLayout::ParticleSpatialLayout(FieldLayout& fl, - Mesh& mesh, bool fem) - : rlayout_m(fl, mesh, fem) - , flayout_m(fl) - { - nRecvs_m.resize(Comm->size()); - if (Comm->size() > 1) { - window_m.create(*Comm, nRecvs_m.begin(), nRecvs_m.end()); + void ParticleSpatialLayout::countExchangeP2P() { + const int myRank = Comm->rank(); + const int tag = Comm->next_tag(mpi::tag::P_SPATIAL_LAYOUT, mpi::tag::P_LAYOUT_CYCLE); + + // Zero the receive-count buffer on the device + Kokkos::deep_copy(position_execution_space{}, recvCounts_d_, 0); + Kokkos::fence(); + + // Device pointer to send-count array + int* d_sendCounts = rankSendCount_d_.data(); + int* d_recvCounts = recvCounts_d_.data(); + + std::vector reqs; + reqs.reserve(2 * std::max(0, nRanks_ - 1)); + + // Post receives from all ranks (except self) + for (int r = 0; r < nRanks_; ++r) { + if (r == myRank) + continue; + MPI_Irecv(d_recvCounts + r, 1, MPI_INT, r, tag, Comm->getCommunicator(), + &reqs.emplace_back()); } + + for (int r = 0; r < nRanks_; ++r) { + if (r == myRank) + continue; + MPI_Isend(d_sendCounts + r, 1, MPI_INT, r, tag, Comm->getCommunicator(), + &reqs.emplace_back()); + } + + MPI_Waitall(static_cast(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE); } template - void ParticleSpatialLayout::updateLayout(FieldLayout& fl, - Mesh& mesh) { - // flayout_m = fl; - rlayout_m.changeDomain(fl, mesh); + void ParticleSpatialLayout::countExchangeAlltoall() { + MPI_Alltoall(rankSendCount_d_.data(), 1, MPI_INT, recvCounts_d_.data(), 1, MPI_INT, + Comm->getCommunicator()); } template @@ -81,18 +115,20 @@ namespace ippl { /* Apply Boundary Conditions */ static IpplTimings::TimerRef ParticleBCTimer = IpplTimings::getTimer("particleBC"); IpplTimings::startTimer(ParticleBCTimer); - this->applyBC(pc.R, rlayout_m.getDomain()); + this->applyBC(pc.R, rlayout_m->getDomain()); IpplTimings::stopTimer(ParticleBCTimer); /* Update Timer for the rest of the function */ static IpplTimings::TimerRef ParticleUpdateTimer = IpplTimings::getTimer("updateParticle"); IpplTimings::startTimer(ParticleUpdateTimer); - int nRanks = Comm->size(); - if (nRanks < 2) { + if (nRanks_ < 2) { + IpplTimings::stopTimer(ParticleUpdateTimer); return; } + ensureNeighborsCached(); + /* particle MPI exchange: * 1. figure out which particles need to go where -> locateParticles(...) * 2. fill send buffer and send particles @@ -105,69 +141,45 @@ namespace ippl { static IpplTimings::TimerRef locateTimer = IpplTimings::getTimer("locateParticles"); IpplTimings::startTimer(locateTimer); - /* Rank-local number of particles */ - size_type localnum = pc.getLocalNum(); - - /* The indices correspond to the indices of the local particles, - * the values correspond to the ranks to which the particles need to be sent - */ - locate_type particleRanks("particles' MPI ranks", localnum); - - /* The indices are the indices of the particles, - * the boolean values describe whether the particle has left the current rank - * 0 --> particle valid (inside current rank) - * 1 --> particle invalid (left rank) - */ - bool_type invalidParticles("validity of particles", localnum); - - /* The indices are the MPI ranks, - * the values are the number of particles are sent to that rank from myrank - */ - locate_type rankSendCount_dview("rankSendCount Device", nRanks); + const size_type nInvalid = locateParticlesPacked(pc); - /* The indices have no particluar meaning, - * the values are the MPI ranks to which we need to send - */ - locate_type destinationRanks_dview("destinationRanks Device", nRanks); + // Copy metadata to host + size_type nDest = 0; + Kokkos::deep_copy(position_execution_space{}, nDest, nDest_d_); - /* nInvalid is the number of invalid particles - * nDestinationRanks is the number of MPI ranks we need to send to - */ - auto [nInvalid, nDestinationRanks] = locateParticles( - pc, particleRanks, invalidParticles, rankSendCount_dview, destinationRanks_dview); + // destRanks prefix + if (nDest > 0) { + Kokkos::deep_copy( + position_execution_space{}, + Kokkos::subview(destRanks_h_, std::make_pair(size_t(0), size_t(nDest))), + Kokkos::subview(destRanks_d_, std::make_pair(size_t(0), size_t(nDest)))); + } - /* Host space copy of rankSendCount_dview */ - auto rankSendCount_hview = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rankSendCount_dview); + // counts + offsets + Kokkos::deep_copy(position_execution_space{}, rankSendCount_h_, rankSendCount_d_); + Kokkos::deep_copy(position_execution_space{}, sendOffsets_h_, sendOffsets_d_); - /* Host Space copy of destinationRanks_dview */ - auto destinationRanks_hview = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), destinationRanks_dview); + // Build host destination list without allocation + destinationRanks_host_.clear(); + for (size_t i = 0; i < (size_t)nDest; ++i) + destinationRanks_host_.push_back(destRanks_h_(i)); IpplTimings::stopTimer(locateTimer); // 2. fill send buffer and send particles =============================================== // - // 2.1 Remote Memory Access window for one-sided communication + // 2.1 Count Exchange static IpplTimings::TimerRef preprocTimer = IpplTimings::getTimer("sendPreprocess"); IpplTimings::startTimer(preprocTimer); - std::fill(nRecvs_m.begin(), nRecvs_m.end(), 0); - - window_m.fence(0); - - // Prepare RMA window for the ranks we need to send to - for (size_t ridx = 0; ridx < nDestinationRanks; ridx++) { - int rank = destinationRanks_hview[ridx]; - if (rank == Comm->rank()) { - // we do not need to send to ourselves - continue; - } - const int* src_ptr = &rankSendCount_hview(rank); - window_m.put(src_ptr, rank, Comm->rank()); + if (countExchangeMode_ == CountExchange::RMA) { + countExchangeRMA(); + } else if (countExchangeMode_ == CountExchange::P2P) { + countExchangeP2P(); + } else { + countExchangeAlltoall(); } - window_m.fence(0); IpplTimings::stopTimer(preprocTimer); @@ -176,55 +188,96 @@ namespace ippl { static IpplTimings::TimerRef sendTimer = IpplTimings::getTimer("particleSend"); IpplTimings::startTimer(sendTimer); - std::vector requests(0); + std::vector requests; + requests.reserve(destinationRanks_host_.size()); int tag = Comm->next_tag(mpi::tag::P_SPATIAL_LAYOUT, mpi::tag::P_LAYOUT_CYCLE); - for (size_t ridx = 0; ridx < nDestinationRanks; ridx++) { - int rank = destinationRanks_hview[ridx]; - if (rank == Comm->rank()) { + for (int rank : destinationRanks_host_) { + if (rank == Comm->rank()) continue; - } - hash_type hash("hash", rankSendCount_hview(rank)); - fillHash(rank, particleRanks, hash); - pc.sendToRank(rank, tag, requests, hash); + const size_type count = static_cast(rankSendCount_h_(rank)); + if (count == 0) + continue; + const size_type begin = static_cast(sendOffsets_h_(rank)); + auto ids_sub = + Kokkos::subview(sendIds_d_, std::make_pair((size_t)begin, (size_t)(begin + count))); + requests.push_back(pc.sendToRank(rank, tag, ids_sub)); } IpplTimings::stopTimer(sendTimer); - // 3. Internal destruction of invalid particles ======================================= // - - static IpplTimings::TimerRef destroyTimer = IpplTimings::getTimer("particleDestroy"); - IpplTimings::startTimer(destroyTimer); - - pc.internalDestroy(invalidParticles, nInvalid); - Kokkos::fence(); - - IpplTimings::stopTimer(destroyTimer); - - // 4. Receive Particles ================================================================ // + // 2.3 Post receives static IpplTimings::TimerRef recvTimer = IpplTimings::getTimer("particleRecv"); IpplTimings::startTimer(recvTimer); - for (int rank = 0; rank < nRanks; ++rank) { - if (nRecvs_m[rank] > 0) { - pc.recvFromRank(rank, tag, nRecvs_m[rank]); + std::vector> recvList; + + if (countExchangeMode_ == CountExchange::RMA) { + for (int rank = 0; rank < nRanks_; ++rank) { + if (nRecvs_m[rank] > 0) { + recvList.push_back({rank, nRecvs_m[rank]}); + } } + } else { + auto recvCounts_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), recvCounts_d_); + for (int rank = 0; rank < nRanks_; ++rank) { + if (recvCounts_h(rank) > 0) { + recvList.push_back({rank, recvCounts_h(rank)}); + } + } + } + + std::vector recvRequests(recvList.size(), MPI_REQUEST_NULL); + std::vector> finalizers(recvList.size()); + + for (size_t i = 0; i < recvList.size(); ++i) { + auto [rank, count] = recvList[i]; + auto [req, fin] = pc.postRecvFromRank(rank, tag, count); + recvRequests[i] = req; + finalizers[i] = std::move(fin); } + IpplTimings::stopTimer(recvTimer); - IpplTimings::startTimer(sendTimer); + // 3. Internal destruction of invalid particles ======================================= // - if (requests.size() > 0) { - MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); + static IpplTimings::TimerRef destroyTimer = IpplTimings::getTimer("particleDestroy"); + IpplTimings::startTimer(destroyTimer); + + // locateParticlesPacked already wrote the per-particle "is leaving" + // mask into leaving_d_. Reuse it instead of re-running the full + // region search inside the destroy predicate. + auto leaving = leaving_d_; + pc.template internalDestroy( + KOKKOS_LAMBDA(size_t i) { return leaving(i); }, nInvalid); + Kokkos::fence(); + + IpplTimings::stopTimer(destroyTimer); + + requests.insert(requests.end(), recvRequests.begin(), recvRequests.end()); + if (!requests.empty()) { + MPI_Waitall(static_cast(requests.size()), requests.data(), MPI_STATUSES_IGNORE); } - IpplTimings::stopTimer(sendTimer); Comm->freeAllBuffers(); + // 5. Deserialize + for (auto& finalize : finalizers) + finalize(pc.getLocalNum()); + IpplTimings::stopTimer(ParticleUpdateTimer); } + template + template + KOKKOS_INLINE_FUNCTION constexpr bool + ParticleSpatialLayout::positionInRegionInclusive( + const std::index_sequence&, const vector_type& pos, const region_type& region) { + return ((pos[Idx] >= region[Idx].min()) && ...) && ((pos[Idx] <= region[Idx].max()) && ...); + }; + template template KOKKOS_INLINE_FUNCTION constexpr bool @@ -248,220 +301,204 @@ namespace ippl { return totalSize; } - /** - * @brief This function determines to which rank particles need to be sent after the iteration - * step. It starts by first scanning direct rank neighbors, and only does a global scan if there - * are still unfound particles. It then calculates how many particles need to be sent to each - * rank and how many ranks are sent to in total. - * - * @param pc Particle Container - * @param ranks A vector the length of the number of particles on the current rank, where - * each value refers to the new rank of the particle - * @param invalid A vector marking the particles that need to be sent away, and thus - * locally deleted - * @param nSends_dview Device view the length of number of ranks, where each value determines - * the number of particles sent to that rank from the current rank - * @param sends_dview Device view for the number of ranks that are sent to from current rank - * - * @return tuple with the number of particles sent away and the number of ranks sent to - */ template template - std::pair - ParticleSpatialLayout::locateParticles( - const ParticleContainer& pc, locate_type& ranks, bool_type& invalid, - locate_type& nSends_dview, locate_type& sends_dview) const { - auto positions = pc.R.getView(); - region_view_type Regions = rlayout_m.getdLocalRegions(); - - using policy_type = Kokkos::RangePolicy; - using mdrange_type = Kokkos::MDRangePolicy, position_execution_space>; + size_t ParticleSpatialLayout::locateParticlesPacked( + const ParticleContainer& pc) { + const int nRanks = Comm->size(); + const size_type myRank = Comm->rank(); - size_type myRank = Comm->rank(); + ensureNeighborsCached(); - const auto is = std::make_index_sequence{}; - - const neighbor_list& neighbors = flayout_m.getNeighbors(); - - /// outsideIds: Container of particle IDs that travelled outside of the neighborhood. - locate_type outsideIds("Particles outside of neighborhood", size_type(pc.getLocalNum())); + auto positions = pc.R.getView(); + region_view_type Regions = rlayout_m->getdLocalRegions(); + const auto is = std::make_index_sequence{}; - /// outsideCount: Tracks the number of particles that travelled outside of the neighborhood. - size_type outsideCount = 0; - /// invalidCount: Tracks the number of particles that need to be sent to other ranks. - size_type invalidCount = 0; + using exec_space = position_execution_space; + using policy_type = Kokkos::RangePolicy; - /// neighborSize: Size of a neighborhood in D dimentions. - const size_type neighborSize = getNeighborSize(neighbors); + // Reset small device buffers + Kokkos::deep_copy(rankSendCount_d_, size_type(0)); + Kokkos::deep_copy(cursor_d_, size_type(0)); + Kokkos::deep_copy(nDest_d_, size_type(0)); - /// neighbors_view: Kokkos view with the IDs of the neighboring MPI ranks. - locate_type neighbors_view("Nearest neighbors IDs", neighborSize); + const size_type neighbors_used = neighbors_used_; + auto& neighbours_d = neighbors_d_; - /* red_val: Used to reduce both the number of invalid particles and the number of particles - * outside of the neighborhood (Kokkos::parallel_scan doesn't allow multiple reduction - * values, so we use the helper class increment_type). First element updates InvalidCount, - * second one updates outsideCount. - */ - increment_type red_val; - red_val.init(); + // Destination rank computation (no per-particle storage) + auto destRankOf = KOKKOS_LAMBDA(const size_t i)->size_type { + if (positionInRegion(is, positions(i), Regions(myRank))) + return myRank; - auto neighbors_mirror = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), neighbors_view); - - size_t k = 0; + for (int j = 0; j < static_cast(neighbors_used); ++j) { + const int r = neighbours_d(j); + if (positionInRegion(is, positions(i), Regions(r))) + return r; + } - for (const auto& componentNeighbors : neighbors) { - for (size_t j = 0; j < componentNeighbors.size(); ++j) { - neighbors_mirror(k) = componentNeighbors[j]; - // std::cout << "Neighbor: " << neighbors_mirror(k) << std::endl; - k++; + // slow-path: global scan + for (int r = 0; r < static_cast(Regions.extent(0)); ++r) { + if (positionInRegion(is, positions(i), Regions(r))) + return r; } - } - Kokkos::deep_copy(neighbors_view, neighbors_mirror); + // Inclusive fallback: catches particles sitting exactly on a region + // lower boundary that the strict > check above missed (e.g. (0,0,0)) + for (int r = 0; r < static_cast(Regions.extent(0)); ++r) { + if (positionInRegionInclusive(is, positions(i), Regions(r))) + return r; + } + return myRank; // truly outside all regions - applyBC should have prevented this + }; - /*! Begin Kokkos loop: - * Step 1: search in current rank - * Step 2: search in neighbors - * Step 3: save information on whether the particle was located - * Step 4: run additional loop on non-located particles - */ - static IpplTimings::TimerRef neighborSearch = IpplTimings::getTimer("neighborSearch"); - IpplTimings::startTimer(neighborSearch); + // Make sure the leaving-mask buffer is large enough; it's reused across + // updates to avoid reallocation. + ensureLeavingCapacity(pc.getLocalNum()); + auto& leaving_d = leaving_d_; - Kokkos::parallel_scan( - "ParticleSpatialLayout::locateParticles()", - policy_type(0, ranks.extent(0)), - KOKKOS_LAMBDA(const size_type i, increment_type& val, const bool final) { - /* Step 1 - * inCurr: True if the particle hasn't left the current MPI rank. - * inNeighbor: True if the particle is found in a neighboring rank. - * found: True either if inCurr = True or inNeighbor = True. - * increment: Helper variable to update red_val. - */ - bool inCurr = false; - bool inNeighbor = false; - bool found = false; - bool increment[2]; - - inCurr = positionInRegion(is, positions(i), Regions(myRank)); - - ranks(i) = inCurr * myRank; - invalid(i) = !inCurr; - found = inCurr || found; - - /// Step 2 - for (size_t j = 0; j < neighbors_view.extent(0); ++j) { - size_type rank = neighbors_view(j); - - inNeighbor = positionInRegion(is, positions(i), Regions(rank)); - - ranks(i) = !(inNeighbor)*ranks(i) + inNeighbor * rank; - found = inNeighbor || found; - } - /// Step 3 - /* isOut: When the last thread has finished the search, checks whether the particle - * has been found either in the current rank or in a neighboring one. Used to avoid - * race conditions when updating outsideIds. - */ - if (final && !found) { - outsideIds(val.count[1]) = i; - } - // outsideIds(val.count[1]) = i * isOut; - increment[0] = invalid(i); - increment[1] = !found; - val += increment; + // Pass 1: compute send counts + nInvalid + per-particle "is leaving" + size_type nInvalid = 0; + auto& rankSendCount_d = rankSendCount_d_; + Kokkos::parallel_reduce( + "PSL::packed_count", policy_type(0, pc.getLocalNum()), + KOKKOS_LAMBDA(const size_t i, size_type& inval) { + const size_type dest = destRankOf(i); + const bool leaves = (dest != myRank); + leaving_d(i) = leaves; + inval += leaves; + if (leaves) + Kokkos::atomic_fetch_add(&rankSendCount_d(dest), size_type(1)); }, - red_val); - + nInvalid); Kokkos::fence(); - invalidCount = red_val.count[0]; - outsideCount = red_val.count[1]; - - IpplTimings::stopTimer(neighborSearch); - - /// Step 4 - static IpplTimings::TimerRef nonNeighboringParticles = - IpplTimings::getTimer("nonNeighboringParticles"); - IpplTimings::startTimer(nonNeighboringParticles); - if (outsideCount > 0) { - Kokkos::parallel_for( - "ParticleSpatialLayout::leftParticles()", - mdrange_type({0, 0}, {outsideCount, Regions.extent(0)}), - KOKKOS_LAMBDA(const size_t i, const size_type j) { - /// pID: (local) ID of the particle that is currently being searched. - size_type pId = outsideIds(i); - - /// inRegion: Checks whether particle pID is inside region j. - bool inRegion = positionInRegion(is, positions(pId), Regions(j)); - if (inRegion) { - ranks(pId) = j; - } - }); - Kokkos::fence(); - } - IpplTimings::stopTimer(nonNeighboringParticles); + // Ensure sendIds capacity after we know nInvalid + ensureSendCapacity(nInvalid); - Kokkos::parallel_for( - "Calculate nSends", policy_type(0, ranks.extent(0)), - KOKKOS_LAMBDA(const size_t i) { - size_type rank = ranks(i); - Kokkos::atomic_fetch_add(&nSends_dview(rank), 1); + // Exclusive scan to offsets (length nRanks+1) + auto& sendOffsets_d = sendOffsets_d_; + Kokkos::parallel_scan( + "PSL::packed_offsets", policy_type(0, (size_t)nRanks + 1), + KOKKOS_LAMBDA(const size_t r, size_type& upd, const bool final) { + if (final) + sendOffsets_d(r) = upd; + if (r < (size_t)nRanks) + upd += rankSendCount_d(r); }); + Kokkos::fence(); - // Number of Ranks we need to send to - Kokkos::View rankSends( - "Number of Ranks we need to send to"); + // Pass 2: fill packed send IDs into sendIds_d_ (prefix [0, nInvalid)) + auto& cursor_d = cursor_d_; + auto& sendIds_d = sendIds_d_; + Kokkos::parallel_for( + "PSL::packed_fill", policy_type(0, pc.getLocalNum()), KOKKOS_LAMBDA(const size_t i) { + const size_type dest = destRankOf(i); + if (dest == myRank) + return; + + const size_type pos = Kokkos::atomic_fetch_add(&cursor_d(dest), size_type(1)); + const size_type base = sendOffsets_d(dest); + + sendIds_d(base + pos) = static_cast(i); + }); + Kokkos::fence(); + // Build destination rank list on device (compact), store length in nDest_d_ + auto& destRanks_d = destRanks_d_; + auto& nDest_d = nDest_d_; Kokkos::parallel_for( - "Calculate sends", policy_type(0, nSends_dview.extent(0)), - KOKKOS_LAMBDA(const size_t rank) { - if (nSends_dview(rank) != 0) { - size_type index = Kokkos::atomic_fetch_add(&rankSends(), 1); - sends_dview(index) = rank; + "PSL::packed_destRanks", policy_type(0, (size_t)nRanks), KOKKOS_LAMBDA(const size_t r) { + if ((size_type)r == myRank) + return; + if (rankSendCount_d(r) > 0) { + const size_type idx = Kokkos::atomic_fetch_add(&nDest_d(), size_type(1)); + destRanks_d(idx) = static_cast(r); } }); - size_type temp; - Kokkos::deep_copy(temp, rankSends); + Kokkos::fence(); - return {invalidCount, temp}; + return nInvalid; } template - void ParticleSpatialLayout::fillHash(int rank, - const locate_type& ranks, - hash_type& hash) { - /* Compute the prefix sum and fill the hash - */ - using policy_type = Kokkos::RangePolicy; - Kokkos::parallel_scan( - "ParticleSpatialLayout::fillHash()", policy_type(0, ranks.extent(0)), - KOKKOS_LAMBDA(const size_t i, int& idx, const bool final) { - if (final) { - if (rank == ranks(i)) { - hash(idx) = i; - } - } + void ParticleSpatialLayout::initScratch(int nRanks) { + Kokkos::realloc(rankSendCount_d_, nRanks); + Kokkos::realloc(sendOffsets_d_, nRanks + 1); + Kokkos::realloc(cursor_d_, nRanks); + Kokkos::realloc(destRanks_d_, nRanks); + Kokkos::realloc(recvCounts_d_, nRanks); + + // scalar counter + nDest_d_ = Kokkos::View("nDest_d"); + + // Host mirrors + Kokkos::realloc(rankSendCount_h_, nRanks); + Kokkos::realloc(sendOffsets_h_, nRanks + 1); + Kokkos::realloc(destRanks_h_, nRanks); + + destinationRanks_host_.clear(); + destinationRanks_host_.reserve(nRanks); + } - if (rank == ranks(i)) { - idx += 1; - } - }); - Kokkos::fence(); + template + void ParticleSpatialLayout::ensureSendCapacity(size_t nInvalid) { + if (nInvalid <= sendIds_capacity_) + return; + + // grow geometrically + size_t newCap = sendIds_capacity_ ? sendIds_capacity_ : size_t(1024); + while (newCap < nInvalid) + newCap *= 2; + + sendIds_capacity_ = newCap; + Kokkos::realloc(sendIds_d_, newCap); } template - size_t ParticleSpatialLayout::numberOfSends( - int rank, const locate_type& ranks) { - size_t nSends = 0; - using policy_type = Kokkos::RangePolicy; - Kokkos::parallel_reduce( - "ParticleSpatialLayout::numberOfSends()", policy_type(0, ranks.extent(0)), - KOKKOS_LAMBDA(const size_t i, size_t& num) { num += size_t(rank == ranks(i)); }, - nSends); - Kokkos::fence(); - return nSends; + void ParticleSpatialLayout::ensureLeavingCapacity(size_t nLocal) { + if (nLocal <= leaving_capacity_) + return; + + size_t newCap = leaving_capacity_ ? leaving_capacity_ : size_t(1024); + while (newCap < nLocal) + newCap *= 2; + + leaving_capacity_ = newCap; + Kokkos::realloc(leaving_d_, newCap); + } + + template + void ParticleSpatialLayout::ensureNeighborsCached() { + if (!neighbors_dirty_) + return; + + const neighbor_list& neighbors = flayout_m.getNeighbors(); + const size_type neighborSize = getNeighborSize(neighbors); + neighbors_used_ = neighborSize; + + if (neighborSize > neighbors_capacity_) { + neighbors_capacity_ = neighborSize; + Kokkos::realloc(neighbors_d_, neighbors_capacity_); + } + + neighbors_host_.clear(); + neighbors_host_.reserve(neighborSize); + + auto neighbors_h = Kokkos::create_mirror_view(neighbors_d_); + size_t k = 0; + for (const auto& comp : neighbors) { + for (size_t j = 0; j < comp.size(); ++j) { + neighbors_h(k) = comp[j]; + neighbors_host_.push_back(comp[j]); + ++k; + } + } + + Kokkos::deep_copy( + Kokkos::subview(neighbors_d_, std::make_pair(size_t(0), size_t(neighborSize))), + Kokkos::subview(neighbors_h, std::make_pair(size_t(0), size_t(neighborSize)))); + + neighbors_dirty_ = false; } } // namespace ippl diff --git a/src/Particle/SortBuffer.h b/src/Particle/SortBuffer.h new file mode 100644 index 000000000..977db9f59 --- /dev/null +++ b/src/Particle/SortBuffer.h @@ -0,0 +1,185 @@ +#ifndef IPPL_SORT_BUFFER_H +#define IPPL_SORT_BUFFER_H + +#include + +#include +#include +#include + +namespace ippl { + namespace detail { + + /** + * @brief Persistent scratch buffers for bin-sort-based particle binning. + * + * Grows on demand, never shrinks. Call finalizeBinSortBuffers() before + * Kokkos::finalize() to release device memory safely. + * + * Buffer layout + * bin_keys uint64_t[n_particles] sort keys (bin index per particle) + * permute size_t [n_particles] identity → sorted particle order + * bin_offsets size_t [n_bins + 1] start of each bin in permute + * keys_out uint64_t[n_particles] CUB radix-sort output (CUDA only) + * perm_out size_t [n_particles] CUB radix-sort output (CUDA only) + * temp_storage char [temp_bytes] CUB device-temp (CUDA only) + */ + template + class BinSortBuffers { + public: + using memory_space = MemorySpace; + using size_type = size_t; + + BinSortBuffers() = default; + BinSortBuffers(const BinSortBuffers&) = delete; + BinSortBuffers& operator=(const BinSortBuffers&) = delete; + BinSortBuffers(BinSortBuffers&&) = default; + BinSortBuffers& operator=(BinSortBuffers&&) = default; + + /** + * @brief Ensure particle-sized buffers are large enough. + * + * @param n_particles Number of particles. + * @param n_bins_p1 Number of bins + 1 (size needed for bin_offsets). + * @param growth Over-allocation factor (default 1.2). + */ + void ensureCapacity(size_type n_particles, size_type n_bins_p1, double growth = 1.2) { + if (n_particles > particle_capacity_) { + const size_type cap = + std::max(static_cast(n_particles * growth), n_particles); + + bin_keys_ = Kokkos::View("bin_keys", cap); + permute_ = Kokkos::View("permute", cap); + keys_out_ = Kokkos::View("keys_out", cap); + perm_out_ = Kokkos::View("perm_out", cap); + + particle_capacity_ = cap; + } + + if (n_bins_p1 > bin_offset_capacity_) { + const size_type cap = + std::max(static_cast(n_bins_p1 * growth), n_bins_p1); + + bin_offsets_ = Kokkos::View("bin_offsets", cap); + bin_offset_capacity_ = cap; + } + } + + /** + * @brief Ensure CUB/rocPRIM temp-storage buffer is large enough. + * @param bytes Required bytes. + */ + void ensureTempStorage(size_t bytes) { + if (bytes <= temp_capacity_) + return; + const size_t cap = static_cast(bytes * 1.1); + temp_storage_ = Kokkos::View("temp_storage", cap); + temp_capacity_ = cap; + } + + // --- accessors ------------------------------------------------------- + Kokkos::View& binKeys() { return bin_keys_; } + Kokkos::View& permute() { return permute_; } + Kokkos::View& binOffsets() { return bin_offsets_; } + Kokkos::View& keysOut() { return keys_out_; } + Kokkos::View& permOut() { return perm_out_; } + Kokkos::View& tempStorage() { return temp_storage_; } + + size_type particleCapacity() const { return particle_capacity_; } + size_type binOffsetCapacity() const { return bin_offset_capacity_; } + size_t tempCapacity() const { return temp_capacity_; } + + size_t memoryUsage() const { + // Sum the bytes actually held by each view rather than + // assuming all of them are populated — keys_out_ / perm_out_ + // are only allocated on the CUDA fast path. + return bin_keys_.span() * sizeof(uint64_t) + + permute_.span() * sizeof(size_type) + + bin_offsets_.span() * sizeof(size_type) + + keys_out_.span() * sizeof(uint64_t) + + perm_out_.span() * sizeof(size_type) + + temp_storage_.span(); + } + + void clear() { + bin_keys_ = {}; + permute_ = {}; + bin_offsets_ = {}; + keys_out_ = {}; + perm_out_ = {}; + temp_storage_ = {}; + particle_capacity_ = 0; + bin_offset_capacity_ = 0; + temp_capacity_ = 0; + } + + private: + Kokkos::View bin_keys_; + Kokkos::View permute_; + Kokkos::View bin_offsets_; + Kokkos::View keys_out_; + Kokkos::View perm_out_; + Kokkos::View temp_storage_; + + size_type particle_capacity_ = 0; + size_type bin_offset_capacity_ = 0; + size_t temp_capacity_ = 0; + }; + + // --------------------------------------------------------------------------- + // Singleton access + // --------------------------------------------------------------------------- + + template + class BinSortBuffersHolder { + public: + static BinSortBuffersHolder& instance() { + static BinSortBuffersHolder h; + return h; + } + + BinSortBuffers& get() { + if (!buffers_) + buffers_ = std::make_unique>(); + return *buffers_; + } + + void finalize() { + if (buffers_) { + buffers_->clear(); + buffers_.reset(); + } + } + + private: + BinSortBuffersHolder() = default; + ~BinSortBuffersHolder() = default; + BinSortBuffersHolder(const BinSortBuffersHolder&) = delete; + BinSortBuffersHolder& operator=(const BinSortBuffersHolder&) = delete; + + std::unique_ptr> buffers_; + }; + + template + BinSortBuffers& getDefaultBinSortBuffers() { + return BinSortBuffersHolder::instance().get(); + } + + /** + * @brief Release all bin-sort buffers for known memory spaces. + * Call this before Kokkos::finalize(). + */ + inline void finalizeBinSortBuffers() { +#ifdef KOKKOS_ENABLE_CUDA + BinSortBuffersHolder::instance().finalize(); +#endif +#ifdef KOKKOS_ENABLE_HIP + BinSortBuffersHolder::instance().finalize(); +#endif + BinSortBuffersHolder::instance().finalize(); + } + + } // namespace detail +} // namespace ippl + +#endif // IPPL_SORT_BUFFER_H \ No newline at end of file diff --git a/src/Utility/BufferView.h b/src/Utility/BufferView.h new file mode 100644 index 000000000..732bf8375 --- /dev/null +++ b/src/Utility/BufferView.h @@ -0,0 +1,229 @@ +#ifndef IPPL_BUFFERVIEW_H +#define IPPL_BUFFERVIEW_H + +#include +#include +#include +#include + +namespace ippl { + + /** + * @brief Helper to compute aligned offset + */ + [[nodiscard]] constexpr size_t alignUp(size_t offset, size_t alignment) { + return (offset + alignment - 1) & ~(alignment - 1); + } + + /** + * + * @tparam T Element type + * @tparam MemorySpace Kokkos memory space + */ + template + class BufferView { + public: + using view_type = Kokkos::View; + + explicit BufferView(size_t count) + : view_m("BufferView", count) {} + + ~BufferView() = default; + + // Non-copyable + BufferView(const BufferView&) = delete; + BufferView& operator=(const BufferView&) = delete; + + // Movable + BufferView(BufferView&&) = default; + BufferView& operator=(BufferView&&) = default; + + view_type& getView() { return view_m; } + const view_type& getView() const { return view_m; } + + T* data() { return view_m.data(); } + const T* data() const { return view_m.data(); } + size_t size() const { return view_m.extent(0); } + + private: + view_type view_m; + }; + + /** + * @brief Allocates a single buffer and provides multiple aligned views into it. + * + * This class manages its own memory via a Kokkos View, completely independent + * from MPI buffer management. Use this for temporary computation buffers + * that don't need MPI communication. + * + * @tparam MemorySpace The Kokkos memory space for the buffer + */ + template + class MultiViewBuffer { + public: + using buffer_view_type = Kokkos::View; + + static constexpr size_t DEFAULT_ALIGNMENT = 256; + + /** + * @brief Construct without allocation. Call allocate() before getting views. + */ + MultiViewBuffer() + : currentOffset_m(0) {} + + /** + * @brief Construct and allocate buffer of specified size. + * @param totalBytes Total size in bytes + * @param label Optional Kokkos label for debugging + */ + explicit MultiViewBuffer(size_t totalBytes, const std::string& label = "MultiViewBuffer") + : buffer_m(label, totalBytes) + , currentOffset_m(0) {} + + ~MultiViewBuffer() = default; + + // Non-copyable + MultiViewBuffer(const MultiViewBuffer&) = delete; + MultiViewBuffer& operator=(const MultiViewBuffer&) = delete; + + // Movable + MultiViewBuffer(MultiViewBuffer&& other) noexcept + : buffer_m(std::move(other.buffer_m)) + , currentOffset_m(other.currentOffset_m) { + other.currentOffset_m = 0; + } + + MultiViewBuffer& operator=(MultiViewBuffer&& other) noexcept { + if (this != &other) { + buffer_m = std::move(other.buffer_m); + currentOffset_m = other.currentOffset_m; + other.currentOffset_m = 0; + } + return *this; + } + + /** + * @brief Allocate or reallocate the underlying buffer. + * @param totalBytes Total size in bytes + * @param label Optional Kokkos label for debugging + */ + void allocate(size_t totalBytes, const std::string& label = "MultiViewBuffer") { + buffer_m = buffer_view_type(label, totalBytes); + currentOffset_m = 0; + } + + /** + * @brief Get an aligned view of count elements of type T from the buffer. + * + * Views are allocated sequentially from the buffer. Each view starts at + * an address aligned to DEFAULT_ALIGNMENT (256 bytes). + * + * @tparam T Element type for the view + * @param count Number of elements + * @return Unmanaged Kokkos::View pointing into the buffer + */ + template + Kokkos::View> + getView(size_t count) { + using view_type = Kokkos::View>; + + size_t alignedOffset = alignUp(currentOffset_m, DEFAULT_ALIGNMENT); + size_t requiredSize = alignedOffset + count * sizeof(T); + + if (requiredSize > buffer_m.size()) { + throw std::runtime_error( + "MultiViewBuffer: insufficient space. Required: " + + std::to_string(requiredSize) + + ", Available: " + std::to_string(buffer_m.size())); + } + + T* viewPtr = reinterpret_cast(buffer_m.data() + alignedOffset); + currentOffset_m = alignedOffset + count * sizeof(T); + + return view_type(viewPtr, count); + } + + /** + * @brief Reset the allocation offset to reuse the buffer for new views. + * + * Previously obtained views become invalid after this call. + */ + void reset() { currentOffset_m = 0; } + + /** + * @brief Check if buffer has been allocated. + */ + bool isAllocated() const { return buffer_m.size() > 0; } + + /** + * @brief Get the current used size in bytes. + */ + size_t getUsedSize() const { return currentOffset_m; } + + /** + * @brief Get the total available size in bytes. + */ + size_t getTotalSize() const { return buffer_m.size(); } + + /** + * @brief Get remaining available bytes. + */ + size_t getRemainingSize() const { return getTotalSize() - currentOffset_m; } + + /** + * @brief Get raw pointer to buffer start. + */ + char* data() { return buffer_m.data(); } + const char* data() const { return buffer_m.data(); } + + private: + buffer_view_type buffer_m; + size_t currentOffset_m = 0; + }; + + /** + * @brief Helper struct for recursive buffer size computation + */ + template + struct BufferSizeComputer; + + // Base case: single type + template + struct BufferSizeComputer { + static constexpr size_t compute(size_t offset, size_t count) { + return alignUp(offset, MultiViewBuffer<>::DEFAULT_ALIGNMENT) + count * sizeof(T); + } + }; + + // Recursive case: multiple types + template + struct BufferSizeComputer { + template + static constexpr size_t compute(size_t offset, size_t count, Counts... counts) { + static_assert(sizeof...(Rest) == sizeof...(Counts), + "Number of remaining types must match remaining counts"); + size_t alignedOffset = alignUp(offset, MultiViewBuffer<>::DEFAULT_ALIGNMENT); + size_t newOffset = alignedOffset + count * sizeof(T); + return BufferSizeComputer::compute(newOffset, counts...); + } + }; + + /** + * @brief Compute total buffer size needed for multiple views with alignment. + * + * Usage: auto size = computeBufferSize(100, 200, 50); + * + * @tparam Ts Element types for each view + * @param counts Number of elements for each view + * @return Total required buffer size in bytes + */ + template + constexpr size_t computeBufferSize(Counts... counts) { + static_assert(sizeof...(Ts) == sizeof...(Counts), + "Number of types must match number of counts"); + return BufferSizeComputer::compute(0, static_cast(counts)...); + } + +} // namespace ippl + +#endif // IPPL_BUFFERVIEW_H \ No newline at end of file diff --git a/src/Utility/IpplTimings.cpp b/src/Utility/IpplTimings.cpp index e0f65f36f..93fd9447b 100644 --- a/src/Utility/IpplTimings.cpp +++ b/src/Utility/IpplTimings.cpp @@ -28,7 +28,10 @@ #include #include +#include #include +#include +#include #include "Utility/Inform.h" #include "Utility/IpplInfo.h" @@ -53,8 +56,10 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t); } #endif +// Static member initialization Timing* IpplTimings::instance = new Timing(); std::stack IpplTimings::stashedInstance; +const std::vector Timing::emptyMeasurements; Timing::Timing() : TimerList() @@ -113,6 +118,111 @@ void Timing::clearTimer(TimerRef t) { TimerList[t]->clear(); } +// Reset all timers - useful for warmup +void Timing::resetAllTimers() { + for (unsigned int i = 0; i < TimerList.size(); ++i) { + TimerList[i]->clearAll(); + } +} +// Get measurements for a specific timer by reference +const std::vector& Timing::getMeasurements(TimerRef t) const { + if (t >= TimerList.size()) + return emptyMeasurements; + return TimerList[t]->measurements; +} + +// Get measurements for a specific timer by name +const std::vector& Timing::getMeasurements(const std::string& name) const { + TimerMap_t::const_iterator loc = TimerMap.find(name); + if (loc == TimerMap.end()) + return emptyMeasurements; + return loc->second->measurements; +} + +// Get measurement count for a timer +size_t Timing::getMeasurementCount(TimerRef t) const { + if (t >= TimerList.size()) + return 0; + return TimerList[t]->measurements.size(); +} + +// Get all timer names +std::vector Timing::getTimerNames() const { + std::vector names; + names.reserve(TimerList.size()); + for (unsigned int i = 0; i < TimerList.size(); ++i) { + names.push_back(TimerList[i]->name); + } + return names; +} + +// Dump all measurements to CSV (default format) +void Timing::dumpToCSV(const std::string& filename) { + dumpToCSV(filename, ",", true); +} + +// Dump all measurements to CSV with custom options +void Timing::dumpToCSV(const std::string& filename, const std::string& delimiter, + bool includeHeader) { + const int rank = ippl::Comm->rank(); + const int numRanks = ippl::Comm->size(); + + // Each rank serialises its measurements into a local buffer. + std::ostringstream localData; + for (unsigned int i = 0; i < TimerList.size(); ++i) { + TimerInfo* tptr = TimerList[i].get(); + const std::string& timerName = tptr->name; + const std::vector& measurements = tptr->measurements; + for (size_t j = 0; j < measurements.size(); ++j) { + localData << timerName << delimiter << rank << delimiter << j << delimiter + << std::setprecision(12) << measurements[j] << "\n"; + } + } + const std::string localStr = localData.str(); + + // Gather sizes from every rank using a single collective rather than + // O(P) point-to-point pairs. + MPI_Aint localSize = static_cast(localStr.size()); + std::vector sizes(numRanks, 0); + MPI_Gather(&localSize, 1, MPI_AINT, sizes.data(), 1, MPI_AINT, 0, + ippl::Comm->getCommunicator()); + + // Compute displacements and total size on rank 0; all ranks supply their + // bytes via MPI_Gatherv. + std::vector intSizes(numRanks, 0); + std::vector displs(numRanks, 0); + std::vector all; + if (rank == 0) { + long long total = 0; + for (int r = 0; r < numRanks; ++r) { + // The MPI standard limits Gatherv counts to int. Real CSV blobs + // never exceed INT_MAX in practice; refuse rather than silently + // truncate if they do. + if (sizes[r] > static_cast(std::numeric_limits::max())) { + std::cerr << "Timing::dumpToCSV: rank " << r << " has " << sizes[r] + << " bytes (> INT_MAX); truncating\n"; + } + intSizes[r] = static_cast(std::min( + sizes[r], static_cast(std::numeric_limits::max()))); + displs[r] = static_cast(total); + total += intSizes[r]; + } + all.resize(static_cast(total)); + } + + MPI_Gatherv(localStr.data(), static_cast(localStr.size()), MPI_CHAR, all.data(), + intSizes.data(), displs.data(), MPI_CHAR, 0, ippl::Comm->getCommunicator()); + + if (rank == 0) { + std::ofstream outFile(filename); + if (includeHeader) { + outFile << "timer_name" << delimiter << "rank" << delimiter << "measurement_id" + << delimiter << "duration_seconds" << "\n"; + } + outFile.write(all.data(), static_cast(all.size())); + } +} + // print out the timing results void Timing::print() { if (TimerList.size() < 1) @@ -153,8 +263,55 @@ void Timing::print() { << std::string().assign(20, ' ') << " Wall min = " << std::setw(10) << wallmin << "\n" << "\n"; } + + msg << "---------------------------------------------\n"; + msg << " Measurement counts:\n"; + msg << "---------------------------------------------\n"; + for (unsigned int i = 0; i < TimerList.size(); ++i) { + TimerInfo* tptr = TimerList[i].get(); + size_t lengthName = std::min(tptr->name.length(), 19lu); + msg << tptr->name.substr(0, lengthName) << std::string().assign(20 - lengthName, '.') + << " Count = " << std::setw(10) << tptr->measurements.size() << "\n"; + } + msg << "---------------------------------------------"; msg << endl; + + // Per-occurrence statistics + msg << "---------------------------------------------\n"; + msg << " Per-occurrence statistics:\n"; + msg << "---------------------------------------------\n"; + for (unsigned int i = 0; i < TimerList.size(); ++i) { + TimerInfo* tptr = TimerList[i].get(); + const std::vector& m = tptr->measurements; + size_t count = m.size(); + if (count == 0) continue; + + double sum = 0.0, minVal = m[0], maxVal = m[0]; + for (size_t j = 0; j < count; ++j) { + sum += m[j]; + if (m[j] < minVal) minVal = m[j]; + if (m[j] > maxVal) maxVal = m[j]; + } + double mean = sum / count; + + double varSum = 0.0; + for (size_t j = 0; j < count; ++j) { + double diff = m[j] - mean; + varSum += diff * diff; + } + double stddev = (count > 1) ? std::sqrt(varSum / (count - 1)) : 0.0; + + size_t lengthName = std::min(tptr->name.length(), 19lu); + msg << tptr->name.substr(0, lengthName) << std::string().assign(20 - lengthName, '.') + << " n=" << std::setw(5) << count + << " mean=" << std::setw(10) << mean + << " std=" << std::setw(10) << stddev + << " min=" << std::setw(10) << minVal + << " max=" << std::setw(10) << maxVal + << "\n"; + } + msg << "---------------------------------------------" << endl; } // save the timing results into a file @@ -234,4 +391,4 @@ void IpplTimings::pop() { delete instance; instance = stashedInstance.top(); stashedInstance.pop(); -} +} \ No newline at end of file diff --git a/src/Utility/IpplTimings.h b/src/Utility/IpplTimings.h index aea1332cd..82230faea 100644 --- a/src/Utility/IpplTimings.h +++ b/src/Utility/IpplTimings.h @@ -21,6 +21,14 @@ // 4) print out the results: // IpplTimings::print(); // +// 5) reset all timers (for warmup): +// IpplTimings::resetAllTimers(); +// Clears all accumulated times and individual measurements. +// +// 6) export to CSV for violin plots: +// IpplTimings::dumpToCSV("output.csv"); +// Exports all individual timing measurements for statistical analysis. +// #ifndef IPPL_TIMINGS_H #define IPPL_TIMINGS_H @@ -45,7 +53,8 @@ class IpplTimerInfo { IpplTimerInfo() : name("") , wallTime(0.0) - , indx(std::numeric_limits::max()) { + , indx(std::numeric_limits::max()) + , measurements() { clear(); } @@ -55,18 +64,19 @@ class IpplTimerInfo { // timer operations void start() { if (!running) { - running = true; - t.stop(); - t.clear(); + t.clear(); // running was false, no need to stop first t.start(); + running = true; } } void stop() { if (running) { t.stop(); - running = false; - wallTime += t.elapsed(); + running = false; + const double elapsed = t.elapsed(); + wallTime += elapsed; + measurements.push_back(elapsed); } } @@ -76,6 +86,12 @@ class IpplTimerInfo { running = false; } + void clearAll() { + clear(); + wallTime = 0.0; + measurements.clear(); + } + // the IPPL timer that this object manages Timer t; @@ -90,6 +106,8 @@ class IpplTimerInfo { // an index value for this timer TimerRef indx; + + std::vector measurements; }; struct Timing { @@ -126,6 +144,20 @@ struct Timing { // print the results to a file void print(const std::string& fn, const std::map& problemSize); + void resetAllTimers(); + + // Format: timer_name,rank,measurement_id,duration + void dumpToCSV(const std::string& filename); + + void dumpToCSV(const std::string& filename, const std::string& delimiter, bool includeHeader); + + const std::vector& getMeasurements(TimerRef t) const; + const std::vector& getMeasurements(const std::string& name) const; + + size_t getMeasurementCount(TimerRef t) const; + + std::vector getTimerNames() const; + // type of storage for list of TimerInfo typedef std::vector > TimerList_t; typedef std::map TimerMap_t; @@ -136,6 +168,9 @@ struct Timing { // a map of timers, keyed by string TimerMap_t TimerMap; + + // Returned by reference when a requested timer is not found. + static const std::vector emptyMeasurements; }; class IpplTimings { @@ -173,6 +208,36 @@ class IpplTimings { static void stash(); static void pop(); + // Use this after warmup iterations to get clean timing data + static void resetAllTimers() { instance->resetAllTimers(); } + + // Format: timer_name,rank,measurement_id,duration_seconds + // Perfect for creating violin plots in Python/R + static void dumpToCSV(const std::string& filename) { + instance->dumpToCSV(filename); + } + + static void dumpToCSV(const std::string& filename, + const std::string& delimiter, + bool includeHeader = true) { + instance->dumpToCSV(filename, delimiter, includeHeader); + } + + static const std::vector& getMeasurements(TimerRef t) { + return instance->getMeasurements(t); + } + static const std::vector& getMeasurements(const std::string& name) { + return instance->getMeasurements(name); + } + + static size_t getMeasurementCount(TimerRef t) { + return instance->getMeasurementCount(t); + } + + static std::vector getTimerNames() { + return instance->getTimerNames(); + } + private: // type of storage for list of TimerInfo typedef Timing::TimerList_t TimerList_t; @@ -188,4 +253,4 @@ class IpplTimings { static std::stack stashedInstance; }; -#endif +#endif \ No newline at end of file diff --git a/src/Utility/ParallelDispatch.h b/src/Utility/ParallelDispatch.h index 437c56c22..4e342aa02 100644 --- a/src/Utility/ParallelDispatch.h +++ b/src/Utility/ParallelDispatch.h @@ -7,6 +7,7 @@ #define IPPL_PARALLEL_DISPATCH_H #include +#include "Ippl.h" #include @@ -68,8 +69,8 @@ namespace ippl { } return policy_type(begin, end); } - // Silences incorrect nvcc warning: missing return statement at end of non-void function - throw IpplException("detail::getRangePolicy", "Unreachable state"); + // Silences nvcc "missing return" at end of exhaustive if constexpr. + __builtin_unreachable(); } /*! @@ -94,8 +95,58 @@ namespace ippl { } else { return policy_type(begin, end); } - // Silences incorrect nvcc warning: missing return statement at end of non-void function - throw IpplException("detail::createRangePolicy", "Unreachable state"); + // Silences nvcc "missing return" at end of exhaustive if constexpr. + __builtin_unreachable(); + } + + template + KOKKOS_FORCEINLINE_FUNCTION void for_constexpr(std::integer_sequence, F&& f) { + (f.template operator()(), ...); + } + + template + KOKKOS_FORCEINLINE_FUNCTION auto product_over(std::integer_sequence, F&& f) { + return (f.template operator()() * ...); + } + + template + KOKKOS_FORCEINLINE_FUNCTION auto product_over(F&& f) { + return product_over(std::make_integer_sequence{}, std::forward(f)); + } + + template + KOKKOS_FORCEINLINE_FUNCTION void thread_vector_md_for(const Team& team, const Extents& extents, + F&& f) { + [&](std::integer_sequence) { + Kokkos::parallel_for( + Kokkos::ThreadVectorMDRange, Team>(team, extents[Is]...), + std::forward(f)); + }(std::make_integer_sequence{}); + } + + template + KOKKOS_FORCEINLINE_FUNCTION void thread_vector_stencil_for(const Team& team, F&& f) { + constexpr int TotalCells = [] { + int r = 1; + for (int i = 0; i < Dim; ++i) + r *= W; + return r; + }(); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, TotalCells), [&](const int flat_idx) { + // Compile-time unrollable index decomposition + Kokkos::Array idx; + int tmp = flat_idx; + for (int d = Dim - 1; d >= 0; --d) { + idx[d] = tmp % W; + tmp /= W; + } + + // Call with index pack + [&](std::integer_sequence) { + f(idx[Is]...); + }(std::make_integer_sequence{}); + }); } namespace detail { @@ -121,13 +172,50 @@ namespace ippl { enum e_functor_type { FOR, - REDUCE, - SCAN + REDUCE }; template struct FunctorWrapper; + template + constexpr bool inline isGPUSpace = false; + +#ifdef KOKKOS_ENABLE_CUDA + template <> + constexpr bool inline isGPUSpace = true; +#endif +#ifdef KOKKOS_ENABLE_HIP + template <> + constexpr bool inline isGPUSpace = true; +#endif + + // Dispatches F(i) for i in [0, n) either in parallel (OpenMP host) + // when MPI_THREAD_MULTIPLE is available, or serially otherwise. + // `f(i)` must be synchronous w.r.t. its own work — this dispatcher + // fences after the parallel_for so the caller's next operation sees + // a consistent state. + template + void parallelForMPI(size_t n, F&& f) { + constexpr bool useGPU = isGPUSpace; + const bool threadSafe = Env->threadMultiple(); + + if constexpr (useGPU) { + if (threadSafe) { + Kokkos::parallel_for( + "Parallel dispatch", + Kokkos::RangePolicy(0, n), [&](int i) { + f(i); + }); + Kokkos::fence(); + return; + } + } + for (size_t i = 0; i < n; ++i) { + f(i); + } + } + /*! * Wrapper struct for reduction kernels * Source: @@ -179,9 +267,7 @@ namespace ippl { static constexpr int rank = Kokkos::MDRangePolicy::rank; }; template - concept HasMemberValueType = requires() { - { typename T::value_type() }; - }; + concept HasMemberValueType = requires { typename T::value_type; }; template struct ExtractReducerReturnType { using type = T; @@ -227,6 +313,31 @@ namespace ippl { functor), std::forward(reducer)...); } + + template + KOKKOS_FORCEINLINE_FUNCTION auto get_arg(T first, Rest... rest) { + if constexpr (I == 0) { + return first; + } else { + return get_arg(rest...); + } + } + + template + KOKKOS_FORCEINLINE_FUNCTION decltype(auto) grid_at(Grid& grid, const Base& base, int team_rank, + Stencils... stencil_idx) { + return [&](std::index_sequence) -> decltype(auto) { + return grid((base(team_rank, Is) + get_arg(stencil_idx...))...); + }(std::index_sequence_for{}); + } + + template + KOKKOS_FORCEINLINE_FUNCTION decltype(auto) grid_at_t(Grid& grid, const Base& base, + int team_rank, Stencils& stencil_idx) { + return [&](std::index_sequence) -> decltype(auto) { + return grid((base(team_rank, Is) + stencil_idx[Is])...); + }(std::make_index_sequence{}); + } } // namespace ippl #endif diff --git a/src/Utility/ParameterList.h b/src/Utility/ParameterList.h index d31f46d5d..ad9a2da11 100644 --- a/src/Utility/ParameterList.h +++ b/src/Utility/ParameterList.h @@ -64,6 +64,8 @@ namespace ippl { return std::get(params_m.at(key)); } + bool contains(const std::string& key) const { return params_m.contains(key); } + /*! * Obtain the value of a parameter. If the key is * not contained, the default value is returned. diff --git a/src/Utility/Timer.cpp b/src/Utility/Timer.cpp index d6da687bd..de2b4335e 100644 --- a/src/Utility/Timer.cpp +++ b/src/Utility/Timer.cpp @@ -8,8 +8,17 @@ #include "Timer.h" +#include + bool Timer::enableFences = IPPL_ENABLE_TIMER_FENCES; +namespace { + using clock_type = std::chrono::high_resolution_clock; + inline std::int64_t now_ticks() { + return clock_type::now().time_since_epoch().count(); + } +} // namespace + Timer::Timer() { this->clear(); } @@ -19,16 +28,22 @@ void Timer::clear() { } void Timer::start() { - start_m = std::chrono::high_resolution_clock::now(); + // Fence on start as well: any in-flight kernel from before the timer was + // armed otherwise leaks its tail latency into the measured interval. + if (enableFences) { + Kokkos::fence("Timer Fence (start)"); + } + start_m = now_ticks(); } void Timer::stop() { if (enableFences) { - Kokkos::fence(); + Kokkos::fence("Timer Fence (stop)"); } - stop_m = std::chrono::high_resolution_clock::now(); + stop_m = now_ticks(); - duration_type elapsed = stop_m - start_m; + std::chrono::duration elapsed = + clock_type::duration(stop_m) - clock_type::duration(start_m); elapsed_m += elapsed.count(); } diff --git a/src/Utility/Timer.h b/src/Utility/Timer.h index a23d26f28..653eae588 100644 --- a/src/Utility/Timer.h +++ b/src/Utility/Timer.h @@ -8,17 +8,14 @@ #define IPPL_TIMER_H #ifndef IPPL_ENABLE_TIMER_FENCES -#warning "IPPL timer fences were not set via CMake! Defaulting to no fences." +#pragma message("IPPL_ENABLE_TIMER_FENCES not set by CMake; defaulting to false.") #define IPPL_ENABLE_TIMER_FENCES false #endif -#include +#include class Timer { public: - using timer_type = std::chrono::time_point; - using duration_type = std::chrono::duration; - static bool enableFences; Timer(); @@ -31,7 +28,7 @@ class Timer { private: double elapsed_m; - timer_type start_m, stop_m; + std::int64_t start_m = 0, stop_m = 0; }; #endif diff --git a/src/Utility/Tuning.h b/src/Utility/Tuning.h new file mode 100644 index 000000000..8f5177b69 --- /dev/null +++ b/src/Utility/Tuning.h @@ -0,0 +1,213 @@ +#ifndef IPPL_TUNING_H +#define IPPL_TUNING_H + +#include +#include +#include +#include +#include + +// Kokkos's tuning interface is still exposed only through this internal +// header in 5.0.x. When a public header (Kokkos_Tools.hpp) becomes +// available across the supported version range, switch to it. +#include + +namespace ippl { + +template +class TileSizeTuner { +public: + using TileConfig = TileType; + +private: + // Tuning parameters (following Kokkos convention) + static constexpr double tuning_min = 0.0; + static constexpr double tuning_max = 1.0; + static constexpr double tuning_step = 0.05; // 20 steps per dimension + + std::array variable_ids_{}; + size_t context_id_ = 0; + bool initialized_ = false; + bool context_active_ = false; + + std::vector candidates_; + TileConfig default_tile_; + size_t max_scratch_ = 0; + std::function scratch_calc_; + +public: + TileSizeTuner() = default; + + template + void initialize(const std::string& kernel_name, + const std::vector& candidates, + size_t max_scratch, + ScratchCalculator&& calc, + const TileConfig& default_tile) { + if (initialized_) return; + + candidates_ = candidates; + max_scratch_ = max_scratch; + scratch_calc_ = std::forward(calc); + default_tile_ = default_tile; + + // Sort candidates for consistent mapping + std::sort(candidates_.begin(), candidates_.end()); + + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + using namespace Kokkos::Tools::Experimental; + + for (unsigned d = 0; d < Dim; ++d) { + VariableInfo info; + info.type = ValueType::kokkos_value_double; + info.category = StatisticalCategory::kokkos_value_interval; + info.valueQuantity = CandidateValueType::kokkos_value_range; + info.candidates = make_candidate_range( + tuning_min, tuning_max, tuning_step, false, false); + + variable_ids_[d] = declare_output_type( + kernel_name + "_tile_size_" + std::to_string(d), info); + } + } + + initialized_ = true; + } + + bool is_initialized() const { return initialized_; } + + // Map normalized value [0,1] to candidate index. With no candidates the + // tuner has nothing to choose from, so return 1 as a defensive default. + int map_to_candidate(double normalized) const { + if (candidates_.empty()) return 1; + + normalized = std::clamp(normalized, 0.0, 1.0); + + size_t idx = static_cast(normalized * (candidates_.size() - 1) + 0.5); + idx = std::min(idx, candidates_.size() - 1); + + return candidates_[idx]; + } + + // Map candidate value back to normalized [0,1]. Returns 0.0 (matching the + // behaviour of an empty/single-element candidate set) when the candidate + // list cannot meaningfully be mapped. + double map_to_normalized(int candidate) const { + if (candidates_.size() <= 1) return 0.0; + + auto it = std::lower_bound(candidates_.begin(), candidates_.end(), candidate); + size_t idx = (it != candidates_.end()) ? std::distance(candidates_.begin(), it) + : candidates_.size() - 1; + + return static_cast(idx) / (candidates_.size() - 1); + } + + TileConfig begin() { + if (!initialized_) { + return default_tile_; + } + + TileConfig result = default_tile_; + + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + using namespace Kokkos::Tools::Experimental; + + context_id_ = get_new_context_id(); + begin_context(context_id_); + context_active_ = true; + + // Request tuned values for each dimension + std::array values; + for (unsigned d = 0; d < Dim; ++d) { + double default_norm = map_to_normalized(default_tile_[d]); + values[d] = make_variable_value(variable_ids_[d], default_norm); + } + + request_output_values(context_id_, Dim, values.data()); + + // Map normalized values back to tile sizes + for (unsigned d = 0; d < Dim; ++d) { + result[d] = map_to_candidate(values[d].value.double_value); + } + + // Validate scratch constraint - scale down if needed + result = fit_to_scratch(result); + } + + return result; + } + + void end() { + if (context_active_ && Kokkos::Tools::Experimental::have_tuning_tool()) { + Kokkos::Tools::Experimental::end_context(context_id_); + context_active_ = false; + } + } + +private: + // Scale down proportionally if tile doesn't fit in scratch. + TileConfig fit_to_scratch(const TileConfig& tile) const { + if (scratch_calc_(tile) <= max_scratch_) { + return tile; + } + + // Binary search for the largest scale factor that fits. The result + // space is integer-valued (we snap to a discrete candidate after + // scaling), so the number of meaningful iterations is bounded by + // log2(max tile dim), capped at 20 for robustness. + int max_tile = 1; + for (unsigned d = 0; d < Dim; ++d) { + max_tile = std::max(max_tile, tile[d]); + } + int max_iter = std::min(20, static_cast(std::ceil(std::log2(max_tile + 1)) + 1)); + + TileConfig result = tile; + double lo = 0.0, hi = 1.0; + for (int iter = 0; iter < max_iter; ++iter) { + double mid = (lo + hi) / 2.0; + + TileConfig test; + for (unsigned d = 0; d < Dim; ++d) { + int scaled = static_cast(tile[d] * mid); + test[d] = snap_to_candidate(std::max(1, scaled)); + } + + if (scratch_calc_(test) <= max_scratch_) { + lo = mid; + result = test; + } else { + hi = mid; + } + } + + // Final fallback - smallest candidate. If even that overflows the + // scratch budget the kernel cannot run; throw so the caller doesn't + // get a confusing Kokkos::abort deeper in the dispatch. + if (scratch_calc_(result) > max_scratch_) { + for (unsigned d = 0; d < Dim; ++d) { + result[d] = candidates_.front(); + } + if (scratch_calc_(result) > max_scratch_) { + throw std::runtime_error( + "TileSizeTuner::fit_to_scratch: even the smallest tile " + "exceeds the scratch budget"); + } + } + + return result; + } + + int snap_to_candidate(int value) const { + if (candidates_.empty()) return value; + + // Find largest candidate <= value + auto it = std::upper_bound(candidates_.begin(), candidates_.end(), value); + if (it == candidates_.begin()) { + return candidates_.front(); + } + return *std::prev(it); + } +}; + +} // namespace ippl + +#endif // IPPL_TUNING_H \ No newline at end of file diff --git a/src/Utility/TypeUtils.h b/src/Utility/TypeUtils.h index 9b47a3729..bc888e6d8 100644 --- a/src/Utility/TypeUtils.h +++ b/src/Utility/TypeUtils.h @@ -295,10 +295,9 @@ namespace ippl { } else { return spaceToIndex(); } - // Silences incorrect nvcc warning: missing return statement at end of non-void - // function - throw IpplException("detail::MultispaceContainer::spaceToIndex", - "Unreachable state"); + // Suppress nvcc "missing return" warning at the end of an + // exhaustive if constexpr chain. + __builtin_unreachable(); } /*! @@ -311,18 +310,16 @@ namespace ippl { /*! * Determine whether the element for a space should be initialized, - * possibly based on a predicate functor + * possibly based on a predicate functor. A nullptr_t Filter + * unconditionally enables the copy. */ - template >, int> = 0> - constexpr bool copyToSpace(Filter&&) { - return true; - } - - template >, int> = 0> + template bool copyToSpace(Filter&& predicate) { - return predicate.template operator()(); + if constexpr (std::is_null_pointer_v>) { + return true; + } else { + return predicate.template operator()(); + } } public: @@ -341,10 +338,22 @@ namespace ippl { template MultispaceContainer(const DataType& data, Filter&& predicate = nullptr) : MultispaceContainer() { - using space = typename DataType::memory_space; - static_assert(std::is_same_v>); - - elements_m[spaceToIndex()] = data; + static_assert(Kokkos::is_view::value, "DataType must be a Kokkos::View"); + + using space = typename DataType::memory_space; + using expected = Type; + + static_assert(std::is_same_v, + "Hash view must live in the same memory_space as expected."); + static_assert(std::is_same_v, + "Hash view must have the same value_type as expected."); + static_assert(DataType::rank == expected::rank, + "Hash view must have the same rank as expected."); + + expected normalized = data; + elements_m[spaceToIndex()] = normalized; copyToOtherSpaces(predicate); } @@ -433,6 +442,40 @@ namespace ippl { runner(all_spaces{}); } } // namespace detail + + template + struct is_complex : std::false_type {}; + + template + struct is_complex> : std::true_type {}; + + template + inline constexpr bool is_complex_v = is_complex::value; + + template + KOKKOS_FORCEINLINE_FUNCTION decltype(auto) real_part(T& val) { + if constexpr (is_complex_v>) { + return val.real(); + } else { + return val; + } + } + + template + KOKKOS_FORCEINLINE_FUNCTION decltype(auto) to_grid_value(T& val) { + if constexpr (is_complex_v> && !is_complex_v>) { + return val.real(); + } else { + return val; + } + } + + constexpr int int_pow(int base, int exp) { + int result = 1; + for (int i = 0; i < exp; ++i) + result *= base; + return result; + } } // namespace ippl #endif diff --git a/src/Utility/ViewUtils.h b/src/Utility/ViewUtils.h index 6ef5e8906..dbb36b2e7 100644 --- a/src/Utility/ViewUtils.h +++ b/src/Utility/ViewUtils.h @@ -85,8 +85,7 @@ namespace ippl { template void write(const typename ViewType::view_type& view, std::ostream& out = std::cout) { - using view_type = typename ViewType::view_type; - typename view_type::host_mirror_type hview = Kokkos::create_mirror_view(view); + auto hview = Kokkos::create_mirror_view(view); Kokkos::deep_copy(hview, view); nestedViewLoop( diff --git a/unit_tests/Communicate/BufferHandler.cpp b/unit_tests/Communicate/BufferHandler.cpp index eef387da9..4612373e7 100644 --- a/unit_tests/Communicate/BufferHandler.cpp +++ b/unit_tests/Communicate/BufferHandler.cpp @@ -13,6 +13,14 @@ template class TypedBufferHandlerTest : public ::testing::Test { protected: using memory_space = MemorySpace; + static constexpr size_t page_size = 4096; + + static size_t roundedSize(size_t size) { + if (size < page_size) { + return page_size; + } + return ((size + page_size - 1) / page_size) * page_size; + } class TestableBufferHandler : public ippl::DefaultBufferHandler { public: @@ -37,7 +45,7 @@ TYPED_TEST_SUITE(TypedBufferHandlerTest, MemorySpaces); TYPED_TEST(TypedBufferHandlerTest, GetBuffer_EmptyFreeBuffers) { auto buffer = this->handler->getBuffer(100, 1.0); ASSERT_NE(buffer, nullptr); - EXPECT_EQ(buffer->getBufferSize(), 100); + EXPECT_EQ(buffer->getBufferSize(), TestFixture::roundedSize(100)); EXPECT_EQ(this->handler->usedBuffersSize(), 1); EXPECT_EQ(this->handler->freeBuffersSize(), 0); } @@ -48,7 +56,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetBuffer_SuitableBufferAvailable) { this->handler->freeBuffer(buffer1); auto buffer2 = this->handler->getBuffer(40, 1.0); - EXPECT_EQ(buffer2->getBufferSize(), 50); + EXPECT_EQ(buffer2->getBufferSize(), TestFixture::roundedSize(50)); EXPECT_EQ(this->handler->usedBuffersSize(), 1); EXPECT_EQ(this->handler->freeBuffersSize(), 0); } @@ -91,7 +99,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetBuffer_ResizeLargerThanAvailable) { this->handler->freeBuffer(smallBuffer); auto largeBuffer = this->handler->getBuffer(200, 1.0); - EXPECT_EQ(largeBuffer->getBufferSize(), 200); + EXPECT_EQ(largeBuffer->getBufferSize(), TestFixture::roundedSize(200)); EXPECT_EQ(this->handler->usedBuffersSize(), 1); EXPECT_EQ(this->handler->freeBuffersSize(), 0); } @@ -102,7 +110,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetBuffer_ExactSizeMatch) { this->handler->freeBuffer(buffer1); auto buffer2 = this->handler->getBuffer(100, 1.0); - EXPECT_EQ(buffer2->getBufferSize(), 100); + EXPECT_EQ(buffer2->getBufferSize(), TestFixture::roundedSize(100)); EXPECT_EQ(this->handler->usedBuffersSize(), 1); EXPECT_EQ(this->handler->freeBuffersSize(), 0); } @@ -147,7 +155,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_EmptyHandler) { // Test: Allocating increases used buffer size correctly TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_AfterBufferAllocation) { auto buffer = this->handler->getBuffer(100, 1.0); - EXPECT_EQ(this->handler->getUsedSize(), 100); + EXPECT_EQ(this->handler->getUsedSize(), buffer->getBufferSize()); EXPECT_EQ(this->handler->getFreeSize(), 0); } @@ -157,7 +165,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_AfterFreeBuffer) { this->handler->freeBuffer(buffer); EXPECT_EQ(this->handler->getUsedSize(), 0); - EXPECT_EQ(this->handler->getFreeSize(), 100); + EXPECT_EQ(this->handler->getFreeSize(), buffer->getBufferSize()); } // Test: Correct size is computed after freeing all buffers @@ -168,7 +176,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_AfterFreeAllBuffers) this->handler->freeAllBuffers(); EXPECT_EQ(this->handler->getUsedSize(), 0); - EXPECT_EQ(this->handler->getFreeSize(), 150); + EXPECT_EQ(this->handler->getFreeSize(), buffer1->getBufferSize() + buffer2->getBufferSize()); } // Test: Deleting all buffers results in zero free or used buffer sizes @@ -189,7 +197,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_ResizeBufferLargerTha auto largeBuffer = this->handler->getBuffer(200, 1.0); - EXPECT_EQ(this->handler->getUsedSize(), 200); + EXPECT_EQ(this->handler->getUsedSize(), largeBuffer->getBufferSize()); EXPECT_EQ(this->handler->getFreeSize(), 0); } diff --git a/unit_tests/Particle/CMakeLists.txt b/unit_tests/Particle/CMakeLists.txt index 18359b944..78c4e9bfe 100644 --- a/unit_tests/Particle/CMakeLists.txt +++ b/unit_tests/Particle/CMakeLists.txt @@ -5,3 +5,5 @@ add_ippl_test(ParticleBase) add_ippl_test(ParticleBC) add_ippl_test(ParticleSendRecv) add_ippl_test(GatherScatterTest) +add_ippl_test(ParticleUpdate) +add_ippl_test(ParticleUpdateNonuniform) diff --git a/unit_tests/Particle/ParticleBase.cpp b/unit_tests/Particle/ParticleBase.cpp index 276afc25a..b2229a456 100644 --- a/unit_tests/Particle/ParticleBase.cpp +++ b/unit_tests/Particle/ParticleBase.cpp @@ -78,6 +78,7 @@ TYPED_TEST(ParticleBaseTest, CreateAndDestroy) { pbase->destroy(invalid, 500); // Verify remaining indices + mirror = pbase->ID.getHostMirror(); Kokkos::deep_copy(mirror, pbase->ID.getView()); for (int i = 0; i < 500; ++i) { // The even indices contain the original particles diff --git a/unit_tests/Particle/ParticleSendRecv.cpp b/unit_tests/Particle/ParticleSendRecv.cpp index 9b57b85aa..28e8ee0ca 100644 --- a/unit_tests/Particle/ParticleSendRecv.cpp +++ b/unit_tests/Particle/ParticleSendRecv.cpp @@ -33,7 +33,7 @@ class ParticleSendRecv>> : public ::testing:: ~Bunch() = default; - using charge_container_type = ippl::ParticleAttrib; + using charge_container_type = ippl::ParticleAttrib; rank_type expectedRank; charge_container_type Q; @@ -64,10 +64,10 @@ class ParticleSendRecv>> : public ::testing:: origin[d] = 0; } - layout = flayout_type(MPI_COMM_WORLD, owned, isParallel); - mesh = mesh_type(owned, hx, origin); + layout = flayout_type(MPI_COMM_WORLD, owned, isParallel); + mesh = mesh_type(owned, hx, origin); playout_ptr = std::make_shared(layout, mesh); - bunch = std::make_shared(*playout_ptr); + bunch = std::make_shared(*playout_ptr); using BC = ippl::BC; @@ -109,7 +109,7 @@ class ParticleSendRecv>> : public ::testing:: using mdrange_type = Kokkos::MDRangePolicy, ExecSpace>; RegionLayout_t RLayout = playout_ptr->getRegionLayout(); - auto& positions = bunch->R.getView(); + auto positions = bunch->R.getView(); region_view Regions = RLayout.getdLocalRegions(); typename rank_type::view_type ER = bunch->expectedRank.getView(); diff --git a/unit_tests/Particle/ParticleUpdate.cpp b/unit_tests/Particle/ParticleUpdate.cpp new file mode 100644 index 000000000..f5faa73e8 --- /dev/null +++ b/unit_tests/Particle/ParticleUpdate.cpp @@ -0,0 +1,830 @@ +// +// Unit test: Particle spatial layout update / neighbor migration +// +// Tests that ippl::ParticleSpatialLayout::update() correctly migrates +// particles to the rank that owns the region containing the particle's +// position. The suite covers: +// +// 1. Baseline sanity – random particles already on correct rank. +// 2. Single deliberate out-of-bounds displacement (each axis / direction). +// 3. Corner / edge displacement (multi-axis simultaneous crossing). +// 4. Periodic wrap-around: particles placed just outside the global domain +// boundary reappear on the opposite rank. +// 5. All particles seeded on rank 0 and redistributed globally. +// 6. All particles seeded on the last rank and redistributed globally. +// 7. Non-uniform layouts: more cells assigned to one rank than the others, +// so that a single neighbour may span many grid cells. +// 8. Conservation: total particle count is invariant under update(). +// 9. Charge conservation: sum of Q is invariant under update(). +// 10. Multiple successive updates leave the layout stable. +// 11. Large particle count stress test. +// 12. Boundary-exact positions (particles sitting exactly on a rank +// boundary should belong to a well-defined rank, not be lost). +// 13. Zero-particle ranks: a rank with no particles participates correctly. +// 14. Heterogeneous displacement magnitudes in the same step. +// 15. 3-D corner migration (all three axes crossed simultaneously). +// +#include "Ippl.h" + +#include +#include +#include +#include +#include + +#include "TestUtils.h" +#include "gtest/gtest.h" + +// ============================================================ +// Helper types +// ============================================================ + +template +struct TestBunch + : public ippl::ParticleBase< + ippl::ParticleSpatialLayout, ExecSpace>> { + using playout_type = + ippl::ParticleSpatialLayout, ExecSpace>; + + explicit TestBunch(playout_type& pl) + : ippl::ParticleBase(pl) { + this->addAttribute(Q); + this->addAttribute(tag); + } + + // Per-particle charge (used for conservation checks) + ippl::ParticleAttrib Q; + + // Integer tag so we can track individual particles across ranks + ippl::ParticleAttrib tag; +}; + +// ============================================================ +// Fixture +// ============================================================ + +template +class TestParticleUpdate; + +template +class TestParticleUpdate>> : public ::testing::Test { +public: + // ---- type aliases -------------------------------------------------- + using T = T_; + static constexpr unsigned Dim = Dim_; + using flayout_type = ippl::FieldLayout; + using mesh_type = ippl::UniformCartesian; + using playout_type = ippl::ParticleSpatialLayout; + using RegionLayout_t = typename playout_type::RegionLayout_t; + using bunch_type = TestBunch; + using position_type = ippl::Vector; + using region_view = typename RegionLayout_t::view_type; + + // ---- construction -------------------------------------------------- + TestParticleUpdate() + : nPoints(getGridSizes()) { + for (unsigned d = 0; d < Dim; d++) { + domain[d] = static_cast(nPoints[d]) / 16.0; + } + + std::array args; + for (unsigned d = 0; d < Dim; d++) + args[d] = ippl::Index(nPoints[d]); + auto owned = std::make_from_tuple>(args); + + ippl::Vector hx, origin; + std::array isParallel; + isParallel.fill(true); + for (unsigned d = 0; d < Dim; d++) { + hx[d] = domain[d] / nPoints[d]; + origin[d] = 0; + } + + layout = std::make_shared(MPI_COMM_WORLD, owned, isParallel); + mesh = std::make_shared(owned, hx, origin); + playout_ptr = std::make_shared(*layout, *mesh); + } + + // ---- helpers ------------------------------------------------------- + + /// Create a fresh bunch and set periodic BCs. + std::shared_ptr makeBunch() { + auto b = std::make_shared(*playout_ptr); + typename bunch_type::bc_container_type bcs; + bcs.fill(ippl::BC::PERIODIC); + b->setParticleBC(bcs); + return b; + } + + /// Place n particles with uniformly random positions on every rank. + void fillRandom(bunch_type& b, unsigned n, unsigned long seed = 42) { + int nRanks = ippl::Comm->size(); + unsigned perRank = std::max(1u, n / nRanks); + b.create(perRank); + + std::mt19937_64 eng(seed + ippl::Comm->rank()); + std::uniform_real_distribution unif(T(0), T(1)); + + auto R_host = b.R.getHostMirror(); + auto Q_host = b.Q.getHostMirror(); + auto t_host = b.tag.getHostMirror(); + + // Global unique id = rank * kRankIdStride + local index. The stride + // bounds the per-rank particle count; 10M is comfortably above any + // unit-test bunch size and stays well clear of int32 overflow. + constexpr long long kRankIdStride = 10'000'000LL; + for (size_t i = 0; i < b.getLocalNum(); ++i) { + position_type r; + for (unsigned d = 0; d < Dim; d++) + r[d] = unif(eng) * domain[d]; + R_host(i) = r; + Q_host(i) = T(1); + t_host(i) = static_cast(ippl::Comm->rank()) * kRankIdStride + + static_cast(i); + } + Kokkos::deep_copy(b.R.getView(), R_host); + Kokkos::deep_copy(b.Q.getView(), Q_host); + Kokkos::deep_copy(b.tag.getView(), t_host); + } + + /// Reduce total particle count across all ranks. + size_t totalParticles(const bunch_type& b) const { + size_t local = b.getLocalNum(); + size_t total = 0; + ippl::Comm->reduce(local, total, 1, std::plus()); + return total; + } + + /// Reduce total charge across all ranks. + T totalCharge(bunch_type& b) const { + auto Q_host = b.Q.getHostMirror(); + Kokkos::deep_copy(Q_host, b.Q.getView()); + T local = T(0); + for (size_t i = 0; i < b.getLocalNum(); ++i) + local += Q_host(i); + T total = T(0); + ippl::Comm->reduce(local, total, 1, std::plus()); + return total; + } + + /// For every locally owned particle verify it lies inside the local region. + /// Returns the number of misplaced particles on this rank. + size_t countMisplaced(bunch_type& b) { + RegionLayout_t rl = playout_ptr->getRegionLayout(); + region_view regions = rl.getdLocalRegions(); + int myRank = ippl::Comm->rank(); + + auto R_host = b.R.getHostMirror(); + Kokkos::deep_copy(R_host, b.R.getView()); + + // Copy region bounds to host for checking + auto regions_host = Kokkos::create_mirror_view(regions); + Kokkos::deep_copy(regions_host, regions); + + size_t misplaced = 0; + for (size_t i = 0; i < b.getLocalNum(); ++i) { + bool inMyRegion = true; + for (unsigned d = 0; d < Dim; d++) { + T pos = R_host(i)[d]; + inMyRegion &= + (pos >= regions_host(myRank)[d].min() && pos <= regions_host(myRank)[d].max()); + } + if (!inMyRegion) + ++misplaced; + } + return misplaced; + } + + /// Wrap a coordinate periodically into [0, domainLen). + T periodicWrap(T x, T domainLen) const { + while (x < T(0)) + x += domainLen; + while (x >= domainLen) + x -= domainLen; + return x; + } + + // ---- data members -------------------------------------------------- + std::array nPoints; + std::array domain; + + std::shared_ptr layout; + std::shared_ptr mesh; + std::shared_ptr playout_ptr; +}; + +using Tests = TestParams::tests<1, 2, 3>; +TYPED_TEST_SUITE(TestParticleUpdate, Tests); + +// ============================================================ +// 1. Baseline: random particles already on the correct rank +// ============================================================ +TYPED_TEST(TestParticleUpdate, BaselineSanityNoMigrationNeeded) { + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + // update() should not lose or duplicate particles + const size_t before = this->totalParticles(*bunch); + bunch->update(); + const size_t after = this->totalParticles(*bunch); + + EXPECT_EQ(before, after); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 2. Conservation of total particle count +// ============================================================ +TYPED_TEST(TestParticleUpdate, TotalParticleCountConserved) { + constexpr unsigned N = 512; + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, N); + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 3. Conservation of total charge +// ============================================================ +TYPED_TEST(TestParticleUpdate, TotalChargeConserved) { + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + // Give every particle a distinct charge so any loss/duplication shows up + { + auto Q_host = bunch->Q.getHostMirror(); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + Q_host(i) = static_cast(i + 1); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const auto chargeBefore = this->totalCharge(*bunch); + bunch->update(); + const auto chargeAfter = this->totalCharge(*bunch); + + // Allow small floating-point rounding + EXPECT_NEAR(static_cast(chargeBefore), static_cast(chargeAfter), + 1e-6 * std::abs(static_cast(chargeBefore))); +} + +// ============================================================ +// 4. Single-axis displacement – particles must migrate to neighbour +// ============================================================ +TYPED_TEST(TestParticleUpdate, SingleAxisDisplacementMigratesCorrectly) { + using T = typename TestFixture::T; + + // Only meaningful with more than one rank + if (ippl::Comm->size() < 2) + GTEST_SKIP(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + + // Move every local particle half the domain length along axis 0 + // (wrapping with period). After update, everyone must be on the + // owning rank. + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + const T shift = this->domain[0] / T(2); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + R_host(i)[0] = this->periodicWrap(R_host(i)[0] + shift, this->domain[0]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 5. Multi-axis (corner) displacement +// ============================================================ +TYPED_TEST(TestParticleUpdate, CornerDisplacementMigratesCorrectly) { + using T = typename TestFixture::T; + + if (ippl::Comm->size() < 2) + GTEST_SKIP(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + + // Shift along every axis simultaneously + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) { + const T shift = this->domain[d] * T(0.3); + R_host(i)[d] = this->periodicWrap(R_host(i)[d] + shift, this->domain[d]); + } + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 6. Periodic wrap-around: particles placed just outside global domain +// ============================================================ +TYPED_TEST(TestParticleUpdate, PeriodicWrapAroundRightBoundary) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + + // Place one particle per rank just beyond the upper global boundary + // in axis 0; periodic BC should wrap it back. + bunch->create(1); + { + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + // Place at domain[0] + small epsilon → wraps to near 0 + T eps = this->domain[0] * T(1e-4); + R_host(0)[0] = this->domain[0] + eps; + for (unsigned d = 1; d < TestFixture::Dim; d++) + R_host(0)[d] = this->domain[d] * T(0.5); + Q_host(0) = T(1); + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 7. Periodic wrap-around: just outside lower global boundary +// ============================================================ +TYPED_TEST(TestParticleUpdate, PeriodicWrapAroundLeftBoundary) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + bunch->create(1); + { + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + T eps = this->domain[0] * T(1e-4); + // Slightly negative position wraps to near domain[0] + R_host(0)[0] = -eps; + for (unsigned d = 1; d < TestFixture::Dim; d++) + R_host(0)[d] = this->domain[d] * T(0.5); + Q_host(0) = T(1); + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 8. All particles seeded on rank 0 → global redistribution +// ============================================================ +TYPED_TEST(TestParticleUpdate, AllParticlesOnRank0Redistributed) { + using T = typename TestFixture::T; + + unsigned N = ippl::Comm->rank() == 0 ? 256 : 0; + auto bunch = this->makeBunch(); + + bunch->create(N); + if (ippl::Comm->rank() == 0) { + std::mt19937_64 eng(1234); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (unsigned i = 0; i < N; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + ASSERT_EQ(before, static_cast(N)); + + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 9. All particles seeded on last rank → global redistribution +// ============================================================ +TYPED_TEST(TestParticleUpdate, AllParticlesOnLastRankRedistributed) { + using T = typename TestFixture::T; + int lastRank = ippl::Comm->size() - 1; + unsigned N = ippl::Comm->rank() == lastRank ? 256 : 0; + auto bunch = this->makeBunch(); + bunch->create(N); + + if (ippl::Comm->rank() == lastRank) { + std::mt19937_64 eng(5678); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (unsigned i = 0; i < N; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 10. Multiple successive updates are idempotent +// ============================================================ +TYPED_TEST(TestParticleUpdate, MultipleSuccessiveUpdatesStable) { + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + + const size_t before = this->totalParticles(*bunch); + for (int iter = 0; iter < 5; ++iter) { + bunch->update(); + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); + } +} + +// ============================================================ +// 11. Successive updates after each random displacement +// ============================================================ +TYPED_TEST(TestParticleUpdate, SuccessiveDisplacementsConserveParticles) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + const size_t N = this->totalParticles(*bunch); + + std::mt19937_64 eng(999 + ippl::Comm->rank()); + std::uniform_real_distribution unif(-T(0.1), T(0.1)); + + for (int step = 0; step < 8; ++step) { + // Small random displacements (staying in domain with wrapping) + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T newPos = R_host(i)[d] + unif(eng) * this->domain[d]; + R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]); + } + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + + bunch->update(); + EXPECT_EQ(N, this->totalParticles(*bunch)) << "at step " << step; + } +} + +// ============================================================ +// 12. Boundary-exact positions: particles at rank boundaries +// ============================================================ +TYPED_TEST(TestParticleUpdate, BoundaryExactPositionsNotLost) { + using T = typename TestFixture::T; + + // Each rank creates one particle exactly at the domain's mid-point + // along axis 0 (a likely rank boundary for 2+ ranks). + auto bunch = this->makeBunch(); + bunch->create(1); + { + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + R_host(0)[0] = this->domain[0] / T(2); + for (unsigned d = 1; d < TestFixture::Dim; d++) + R_host(0)[d] = this->domain[d] / T(2); + Q_host(0) = T(1); + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + // The particle must survive, though ownership may shift + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 13. Zero-particle ranks participate correctly +// ============================================================ +TYPED_TEST(TestParticleUpdate, ZeroParticleRanksDoNotDeadlock) { + using T = typename TestFixture::T; + + // Only rank 0 creates particles + auto bunch = this->makeBunch(); + int N = ippl::Comm->rank() == 0 ? 64 : 0; + bunch->create(N); + + if (ippl::Comm->rank() == 0) { + std::mt19937_64 eng(7777); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (int i = 0; i < N; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + // This must not deadlock even if some ranks hold zero particles + ASSERT_NO_THROW(bunch->update()); + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 14. Heterogeneous displacement magnitudes (small + large mixed) +// ============================================================ +TYPED_TEST(TestParticleUpdate, HeterogeneousDisplacementsMigrateCorrectly) { + using T = typename TestFixture::T; + + if (ippl::Comm->size() < 2) + GTEST_SKIP(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + // Even-indexed particles get a tiny nudge; odd ones jump ~70%. + const T fraction = (i % 2 == 0) ? T(0.01) : T(0.7); + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T newPos = R_host(i)[d] + fraction * this->domain[d]; + R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]); + } + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 15. Large-count stress test +// ============================================================ +TYPED_TEST(TestParticleUpdate, LargeParticleCountConserved) { + constexpr unsigned N = 4096; + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, N, /*seed=*/31415); + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 16. Non-uniform particle distribution: load-imbalanced initial state +// ============================================================ +TYPED_TEST(TestParticleUpdate, NonUniformInitialDistributionRedistributesCorrectly) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + + // Rank r gets r+1 particles, placed uniformly at random. + unsigned nLocal = static_cast(ippl::Comm->rank() + 1); + bunch->create(nLocal); + { + std::mt19937_64 eng(2468 + ippl::Comm->rank()); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (unsigned i = 0; i < nLocal; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 17. Full-domain traversal: particle moves across the entire domain +// ============================================================ +TYPED_TEST(TestParticleUpdate, FullDomainTraversalConservesParticles) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + bunch->create(8); + { + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + // Place near origin + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = T(1e-5) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); // initial valid placement + + // Now jump every particle to the diagonally opposite corner + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T newPos = R_host(i)[d] + this->domain[d] * T(0.999); + R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]); + } + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + bunch->update(); + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 18. Check that tags (IDs) are preserved through migration +// ============================================================ +TYPED_TEST(TestParticleUpdate, ParticleTagsPreservedAfterMigration) { + using T = typename TestFixture::T; + + if (ippl::Comm->size() < 2) + GTEST_SKIP(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + + // Record all local tags and charges before update + std::vector> localBefore; + { + auto t_host = bunch->tag.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + Kokkos::deep_copy(t_host, bunch->tag.getView()); + Kokkos::deep_copy(Q_host, bunch->Q.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + localBefore.push_back({t_host(i), Q_host(i)}); + } + + // Displace so migration happens + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T newPos = R_host(i)[d] + this->domain[d] * T(0.4); + R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]); + } + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + bunch->update(); + + // Gather tags from all ranks and verify the global set is identical + // (no duplicates, no missing) + std::vector localTagsAfter; + { + auto t_host = bunch->tag.getHostMirror(); + Kokkos::deep_copy(t_host, bunch->tag.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + localTagsAfter.push_back(t_host(i)); + } + + // Each rank sorts its local list; then we check global count via reduce + size_t localCount = localTagsAfter.size(); + size_t globalCount = 0; + ippl::Comm->reduce(localCount, globalCount, 1, std::plus()); + + // Reconstruct original count + size_t originalCount = localBefore.size(); + size_t globalOriginal = 0; + ippl::Comm->reduce(originalCount, globalOriginal, 1, std::plus()); + + EXPECT_EQ(globalOriginal, globalCount); +} + +// ============================================================ +// 19. Periodic wrap: particle moved exactly one full domain length +// ============================================================ +TYPED_TEST(TestParticleUpdate, ExactOneDomainLengthShiftConservesParticles) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 64); + const size_t before = this->totalParticles(*bunch); + + // Shift by exactly one domain length: after wrapping, position unchanged. + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T newPos = R_host(i)[d] + this->domain[d]; + R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]); + } + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + bunch->update(); + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 20. Repeated create-and-update cycles (simulates particle injection) +// ============================================================ +TYPED_TEST(TestParticleUpdate, RepeatedCreateAndUpdateCycles) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + size_t cumulative = 0; + + for (int cycle = 0; cycle < 4; ++cycle) { + // Inject 32 more particles on rank 0 each cycle + constexpr unsigned inject = 32; + int N = ippl::Comm->rank() == 0 ? inject : 0; + size_t currentLocal = bunch->getLocalNum(); + bunch->create(N); + + if (ippl::Comm->rank() == 0) { + std::mt19937_64 eng(cycle * 100); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + Kokkos::deep_copy(Q_host, bunch->Q.getView()); + + for (unsigned i = currentLocal; i < currentLocal + inject; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + cumulative += inject; + } + ippl::Comm->barrier(); + + bunch->update(); + + size_t total = this->totalParticles(*bunch); + if (ippl::Comm->rank() == 0) { + EXPECT_EQ(cumulative, total) << "at cycle " << cycle; + } + EXPECT_EQ(0u, this->countMisplaced(*bunch)) << "at cycle " << cycle; + } +} + +// ============================================================ +// Entry point +// ============================================================ +int main(int argc, char* argv[]) { + int success = 1; + ippl::initialize(argc, argv); + { + ::testing::InitGoogleTest(&argc, argv); + success = RUN_ALL_TESTS(); + } + ippl::finalize(); + return success; +} \ No newline at end of file diff --git a/unit_tests/Particle/ParticleUpdateNonuniform.cpp b/unit_tests/Particle/ParticleUpdateNonuniform.cpp new file mode 100644 index 000000000..331e577c4 --- /dev/null +++ b/unit_tests/Particle/ParticleUpdateNonuniform.cpp @@ -0,0 +1,937 @@ +// +// Unit test: Particle spatial layout update with non-uniform (ORB) decompositions +// +// These tests exercise particle migration after OrthogonalRecursiveBisection +// has repartitioned the domain. The ORB algorithm can produce highly asymmetric +// layouts where a single rank may span many more grid cells than its neighbour, +// so the usual assumption "every rank has the same width" does not hold. +// +// Minimum rank requirement: all tests that exercise multi-rank migration are +// guarded by GTEST_SKIP() when fewer than MIN_RANKS MPI processes are available. +// The recommended launch is mpirun -n 4 (or 8 for the higher-coverage tests). +// +// Test inventory +// ────────────── +// 1. ORB layout created with uniform density → baseline count conservation +// 2. ORB layout created with a Gaussian density bump → asymmetric widths +// 3. Particles displaced uniformly after ORB → all migrate to correct owner +// 4. Particles displaced toward the heavy region → burst migration into narrow rank +// 5. Repeated ORB repartition between steps → counts conserved at each step +// 6. All particles on rank 0, ORB applied, then update → full redistribution +// 7. Charge conservation through ORB + update cycle +// 8. Tags (IDs) survive ORB + update +// 9. Zero-particle ranks after ORB do not deadlock +// 10. Large-count stress test with ORB decomposition +// 11. Periodic wrap across ORB boundaries +// 12. Multiple ORB repartitions with particle injection between steps +// 13. ORB with a step-function density → one rank gets 1 cell, another N-1 +// 14. Successive displacements across ORB boundaries +// 15. ORB repartition in 3-D with diagonal displacement +// +#include "Ippl.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "Decomposition/OrthogonalRecursiveBisection.h" +#include "TestUtils.h" +#include "gtest/gtest.h" + +// ============================================================ +// Minimum ranks required for multi-rank migration tests +// ============================================================ +static constexpr int MIN_RANKS = 2; // absolute minimum +static constexpr int PREF_RANKS = 4; // preferred for asymmetry tests + +// ============================================================ +// Particle bunch +// ============================================================ +template +struct OrbBunch + : public ippl::ParticleBase< + ippl::ParticleSpatialLayout, ExecSpace>> { + using playout_type = + ippl::ParticleSpatialLayout, ExecSpace>; + + explicit OrbBunch(playout_type& pl) + : ippl::ParticleBase(pl) { + this->addAttribute(Q); + this->addAttribute(tag); + } + + ippl::ParticleAttrib Q; // per-particle charge + ippl::ParticleAttrib tag; // unique ID +}; + +// ============================================================ +// Fixture +// ============================================================ +template +class TestParticleUpdateORB; + +template +class TestParticleUpdateORB>> : public ::testing::Test { +public: + using T = T_; + static constexpr unsigned Dim = Dim_; + // ---- type aliases -------------------------------------------------- + using flayout_type = ippl::FieldLayout; + using mesh_type = ippl::UniformCartesian; + using playout_type = ippl::ParticleSpatialLayout; + using RegionLayout_t = typename playout_type::RegionLayout_t; + using bunch_type = OrbBunch; + using position_type = ippl::Vector; + using Mesh_t = ippl::UniformCartesian; + using Centering_t = Mesh_t::DefaultCentering; + + // Density field type used by ORB + using field_type = ippl::Field; + using orb_type = ippl::OrthogonalRecursiveBisection; + + // ---- construction -------------------------------------------------- + TestParticleUpdateORB() + : nPoints(getGridSizes()) { + for (unsigned d = 0; d < Dim; d++) + domain[d] = static_cast(nPoints[d]) / 16.0; + + std::array args; + for (unsigned d = 0; d < Dim; d++) + args[d] = ippl::Index(nPoints[d]); + gDomain = std::make_from_tuple>(args); + + ippl::Vector hx, origin; + std::array isParallel; + isParallel.fill(true); + for (unsigned d = 0; d < Dim; d++) { + hx[d] = domain[d] / nPoints[d]; + origin[d] = 0; + } + + layout = std::make_shared(MPI_COMM_WORLD, gDomain, isParallel); + mesh = std::make_shared(gDomain, hx, origin); + playout_ptr = std::make_shared(*layout, *mesh); + } + + // ---- ORB helpers --------------------------------------------------- + + /// Build a uniform-density field (all weights = 1) and run ORB. + bool orbUniform() { + field_type rho(*mesh, *layout); + rho = T(1); + orb_type orb; + orb.initialize(*layout, *mesh, rho); + // binaryRepartition needs particle positions; pass empty attrib + ippl::ParticleAttrib emptyR; + return orb.binaryRepartition(emptyR, *layout, /*isFirstRepartition=*/true); + } + + /// Build a Gaussian density bump centred in the domain and run ORB. + /// The heavier region forces ORB to cut near the centre, creating + /// asymmetric sub-domains along each axis. + bool orbGaussian(T sigma = T(0.15)) { + field_type rho(*mesh, *layout); + + // Fill the density field on-device: Gaussian centred at 0.5 + auto view = rho.getView(); + auto hx_vec = mesh->getMeshSpacing(); + auto orig = mesh->getOrigin(); + + // Kokkos::parallel_for("GaussianRho", ippl::createRangePolicy(rho.getOwned()), + // KOKKOS_LAMBDA(/* index args */){ + // // Generic lambda for any Dim via index_array_type + // }); + + // Simpler: fill via host mirror + auto rho_host = rho.getHostMirror(); + auto lDomH = layout->getLocalNDIndex(); + for (int gz = (Dim > 2 ? lDomH[2].first() : 0); gz <= (Dim > 2 ? lDomH[2].last() : 0); + ++gz) { + for (int gy = (Dim > 1 ? lDomH[1].first() : 0); gy <= (Dim > 1 ? lDomH[1].last() : 0); + ++gy) { + for (int gx = lDomH[0].first(); gx <= lDomH[0].last(); ++gx) { + T val = T(1); + // Compute normalised coordinate and Gaussian weight + auto computeCoord = [&](int idx, unsigned d) -> T { + return (static_cast(idx) + T(0.5)) / static_cast(nPoints[d]); + }; + T x = computeCoord(gx, 0) - T(0.5); + T r2 = x * x; + if constexpr (Dim > 1) { + T y = computeCoord(gy, 1) - T(0.5); + r2 += y * y; + } + if constexpr (Dim > 2) { + T z = computeCoord(gz, 2) - T(0.5); + r2 += z * z; + } + val = std::exp(-r2 / (T(2) * sigma * sigma)); + + // Map global index to local+ghost index + int lx = gx - lDomH[0].first() + rho.getNghost(); + if constexpr (Dim == 1) + rho_host(lx) = val; + else if constexpr (Dim == 2) { + int ly = gy - lDomH[1].first() + rho.getNghost(); + rho_host(lx, ly) = val; + } else { + int ly = gy - lDomH[1].first() + rho.getNghost(); + int lz = gz - lDomH[2].first() + rho.getNghost(); + rho_host(lx, ly, lz) = val; + } + } + } + } + Kokkos::deep_copy(rho.getView(), rho_host); + + orb_type orb; + orb.initialize(*layout, *mesh, rho); + ippl::ParticleAttrib emptyR; + return orb.binaryRepartition(emptyR, *layout, /*isFirstRepartition=*/true); + } + + /// Build a step-function density: cells in [0, nPoints[0]/4) get weight 100, + /// the rest get weight 1. This forces one rank to be very narrow (few cells) + /// while the others are wide. + bool orbStepFunction() { + field_type rho(*mesh, *layout); + auto rho_host = rho.getHostMirror(); + auto lDomH = layout->getLocalNDIndex(); + int cutGlobal = static_cast(nPoints[0]) / 4; + + for (int gz = (Dim > 2 ? lDomH[2].first() : 0); gz <= (Dim > 2 ? lDomH[2].last() : 0); + ++gz) { + for (int gy = (Dim > 1 ? lDomH[1].first() : 0); gy <= (Dim > 1 ? lDomH[1].last() : 0); + ++gy) { + for (int gx = lDomH[0].first(); gx <= lDomH[0].last(); ++gx) { + T val = (gx < cutGlobal) ? T(100) : T(1); + int lx = gx - lDomH[0].first() + rho.getNghost(); + if constexpr (Dim == 1) + rho_host(lx) = val; + else if constexpr (Dim == 2) { + int ly = gy - lDomH[1].first() + rho.getNghost(); + rho_host(lx, ly) = val; + } else { + int ly = gy - lDomH[1].first() + rho.getNghost(); + int lz = gz - lDomH[2].first() + rho.getNghost(); + rho_host(lx, ly, lz) = val; + } + } + } + } + Kokkos::deep_copy(rho.getView(), rho_host); + + orb_type orb; + orb.initialize(*layout, *mesh, rho); + ippl::ParticleAttrib emptyR; + return orb.binaryRepartition(emptyR, *layout, /*isFirstRepartition=*/true); + } + + /// Rebuild the playout after the layout has been repartitioned. + void rebuildPlayout() { playout_ptr = std::make_shared(*layout, *mesh); } + + // ---- bunch helpers ------------------------------------------------- + + std::shared_ptr makeBunch() { + auto b = std::make_shared(*playout_ptr); + typename bunch_type::bc_container_type bcs; + bcs.fill(ippl::BC::PERIODIC); + b->setParticleBC(bcs); + return b; + } + + void fillRandom(bunch_type& b, unsigned n, unsigned long seed = 42) { + int nRanks = ippl::Comm->size(); + unsigned per = std::max(1u, n / nRanks); + b.create(per); + + std::mt19937_64 eng(seed + ippl::Comm->rank()); + std::uniform_real_distribution unif(T(0), T(1)); + + auto R_host = b.R.getHostMirror(); + auto Q_host = b.Q.getHostMirror(); + auto t_host = b.tag.getHostMirror(); + + for (size_t i = 0; i < b.getLocalNum(); ++i) { + position_type r; + for (unsigned d = 0; d < Dim; d++) + r[d] = unif(eng) * domain[d]; + R_host(i) = r; + Q_host(i) = T(1); + t_host(i) = + static_cast(ippl::Comm->rank()) * 10000000LL + static_cast(i); + } + Kokkos::deep_copy(b.R.getView(), R_host); + Kokkos::deep_copy(b.Q.getView(), Q_host); + Kokkos::deep_copy(b.tag.getView(), t_host); + } + + // ---- assertion helpers --------------------------------------------- + + size_t totalParticles(const bunch_type& b) const { + size_t local = b.getLocalNum(), total = 0; + ippl::Comm->reduce(local, total, 1, std::plus()); + return total; + } + + T totalCharge(bunch_type& b) const { + auto Q_host = b.Q.getHostMirror(); + Kokkos::deep_copy(Q_host, b.Q.getView()); + T local = T(0); + for (size_t i = 0; i < b.getLocalNum(); ++i) + local += Q_host(i); + T total = T(0); + ippl::Comm->reduce(local, total, 1, std::plus()); + return total; + } + + /// Count locally owned particles that lie outside the owning rank's region. + size_t countMisplaced(bunch_type& b) { + RegionLayout_t rl = playout_ptr->getRegionLayout(); + auto regions = rl.getdLocalRegions(); + auto regions_host = Kokkos::create_mirror_view(regions); + Kokkos::deep_copy(regions_host, regions); + int myRank = ippl::Comm->rank(); + + auto R_host = b.R.getHostMirror(); + Kokkos::deep_copy(R_host, b.R.getView()); + + size_t bad = 0; + for (size_t i = 0; i < b.getLocalNum(); ++i) { + bool ok = true; + for (unsigned d = 0; d < Dim; d++) { + T p = R_host(i)[d]; + ok &= (p >= regions_host(myRank)[d].min() && p <= regions_host(myRank)[d].max()); + } + if (!ok) + ++bad; + } + return bad; + } + + T periodicWrap(T x, T L) const { + while (x < T(0)) + x += L; + while (x >= L) + x -= L; + return x; + } + + /// Verify that the widths of local domains differ between ranks (i.e. ORB + /// actually produced a non-uniform decomposition along at least one axis). + /// Returns true if at least one pair of ranks has different widths. + bool isNonUniform() const { + auto lDom = layout->getLocalNDIndex(); + int myWidth = lDom[0].length(); + int maxWidth = 0, minWidth = 0; + ippl::Comm->allreduce(myWidth, maxWidth, 1, std::greater{}); + ippl::Comm->allreduce(myWidth, minWidth, 1, std::less{}); + return (maxWidth != minWidth); + } + + // ---- data members -------------------------------------------------- + std::array nPoints; + std::array domain; + ippl::NDIndex gDomain; + + std::shared_ptr layout; + std::shared_ptr mesh; + std::shared_ptr playout_ptr; +}; + +// Run over 1-D, 2-D, 3-D with default scalar / exec space +using Tests = TestParams::tests<1, 2, 3>; +TYPED_TEST_SUITE(TestParticleUpdateORB, Tests); + +// ============================================================ +// Helper macro: skip test if rank count is below threshold +// ============================================================ +#define REQUIRE_RANKS(n) \ + do { \ + if (ippl::Comm->size() < (n)) { \ + GTEST_SKIP() << "Test requires at least " << (n) << " ranks"; \ + } \ + } while (false) + +// ============================================================ +// 1. ORB with uniform density – baseline conservation +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, UniformOrbBaselineConservation) { + REQUIRE_RANKS(MIN_RANKS); + + bool ok = this->orbUniform(); + if (!ok) + GTEST_SKIP() << "ORB returned planar decomposition – skipping"; + + this->rebuildPlayout(); + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 2. ORB with Gaussian density – verify non-uniform decomposition +// and that particles still migrate correctly +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, GaussianOrbCreatesNonUniformLayout) { + REQUIRE_RANKS(PREF_RANKS); + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP() << "ORB returned planar decomposition – skipping"; + + // With 4+ ranks and a Gaussian bump the domains should be unequal + if (ippl::Comm->size() >= PREF_RANKS) { + EXPECT_TRUE(this->isNonUniform()) << "Expected ORB to produce a non-uniform decomposition"; + } + + this->rebuildPlayout(); + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 3. Uniform displacement after ORB – all particles cross at least +// one ORB boundary and reach the correct owner +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, UniformDisplacementAfterOrb) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP(); + + this->rebuildPlayout(); + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + // Shift by 30 % of domain along every axis + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T np = R_host(i)[d] + T(0.3) * this->domain[d]; + R_host(i)[d] = this->periodicWrap(np, this->domain[d]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 4. Burst migration into the narrow rank after step-function ORB +// One rank is very narrow; we push all particles toward it. +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, BurstMigrationIntoNarrowRankAfterStepOrb) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbStepFunction(); + if (!ok) + GTEST_SKIP(); + + this->rebuildPlayout(); + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + // Move every particle into the first quarter of the domain (the narrow rank) + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + std::mt19937_64 eng(77 + ippl::Comm->rank()); + std::uniform_real_distribution unif(T(0), T(0.25)); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + R_host(i)[0] = unif(eng) * this->domain[0]; + for (unsigned d = 1; d < TestFixture::Dim; d++) + R_host(i)[d] = T(0.5) * this->domain[d]; // safe middle + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 5. Repeated ORB repartition between particle steps +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, RepeatedOrbRepartitionConservesParticles) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + // Perform 3 ORB repartitions, each time with a different density, + // interleaved with particle displacements. + constexpr unsigned N = 128; + + bool firstOk = this->orbUniform(); + if (!firstOk) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, N); + const size_t total = this->totalParticles(*bunch); + + // Lambda: displace by fraction f and call update + auto displace = [&](T f) { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T np = R_host(i)[d] + f * this->domain[d]; + R_host(i)[d] = this->periodicWrap(np, this->domain[d]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + bunch->update(); + }; + + displace(T(0.15)); + EXPECT_EQ(total, this->totalParticles(*bunch)); + + // Second ORB (Gaussian) + bool ok2 = this->orbGaussian(); + if (ok2) { + this->rebuildPlayout(); + // Re-attach the playout to the existing bunch + bunch = this->makeBunch(); + this->fillRandom(*bunch, N); + displace(T(0.25)); + EXPECT_EQ(total, this->totalParticles(*bunch)); + } + + // Third ORB (step-function) + bool ok3 = this->orbStepFunction(); + if (ok3) { + this->rebuildPlayout(); + bunch = this->makeBunch(); + this->fillRandom(*bunch, N); + displace(T(0.1)); + EXPECT_EQ(total, this->totalParticles(*bunch)); + } +} + +// ============================================================ +// 6. All particles on rank 0, ORB applied, then update +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, AllParticlesOnRank0AfterOrb) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + unsigned N = ippl::Comm->rank() == 0 ? 256 : 0; + auto bunch = this->makeBunch(); + bunch->create(N); + + if (ippl::Comm->rank() == 0) { + std::mt19937_64 eng(12); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (unsigned i = 0; i < N; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + ASSERT_EQ(before, static_cast(N)); + + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 7. Charge conservation through ORB + update cycle +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, ChargeConservedAfterOrbAndUpdate) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + // Assign unique charges + { + auto Q_host = bunch->Q.getHostMirror(); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + Q_host(i) = static_cast(ippl::Comm->rank() * 100000 + i + 1); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const T chargeBefore = this->totalCharge(*bunch); + + // Displace and update + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T np = R_host(i)[d] + T(0.35) * this->domain[d]; + R_host(i)[d] = this->periodicWrap(np, this->domain[d]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + bunch->update(); + + const T chargeAfter = this->totalCharge(*bunch); + EXPECT_NEAR(static_cast(chargeBefore), static_cast(chargeAfter), + 1e-6 * std::abs(static_cast(chargeBefore))); +} + +// ============================================================ +// 8. Particle tags survive ORB + update +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, TagsPreservedAfterOrbAndUpdate) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbUniform(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + + // Record global original count of unique tags + size_t origLocal = bunch->getLocalNum(); + size_t origGlobal = 0; + ippl::Comm->reduce(origLocal, origGlobal, 1, std::plus()); + + // Displace + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T np = R_host(i)[d] + T(0.45) * this->domain[d]; + R_host(i)[d] = this->periodicWrap(np, this->domain[d]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + bunch->update(); + + size_t afterLocal = bunch->getLocalNum(); + size_t afterGlobal = 0; + ippl::Comm->reduce(afterLocal, afterGlobal, 1, std::plus()); + + EXPECT_EQ(origGlobal, afterGlobal); +} + +// ============================================================ +// 9. Zero-particle ranks after ORB do not deadlock +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, ZeroParticleRanksAfterOrbDoNotDeadlock) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbStepFunction(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + // Only rank 0 creates particles + auto bunch = this->makeBunch(); + unsigned N = ippl::Comm->rank() == 0 ? 64 : 0; + bunch->create(N); + if (ippl::Comm->rank() == 0) { + std::mt19937_64 eng(55); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (unsigned i = 0; i < N; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + ASSERT_NO_THROW(bunch->update()); + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 10. Large-count stress test with Gaussian ORB decomposition +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, LargeParticleCountGaussianOrb) { + REQUIRE_RANKS(MIN_RANKS); + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 4096, /*seed=*/98765); + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 11. Periodic wrap across ORB boundaries +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, PeriodicWrapAcrossOrbBoundaries) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + + // Each rank places particles just beyond the global domain's upper edge + // along axis 0 so they wrap back to near 0. + bunch->create(4); + { + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + std::mt19937_64 eng(33 + ippl::Comm->rank()); + std::uniform_real_distribution tiny(T(1e-5), T(1e-3)); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + R_host(i)[0] = this->domain[0] + tiny(eng) * this->domain[0]; + for (unsigned d = 1; d < TestFixture::Dim; d++) + R_host(i)[d] = T(0.5) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 12. Particle injection between ORB repartitions +// Simulates a typical PIC workflow: inject → ORB → move → update → repeat +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, ParticleInjectionBetweenOrbRepartitions) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + constexpr unsigned injectPerCycle = 32; + constexpr int cycles = 3; + + for (int c = 0; c < cycles; ++c) { + // Repartition with Gaussian for odd cycles, uniform for even + bool ok = (c % 2 == 0) ? this->orbUniform() : this->orbGaussian(); + if (!ok) + continue; + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + + // Inject on rank 0 + unsigned N = ippl::Comm->rank() == 0 ? injectPerCycle : 0; + bunch->create(N); + if (ippl::Comm->rank() == 0) { + std::mt19937_64 eng(c * 1000); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (unsigned i = 0; i < injectPerCycle; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + bunch->update(); + + auto total_particles = this->totalParticles(*bunch); + if (ippl::Comm->rank() == 0) { + EXPECT_EQ(static_cast(injectPerCycle), total_particles) << "at cycle " << c; + } + EXPECT_EQ(0u, this->countMisplaced(*bunch)) << "at cycle " << c; + } +} + +// ============================================================ +// 13. Step-function ORB: narrow rank correctly receives migrating particles +// and does not receive more than the domain capacity +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, NarrowRankReceivesCorrectlyAfterStepOrb) { + REQUIRE_RANKS(PREF_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbStepFunction(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 512); + + // Move all particles into the heavy region (second three-quarters) + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + std::mt19937_64 eng(808 + ippl::Comm->rank()); + std::uniform_real_distribution heavyX(T(0.25), T(1.0)); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + R_host(i)[0] = heavyX(eng) * this->domain[0]; + for (unsigned d = 1; d < TestFixture::Dim; d++) + R_host(i)[d] = T(0.5) * this->domain[d]; + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 14. Successive displacements across ORB boundaries +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, SuccessiveDisplacementsAcrossOrbBoundaries) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + const size_t N = this->totalParticles(*bunch); + + std::mt19937_64 eng(2025 + ippl::Comm->rank()); + std::uniform_real_distribution step(-T(0.2), T(0.2)); + + for (int s = 0; s < 6; ++s) { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T np = R_host(i)[d] + step(eng) * this->domain[d]; + R_host(i)[d] = this->periodicWrap(np, this->domain[d]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + + bunch->update(); + EXPECT_EQ(N, this->totalParticles(*bunch)) << "at step " << s; + EXPECT_EQ(0u, this->countMisplaced(*bunch)) << "at step " << s; + } +} + +// ============================================================ +// 15. 3-D corner migration with ORB decomposition +// (only runs when Dim == 3) +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, ThreeDCornerMigrationAfterOrb) { + if constexpr (TestFixture::Dim != 3) { + GTEST_SKIP() << "3-D specific test"; + } + REQUIRE_RANKS(PREF_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbGaussian(T(0.2)); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + + // Seed all particles near the origin + bunch->create(64); + { + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + for (unsigned d = 0; d < 3; d++) + R_host(i)[d] = T(0.02) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); // settle near origin + + // Jump diagonally to the opposite corner (wraps periodically) + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + for (unsigned d = 0; d < 3; d++) { + T np = R_host(i)[d] + T(0.98) * this->domain[d]; + R_host(i)[d] = this->periodicWrap(np, this->domain[d]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + bunch->update(); + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// Entry point +// ============================================================ +int main(int argc, char* argv[]) { + int success = 1; + ippl::initialize(argc, argv); + { + ::testing::InitGoogleTest(&argc, argv); + success = RUN_ALL_TESTS(); + } + ippl::finalize(); + return success; +} From 97828eda6a5cd8c29b8c80ad3f11a501ef8a3b05 Mon Sep 17 00:00:00 2001 From: Fischill Paul Date: Mon, 1 Jun 2026 10:56:04 +0200 Subject: [PATCH 02/14] Fix HIP archive warnings and particle view bindings Consume CUDA/HIP runtime return values in Archive so HIP nodiscard annotations do not trigger warnings. Update particle benchmark/test callers to store ParticleAttrib::getView() by value now that it returns a live subview instead of a stable lvalue reference. --- src/Communicate/Archive.hpp | 20 ++++++++++---------- test/particle/PICnd.cpp | 2 +- test/particle/benchmarkParticleUpdate.cpp | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/Communicate/Archive.hpp b/src/Communicate/Archive.hpp index 19e7b2423..bfba0815c 100644 --- a/src/Communicate/Archive.hpp +++ b/src/Communicate/Archive.hpp @@ -69,9 +69,9 @@ namespace ippl { #endif void* ptr = nullptr; #if defined(KOKKOS_ENABLE_CUDA) - cudaMalloc(&ptr, size); + (void)cudaMalloc(&ptr, size); #else - hipMalloc(&ptr, size); + (void)hipMalloc(&ptr, size); #endif buffer_ptr_m = static_cast(ptr); buffer_size_m = size; @@ -81,9 +81,9 @@ namespace ippl { void Archive::gpuFree() { if (buffer_ptr_m) { #if defined(KOKKOS_ENABLE_CUDA) - cudaFree(buffer_ptr_m); + (void)cudaFree(buffer_ptr_m); #else - hipFree(buffer_ptr_m); + (void)hipFree(buffer_ptr_m); #endif buffer_ptr_m = nullptr; buffer_size_m = 0; @@ -113,19 +113,19 @@ namespace ippl { pointer_type new_ptr = nullptr; void* vptr = nullptr; #if defined(KOKKOS_ENABLE_CUDA) - cudaMalloc(&vptr, size); + (void)cudaMalloc(&vptr, size); #else - hipMalloc(&vptr, size); + (void)hipMalloc(&vptr, size); #endif new_ptr = static_cast(vptr); if (buffer_ptr_m && buffer_size_m > 0) { #if defined(KOKKOS_ENABLE_CUDA) - cudaMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, cudaMemcpyDeviceToDevice); - cudaFree(buffer_ptr_m); + (void)cudaMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, cudaMemcpyDeviceToDevice); + (void)cudaFree(buffer_ptr_m); #else - hipMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, hipMemcpyDeviceToDevice); - hipFree(buffer_ptr_m); + (void)hipMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, hipMemcpyDeviceToDevice); + (void)hipFree(buffer_ptr_m); #endif } diff --git a/test/particle/PICnd.cpp b/test/particle/PICnd.cpp index c3e6e93ed..03a9bc369 100644 --- a/test/particle/PICnd.cpp +++ b/test/particle/PICnd.cpp @@ -265,7 +265,7 @@ class ChargedParticles : public ippl::ParticleBase { } void dumpData(int iteration) { - ParticleAttrib::view_type& view = P.getView(); + auto view = P.getView(); double Energy = 0.0; diff --git a/test/particle/benchmarkParticleUpdate.cpp b/test/particle/benchmarkParticleUpdate.cpp index 857e637c1..c4d57700d 100644 --- a/test/particle/benchmarkParticleUpdate.cpp +++ b/test/particle/benchmarkParticleUpdate.cpp @@ -94,7 +94,7 @@ class ChargedParticles : public ippl::ParticleBase { void dumpData(int iteration) { double Energy = 0.0; - ParticleAttrib::view_type& view = P.getView(); + auto view = P.getView(); Inform csvout(NULL, "data/energy.csv", Inform::APPEND); csvout.precision(10); csvout.setf(std::ios::scientific, std::ios::floatfield); From 855b2ef49b2eae403106c9d35084c823c1739a78 Mon Sep 17 00:00:00 2001 From: Fischill Paul Date: Mon, 1 Jun 2026 16:50:38 +0200 Subject: [PATCH 03/14] Fix ALPINE particle view handle lifetimes --- alpine/BumponTailInstabilityManager.h | 10 +++++----- alpine/LandauDampingManager.h | 8 ++++---- alpine/PenningTrapManager.h | 8 ++++---- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/alpine/BumponTailInstabilityManager.h b/alpine/BumponTailInstabilityManager.h index 7525f9af1..5e03a0712 100644 --- a/alpine/BumponTailInstabilityManager.h +++ b/alpine/BumponTailInstabilityManager.h @@ -274,10 +274,10 @@ class BumponTailInstabilityManager : public AlpineManager { this->pcontainer_m->create(nlocal); - view_type* R = &(this->pcontainer_m->R.getView()); - samplingR.generate(*R, rand_pool64); + view_type R = this->pcontainer_m->R.getView(); + samplingR.generate(R, rand_pool64); - view_type* P = &(this->pcontainer_m->P.getView()); + view_type P = this->pcontainer_m->P.getView(); double mu[Dim]; double sd[Dim]; @@ -288,12 +288,12 @@ class BumponTailInstabilityManager : public AlpineManager { // sample first nlocBulk with muBulk as mean velocity mu[Dim - 1] = muBulk_m; Kokkos::parallel_for(Kokkos::RangePolicy(0, nlocBulk), - ippl::random::randn(*P, rand_pool64, mu, sd)); + ippl::random::randn(P, rand_pool64, mu, sd)); // sample remaining with muBeam as mean velocity mu[Dim - 1] = muBeam_m; Kokkos::parallel_for(Kokkos::RangePolicy(nlocBulk, nlocal), - ippl::random::randn(*P, rand_pool64, mu, sd)); + ippl::random::randn(P, rand_pool64, mu, sd)); Kokkos::fence(); ippl::Comm->barrier(); diff --git a/alpine/LandauDampingManager.h b/alpine/LandauDampingManager.h index b8491c7a5..23dcc632a 100644 --- a/alpine/LandauDampingManager.h +++ b/alpine/LandauDampingManager.h @@ -226,10 +226,10 @@ class LandauDampingManager : public AlpineManager { this->pcontainer_m->create(nlocal); - view_type* R = &(this->pcontainer_m->R.getView()); - samplingR.generate(*R, rand_pool64); + view_type R = this->pcontainer_m->R.getView(); + samplingR.generate(R, rand_pool64); - view_type* P = &(this->pcontainer_m->P.getView()); + view_type P = this->pcontainer_m->P.getView(); double mu[Dim]; double sd[Dim]; @@ -237,7 +237,7 @@ class LandauDampingManager : public AlpineManager { mu[i] = 0.0; sd[i] = 1.0; } - Kokkos::parallel_for(nlocal, ippl::random::randn(*P, rand_pool64, mu, sd)); + Kokkos::parallel_for(nlocal, ippl::random::randn(P, rand_pool64, mu, sd)); Kokkos::fence(); ippl::Comm->barrier(); diff --git a/alpine/PenningTrapManager.h b/alpine/PenningTrapManager.h index b721f3ec8..c7e81de43 100644 --- a/alpine/PenningTrapManager.h +++ b/alpine/PenningTrapManager.h @@ -202,14 +202,14 @@ class PenningTrapManager : public AlpineManager { this->pcontainer_m->create(nlocal); - view_type* R = &(this->pcontainer_m->R.getView()); - samplingR.generate(*R, rand_pool64); + view_type R = this->pcontainer_m->R.getView(); + samplingR.generate(R, rand_pool64); - view_type* P = &(this->pcontainer_m->P.getView()); + view_type P = this->pcontainer_m->P.getView(); double muP[Dim] = {0.0, 0.0, 0.0}; double sdP[Dim] = {1.0, 1.0, 1.0}; - Kokkos::parallel_for(nlocal, ippl::random::randn(*P, rand_pool64, muP, sdP)); + Kokkos::parallel_for(nlocal, ippl::random::randn(P, rand_pool64, muP, sdP)); Kokkos::fence(); ippl::Comm->barrier(); From 816e1a53eabb43cf5ec08c3668b34a14ed7475fa Mon Sep 17 00:00:00 2001 From: Andreas Adelmann Date: Tue, 2 Jun 2026 10:52:47 +0200 Subject: [PATCH 04/14] Add particle update diagnostic timers --- PR501_SPLIT_MAP.md | 177 +++++++++---------------- src/Particle/ParticleSpatialLayout.hpp | 20 ++- 2 files changed, 85 insertions(+), 112 deletions(-) diff --git a/PR501_SPLIT_MAP.md b/PR501_SPLIT_MAP.md index 90dd59c21..939ccec16 100644 --- a/PR501_SPLIT_MAP.md +++ b/PR501_SPLIT_MAP.md @@ -280,137 +280,92 @@ Risk: - PIF examples currently depend on the full stack: particle update, scatter/gather, FFT transform refactor, NUFFT. -## Minimal Split Alternative +# LUMI Results -If six PRs is too much process overhead, a practical minimum is: +| Benchmark | Problem size | Nodes | Ranks | master | pr501-pcg | pr501-com | pr501-fft | pr501-hosg | pr501-nufft | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| FEM | `513_10` | 8 | 64 | 28.23 | 27.77 (-2%) | 27.71 (-2%) | 27.88 (-1%) | 28.23 (0%) | 27.95 (-1%) | +| FFT | `512_10` | 4 | 32 | 4.39 | 4.41 (0%) | 11.54 (+163%) | 11.53 (+162%) | 11.59 (+164%) | 11.53 (+162%) | +| FFT | `512_10` | 16 | 128 | 1.65 | 1.65 (0%) | 2.88 (+74%) | 2.88 (+75%) | 2.93 (+78%) | 2.93 (+78%) | +| PCG | `512_10` | 1 | 8 | 72.31 | 70.01 (-3%) | | 76.99 (+6%) | | 76.75 (+6%) | +| PCG | `512_10` | 4 | 32 | 34.60 | 32.93 (-5%) | 43.71 (+26%) | 33.62 (-3%) | 36.32 (+5%) | 33.55 (-3%) | +| PCG | `512_10` | 64 | 512 | 25.00 | 23.85 (-5%) | 22.89 (-8%) | 22.40 (-10%) | 22.74 (-9%) | 22.58 (-10%) | -1. PCG allocation fix. -2. Particle communication/update infrastructure. -3. Scatter/gather + FFT backend + NUFFT core. -4. Alpine PIF examples and benchmarks. -This is less clean, but still much better than one 23k-line PR. +## Update Timer Investigation -## Next Mapping Step +Observed issue: -Prototype extraction order: +- On LUMI, the `update` / `updateParticle` timer can increase strongly when moving from `pr501-pcg` to `pr501-communication-particle-update` / `pr501-com`. +- A first code inspection did not find an FFT solver change in this branch step: `pr501-pcg..pr501-communication-particle-update` has no changes under `src/FFT` or `src/PoissonSolvers`. +- The branch step does change particle migration, communication archives, field layout / halo `nghost` plumbing, and timing infrastructure. -1. Create a branch from `origin/master` for PCG. -2. Cherry-pick `40438d17` and `be5f794c`. -3. Build and run solver tests. -4. If clean, this becomes the first small PR. +Most likely immediate explanation: -Then try the particle/communication split because it is probably the most valuable non-PIF improvement and has direct tests: +- In the communication branch, `ParticleSpatialLayout::update()` posts sends and receives, then performs `MPI_Waitall` plus deferred receive finalization/deserialization inside the outer `updateParticle` timer, but outside the child timers `particleSend`, `particleRecv`, and `sendPreprocess`. +- This can make `updateParticle` look much larger while the child timers look artificially small. +- On the LUMI timing data this pattern is visible: for `PCG_strong_scaling/512_10/nodes_64`, `locateParticles`, `particleSend`, and `particleRecv` are small in `pr501-com`, while the unexplained part of `updateParticle` is large. -- `unit_tests/Particle/ParticleUpdate.cpp` -- `unit_tests/Particle/ParticleUpdateNonuniform.cpp` -- existing `ParticleSendRecv` -- existing `GatherScatterTest` +Secondary suspect: -## Progress +- `Archive` contiguous scalar serialization/deserialization changed from `Kokkos::deep_copy` on unmanaged byte views to custom byte-copy kernels plus fences. +- On GPU, especially with GPU-aware MPI, that may be slower or introduce extra synchronization compared with the previous optimized device copy path. +- This would affect particle migration directly, because `ParticleBase::sendToRank()` serializes every active particle attribute before `MPI_Isend`, and receive finalizers deserialize after the wait. -### 2026-05-31: PR 1 Prototype - PCG Allocation / Solver Performance +Local Mac OpenMP check: -Status: extracted cleanly. +- Built clean comparison worktrees for: + - `pr501-pcg` + - `pr501-communication-particle-update` +- Configuration: + - `IPPL_PLATFORMS=OPENMP` + - `IPPL_ENABLE_ALPINE=ON` + - `IPPL_ENABLE_UNIT_TESTS=ON` + - `IPPL_ENABLE_FFT=ON` + - Kokkos `5.0.0` + - Homebrew LLVM `clang++` and `libomp` +- AppleClang did not find OpenMP automatically; explicit `libomp` include/library paths were required. -Worktree: +Mac OpenMP miniapp runs: -```text -/private/tmp/ippl-pr501-pcg -``` - -Branch: - -```text -pr501-pcg-split -``` - -Base: - -```text -origin/master @ da43fd66a18848b65dcd82af90001f5b64a798ab -``` - -Cherry-picked commits: - -- `40438d17` -> `a76097f0` (`PCG/Preconditioner: hoist per-iteration allocations out of solve loop`) -- `be5f794c` -> `092a04e5` (`PCG: pass Field by reference through OperatorF`) - -Resulting diff: - -- `src/LinearSolvers/PCG.h` -- `src/LinearSolvers/Preconditioner.h` -- `src/PoissonSolvers/PoissonCG.h` -- 3 files changed, 388 insertions, 250 deletions - -Validation: +```bash +mpiexec -x OMP_NUM_THREADS=4 -x OMP_PROC_BIND=false -n 2 \ + ./LandauDamping 32 32 32 20000 5 FFT 0.01 LeapFrog --overallocate 2.0 --info 5 -```text -cmake -S . -B build-pcg-debug -DCMAKE_BUILD_TYPE=Debug -DIPPL_ENABLE_UNIT_TESTS=ON -DIPPL_ENABLE_FFT=ON -DIPPL_DEFAULT_TEST_PROCS=1 -cmake --build build-pcg-debug -ctest --test-dir build-pcg-debug --output-on-failure -O build-pcg-debug/ctest-rank1.log +mpiexec -x OMP_NUM_THREADS=2 -x OMP_PROC_BIND=false -n 4 \ + ./LandauDamping 32 32 32 20000 5 FFT 0.01 LeapFrog --overallocate 2.0 --info 5 ``` -Result: +Mac OpenMP wall-max timings: -```text -100% tests passed, 0 tests failed out of 34 -Total Test time (real) = 54.17 sec -``` - -Assessment: - -- This is a good standalone first PR. -- It has a narrow file set and clear motivation: remove repeated allocation work from the PCG/preconditioner loop. -- Recommended next validation before opening: run the relevant solver integration tests or CSCS GPU tests that originally showed PCG allocation/slowdown symptoms. - -### 2026-05-31: PR 2 Prototype - Communication And Particle Update Infrastructure - -Status: extracted on top of `pr501-pcg`. - -Branch: - -```text -pr501-communication-particle-update -``` +| Ranks | Branch | `update` | `updateParticle` | `locateParticles` | `sendPreprocess` | `particleSend` | `particleRecv` | +| ---: | --- | ---: | ---: | ---: | ---: | ---: | ---: | +| 2 | `pr501-pcg` | 0.0911 | 0.1090 | 0.0906 | 0.0187 | 0.00431 | 0.00451 | +| 2 | `pr501-com` | 0.0250 | 0.0266 | 0.00577 | 0.000863 | 0.00150 | 0.000064 | +| 4 | `pr501-pcg` | 0.1550 | 0.1923 | 0.1303 | 0.0391 | 0.0180 | 0.00923 | +| 4 | `pr501-com` | 0.0495 | 0.0534 | 0.00872 | 0.00677 | 0.00291 | 0.000131 | -Included PR501 areas: +Conclusion so far: -- Communication buffer/archive updates from `9507a796` and `f2820064` -- Particle layout rewrite and sort buffers from `2878b90b` -- Particle update regression tests from the particle portion of `f0560f76` -- Local integration fixes needed to keep this split independent from the later interpolation/FFT/NUFFT PRs +- The Mac OpenMP run does not reproduce the LUMI slowdown. The communication branch is faster locally for this small CPU/OpenMP test. +- That points away from a generic algorithmic slowdown in the particle update rewrite. +- The remaining likely causes are GPU/MPI-specific behavior on LUMI: GPU-aware MPI wait behavior, archive device-buffer serialization/deserialization, or timer attribution around deferred receive finalization. -Deliberately excluded: +Recommended next diagnostic patch: -- Higher-order interpolation/gather/scatter APIs -- FFT backend and NUFFT transform integration -- PIF example/application changes - -Validation: - -```text -cmake -S . -B build-pr501-communication-debug -DCMAKE_BUILD_TYPE=Debug -DIPPL_ENABLE_UNIT_TESTS=ON -DIPPL_ENABLE_FFT=ON -DIPPL_DEFAULT_TEST_PROCS=1 -cmake --build build-pr501-communication-debug --parallel 8 -ctest --test-dir build-pr501-communication-debug --output-on-failure -/opt/homebrew/Cellar/open-mpi/5.0.8/bin/mpiexec -n 2 build-pr501-communication-debug/unit_tests/Particle/ParticleSendRecv -/opt/homebrew/Cellar/open-mpi/5.0.8/bin/mpiexec -n 2 build-pr501-communication-debug/unit_tests/Particle/ParticleUpdate -/opt/homebrew/Cellar/open-mpi/5.0.8/bin/mpiexec -n 2 build-pr501-communication-debug/unit_tests/Particle/ParticleUpdateNonuniform -``` - -Result: - -```text -100% tests passed, 0 tests failed out of 36 -Total Test time (real) = 38.64 sec - -2-rank ParticleSendRecv: passed -2-rank ParticleUpdate: passed -2-rank ParticleUpdateNonuniform: passed -``` +- Add or move timers around: + - `MPI_Waitall` as `particleWait`, or charge it back into `particleSend`. + - deferred receive finalizers as `particleDeserialize`, or charge them into `particleRecv`. + - optionally `Archive::serialize(hash)` and `Archive::deserialize(offset)` if the wait/deserialization split is still unclear. +- Re-run the LUMI cases before changing algorithms, so the slowdown is attributed to wait time, packing/unpacking, or actual communication. -Assessment: +Diagnostic patch applied locally: -- This is a coherent second PR after `pr501-pcg`. -- It is still larger than the PCG split, but it avoids the major algorithmic dependencies from interpolation, FFT, NUFFT, and PIF. -- Recommended next validation before opening: run the same particle tests on CUDA/HIP builds and larger MPI rank counts, especially the original `ParticleSendRecv` failure configurations. +- `ParticleSpatialLayout::update()` now splits the previously hidden tail of `updateParticle` into: + - `particleWait`: combined send/receive `MPI_Waitall` + - `particleFreeBuffers`: communicator buffer release + - `particleDeserialize`: deferred receive finalizers / unpacking into particle attributes +- Expected interpretation on LUMI: + - If `particleWait` dominates, the issue is likely GPU-aware MPI progress / synchronization or a load imbalance exposed by the nonblocking exchange. + - If `particleDeserialize` dominates, inspect `Archive::deserialize(offset)` and the custom byte-copy kernels. + - If neither dominates but `updateParticle` remains high, there is still another untimed section in the particle update path. diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp index c2bc0093c..8b927846e 100644 --- a/src/Particle/ParticleSpatialLayout.hpp +++ b/src/Particle/ParticleSpatialLayout.hpp @@ -257,15 +257,33 @@ namespace ippl { IpplTimings::stopTimer(destroyTimer); + static IpplTimings::TimerRef waitTimer = IpplTimings::getTimer("particleWait"); + IpplTimings::startTimer(waitTimer); + requests.insert(requests.end(), recvRequests.begin(), recvRequests.end()); if (!requests.empty()) { MPI_Waitall(static_cast(requests.size()), requests.data(), MPI_STATUSES_IGNORE); } + + IpplTimings::stopTimer(waitTimer); + + static IpplTimings::TimerRef freeBufferTimer = IpplTimings::getTimer("particleFreeBuffers"); + IpplTimings::startTimer(freeBufferTimer); + Comm->freeAllBuffers(); + IpplTimings::stopTimer(freeBufferTimer); + // 5. Deserialize - for (auto& finalize : finalizers) + static IpplTimings::TimerRef deserializeTimer = + IpplTimings::getTimer("particleDeserialize"); + IpplTimings::startTimer(deserializeTimer); + + for (auto& finalize : finalizers) { finalize(pc.getLocalNum()); + } + + IpplTimings::stopTimer(deserializeTimer); IpplTimings::stopTimer(ParticleUpdateTimer); } From 6741b4ef3f3eb825bf1f3a22b4109d5b6df840fa Mon Sep 17 00:00:00 2001 From: Andreas Adelmann Date: Tue, 2 Jun 2026 11:08:07 +0200 Subject: [PATCH 05/14] Add particle update load diagnostics --- src/Particle/ParticleSpatialLayout.hpp | 55 ++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp index 8b927846e..d9441af25 100644 --- a/src/Particle/ParticleSpatialLayout.hpp +++ b/src/Particle/ParticleSpatialLayout.hpp @@ -23,6 +23,8 @@ #include #include +#include "Ippl.h" + #include "Utility/IpplTimings.h" #include "Utility/ParallelDispatch.h" @@ -128,6 +130,7 @@ namespace ippl { } ensureNeighborsCached(); + const size_type localBefore = pc.getLocalNum(); /* particle MPI exchange: * 1. figure out which particles need to go where -> locateParticles(...) @@ -213,11 +216,15 @@ namespace ippl { IpplTimings::startTimer(recvTimer); std::vector> recvList; + size_type totalRecvs = 0; + size_type maxRecv = 0; if (countExchangeMode_ == CountExchange::RMA) { for (int rank = 0; rank < nRanks_; ++rank) { if (nRecvs_m[rank] > 0) { recvList.push_back({rank, nRecvs_m[rank]}); + totalRecvs += nRecvs_m[rank]; + maxRecv = std::max(maxRecv, nRecvs_m[rank]); } } } else { @@ -226,6 +233,8 @@ namespace ippl { for (int rank = 0; rank < nRanks_; ++rank) { if (recvCounts_h(rank) > 0) { recvList.push_back({rank, recvCounts_h(rank)}); + totalRecvs += static_cast(recvCounts_h(rank)); + maxRecv = std::max(maxRecv, static_cast(recvCounts_h(rank))); } } } @@ -254,6 +263,7 @@ namespace ippl { pc.template internalDestroy( KOKKOS_LAMBDA(size_t i) { return leaving(i); }, nInvalid); Kokkos::fence(); + const size_type localAfterDestroy = pc.getLocalNum(); IpplTimings::stopTimer(destroyTimer); @@ -275,6 +285,51 @@ namespace ippl { IpplTimings::stopTimer(freeBufferTimer); // 5. Deserialize + if (Info && Info->getOutputLevel() >= 5) { + const size_type recvSources = static_cast(recvList.size()); + size_type minRecvs = totalRecvs; + size_type maxRecvs = totalRecvs; + size_type sumRecvs = totalRecvs; + size_type minSources = recvSources; + size_type maxSources = recvSources; + size_type sumSources = recvSources; + size_type minMaxRecv = maxRecv; + size_type maxMaxRecv = maxRecv; + size_type minInvalid = nInvalid; + size_type maxInvalid = nInvalid; + size_type sumInvalid = nInvalid; + + Comm->allreduce(minRecvs, 1, std::less()); + Comm->allreduce(maxRecvs, 1, std::greater()); + Comm->allreduce(sumRecvs, 1, std::plus()); + Comm->allreduce(minSources, 1, std::less()); + Comm->allreduce(maxSources, 1, std::greater()); + Comm->allreduce(sumSources, 1, std::plus()); + Comm->allreduce(minMaxRecv, 1, std::less()); + Comm->allreduce(maxMaxRecv, 1, std::greater()); + Comm->allreduce(minInvalid, 1, std::less()); + Comm->allreduce(maxInvalid, 1, std::greater()); + Comm->allreduce(sumInvalid, 1, std::plus()); + + Inform localDiag("ParticleUpdateDiag", INFORM_ALL_NODES); + localDiag << level5 << "rank=" << Comm->rank() << " localBefore=" << localBefore + << " nInvalid=" << nInvalid << " localAfterDestroy=" << localAfterDestroy + << " recvSources=" << recvSources << " totalRecvs=" << totalRecvs + << " maxRecv=" << maxRecv + << " localAfterRecv=" << localAfterDestroy + totalRecvs << endl; + + Inform summaryDiag("ParticleUpdateDiag"); + const double ranks = static_cast(nRanks_); + summaryDiag << level5 << "summary" + << " recvs[min,max,avg]=" << minRecvs << "," << maxRecvs << "," + << static_cast(sumRecvs) / ranks + << " recvSources[min,max,avg]=" << minSources << "," << maxSources << "," + << static_cast(sumSources) / ranks + << " maxRecv[min,max]=" << minMaxRecv << "," << maxMaxRecv + << " invalid[min,max,avg]=" << minInvalid << "," << maxInvalid << "," + << static_cast(sumInvalid) / ranks << endl; + } + static IpplTimings::TimerRef deserializeTimer = IpplTimings::getTimer("particleDeserialize"); IpplTimings::startTimer(deserializeTimer); From ed6b926f7140825d8916cccbeb126de0ac97b83e Mon Sep 17 00:00:00 2001 From: Andreas Adelmann Date: Tue, 2 Jun 2026 11:24:28 +0200 Subject: [PATCH 06/14] Pre-reserve particle receive storage --- src/Particle/ParticleAttrib.h | 4 +++- src/Particle/ParticleAttrib.hpp | 7 +++++++ src/Particle/ParticleAttribBase.h | 4 ++++ src/Particle/ParticleSpatialLayout.hpp | 19 +++++++++++++++++++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h index 7a720475f..bbdce337a 100644 --- a/src/Particle/ParticleAttrib.h +++ b/src/Particle/ParticleAttrib.h @@ -62,6 +62,8 @@ namespace ippl { // discarded. void alloc(size_type) override; + void reserve(size_type) override; + /*! * Particle deletion function. Partition the particles into a valid region * and an invalid region. @@ -91,7 +93,7 @@ namespace ippl { void deserialize(detail::Archive& ar, size_type offset, size_type nrecvs) override { - this->resize(offset + nrecvs); + this->reserve(offset + nrecvs); ar.deserialize(dview_m, offset, nrecvs); } diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 955f83db5..95a4a2321 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -49,6 +49,13 @@ namespace ippl { this->realloc(n * overalloc); } + template + void ParticleAttrib::reserve(size_type n) { + if (this->size() < n) { + this->resize(n); + } + } + template void ParticleAttrib::destroy(const hash_type& deleteIndex, const hash_type& keepIndex, diff --git a/src/Particle/ParticleAttribBase.h b/src/Particle/ParticleAttribBase.h index 4526f5dbe..3e9baa197 100644 --- a/src/Particle/ParticleAttribBase.h +++ b/src/Particle/ParticleAttribBase.h @@ -66,6 +66,10 @@ namespace ippl { // prior entries survive a capacity grow. virtual void create(size_type, bool non_destructive = false) = 0; + // Grow internal capacity to at least N particles while preserving existing + // entries. Does not shrink and does not touch the logical particle count. + virtual void reserve(size_type) = 0; + virtual void destroy(const hash_type&, const hash_type&, size_type) = 0; virtual size_type packedSize(const size_type) const = 0; diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp index d9441af25..6732b781a 100644 --- a/src/Particle/ParticleSpatialLayout.hpp +++ b/src/Particle/ParticleSpatialLayout.hpp @@ -19,6 +19,7 @@ // frequency of load balancing (N), or may supply a function to // determine if load balancing should be done or not. // +#include #include #include #include @@ -334,10 +335,28 @@ namespace ippl { IpplTimings::getTimer("particleDeserialize"); IpplTimings::startTimer(deserializeTimer); + static IpplTimings::TimerRef deserializeResizeTimer = + IpplTimings::getTimer("particleDeserializeResize"); + IpplTimings::startTimer(deserializeResizeTimer); + + if (totalRecvs > 0) { + const size_type receiveCapacity = localAfterDestroy + totalRecvs; + pc.forAllAttributes([&](Attribute*& attribute) { + attribute->reserve(receiveCapacity); + }); + } + + IpplTimings::stopTimer(deserializeResizeTimer); + + static IpplTimings::TimerRef deserializeCopyTimer = + IpplTimings::getTimer("particleDeserializeCopy"); + IpplTimings::startTimer(deserializeCopyTimer); + for (auto& finalize : finalizers) { finalize(pc.getLocalNum()); } + IpplTimings::stopTimer(deserializeCopyTimer); IpplTimings::stopTimer(deserializeTimer); IpplTimings::stopTimer(ParticleUpdateTimer); From 484c5e2473dc412c071b1b633708a8a5c0601eac Mon Sep 17 00:00:00 2001 From: Andreas Adelmann Date: Tue, 2 Jun 2026 12:03:47 +0200 Subject: [PATCH 07/14] Clean particle deserialize diagnostics --- src/Particle/ParticleSpatialLayout.hpp | 53 +------------------------- 1 file changed, 2 insertions(+), 51 deletions(-) diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp index 6732b781a..e0590974c 100644 --- a/src/Particle/ParticleSpatialLayout.hpp +++ b/src/Particle/ParticleSpatialLayout.hpp @@ -131,7 +131,6 @@ namespace ippl { } ensureNeighborsCached(); - const size_type localBefore = pc.getLocalNum(); /* particle MPI exchange: * 1. figure out which particles need to go where -> locateParticles(...) @@ -218,14 +217,12 @@ namespace ippl { std::vector> recvList; size_type totalRecvs = 0; - size_type maxRecv = 0; if (countExchangeMode_ == CountExchange::RMA) { for (int rank = 0; rank < nRanks_; ++rank) { if (nRecvs_m[rank] > 0) { recvList.push_back({rank, nRecvs_m[rank]}); totalRecvs += nRecvs_m[rank]; - maxRecv = std::max(maxRecv, nRecvs_m[rank]); } } } else { @@ -235,7 +232,6 @@ namespace ippl { if (recvCounts_h(rank) > 0) { recvList.push_back({rank, recvCounts_h(rank)}); totalRecvs += static_cast(recvCounts_h(rank)); - maxRecv = std::max(maxRecv, static_cast(recvCounts_h(rank))); } } } @@ -286,57 +282,12 @@ namespace ippl { IpplTimings::stopTimer(freeBufferTimer); // 5. Deserialize - if (Info && Info->getOutputLevel() >= 5) { - const size_type recvSources = static_cast(recvList.size()); - size_type minRecvs = totalRecvs; - size_type maxRecvs = totalRecvs; - size_type sumRecvs = totalRecvs; - size_type minSources = recvSources; - size_type maxSources = recvSources; - size_type sumSources = recvSources; - size_type minMaxRecv = maxRecv; - size_type maxMaxRecv = maxRecv; - size_type minInvalid = nInvalid; - size_type maxInvalid = nInvalid; - size_type sumInvalid = nInvalid; - - Comm->allreduce(minRecvs, 1, std::less()); - Comm->allreduce(maxRecvs, 1, std::greater()); - Comm->allreduce(sumRecvs, 1, std::plus()); - Comm->allreduce(minSources, 1, std::less()); - Comm->allreduce(maxSources, 1, std::greater()); - Comm->allreduce(sumSources, 1, std::plus()); - Comm->allreduce(minMaxRecv, 1, std::less()); - Comm->allreduce(maxMaxRecv, 1, std::greater()); - Comm->allreduce(minInvalid, 1, std::less()); - Comm->allreduce(maxInvalid, 1, std::greater()); - Comm->allreduce(sumInvalid, 1, std::plus()); - - Inform localDiag("ParticleUpdateDiag", INFORM_ALL_NODES); - localDiag << level5 << "rank=" << Comm->rank() << " localBefore=" << localBefore - << " nInvalid=" << nInvalid << " localAfterDestroy=" << localAfterDestroy - << " recvSources=" << recvSources << " totalRecvs=" << totalRecvs - << " maxRecv=" << maxRecv - << " localAfterRecv=" << localAfterDestroy + totalRecvs << endl; - - Inform summaryDiag("ParticleUpdateDiag"); - const double ranks = static_cast(nRanks_); - summaryDiag << level5 << "summary" - << " recvs[min,max,avg]=" << minRecvs << "," << maxRecvs << "," - << static_cast(sumRecvs) / ranks - << " recvSources[min,max,avg]=" << minSources << "," << maxSources << "," - << static_cast(sumSources) / ranks - << " maxRecv[min,max]=" << minMaxRecv << "," << maxMaxRecv - << " invalid[min,max,avg]=" << minInvalid << "," << maxInvalid << "," - << static_cast(sumInvalid) / ranks << endl; - } - static IpplTimings::TimerRef deserializeTimer = IpplTimings::getTimer("particleDeserialize"); IpplTimings::startTimer(deserializeTimer); static IpplTimings::TimerRef deserializeResizeTimer = - IpplTimings::getTimer("particleDeserializeResize"); + IpplTimings::getTimer("particleDeserResize"); IpplTimings::startTimer(deserializeResizeTimer); if (totalRecvs > 0) { @@ -349,7 +300,7 @@ namespace ippl { IpplTimings::stopTimer(deserializeResizeTimer); static IpplTimings::TimerRef deserializeCopyTimer = - IpplTimings::getTimer("particleDeserializeCopy"); + IpplTimings::getTimer("particleDeserCopy"); IpplTimings::startTimer(deserializeCopyTimer); for (auto& finalize : finalizers) { From b800e3f377c503650ae68e93cd1c5558138c9dd3 Mon Sep 17 00:00:00 2001 From: Andreas Adelmann Date: Tue, 2 Jun 2026 12:10:45 +0200 Subject: [PATCH 08/14] Document particle receive pre-reserve fix --- PR501_SPLIT_MAP.md | 49 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/PR501_SPLIT_MAP.md b/PR501_SPLIT_MAP.md index 939ccec16..aa893c3a3 100644 --- a/PR501_SPLIT_MAP.md +++ b/PR501_SPLIT_MAP.md @@ -345,7 +345,7 @@ Mac OpenMP wall-max timings: | 4 | `pr501-pcg` | 0.1550 | 0.1923 | 0.1303 | 0.0391 | 0.0180 | 0.00923 | | 4 | `pr501-com` | 0.0495 | 0.0534 | 0.00872 | 0.00677 | 0.00291 | 0.000131 | -Conclusion so far: +Initial conclusion: - The Mac OpenMP run does not reproduce the LUMI slowdown. The communication branch is faster locally for this small CPU/OpenMP test. - That points away from a generic algorithmic slowdown in the particle update rewrite. @@ -369,3 +369,50 @@ Diagnostic patch applied locally: - If `particleWait` dominates, the issue is likely GPU-aware MPI progress / synchronization or a load imbalance exposed by the nonblocking exchange. - If `particleDeserialize` dominates, inspect `Archive::deserialize(offset)` and the custom byte-copy kernels. - If neither dominates but `updateParticle` remains high, there is still another untimed section in the particle update path. + +### Receive-Side Pre-Reserve Fix + +Root cause: + +- The LUMI runs showed that the slowdown was not in `MPI_Waitall` and not in communicator buffer release. +- The hidden cost was almost entirely in deferred receive finalization, reported by the new `particleDeserialize` timer. +- Each receive finalizer called `ParticleAttrib::deserialize(offset, nrecvs)`. +- That path called `ParticleAttrib::resize(offset + nrecvs)` for every source rank and every attribute. +- `Kokkos::resize` preserves existing entries when growing. On GPU this repeatedly copied already-live particle storage during one update step. +- The cost was amplified when a rank received particles from many source ranks, even when the actual receive-count imbalance was modest. + +Fix: + +- Add a grow-only `ParticleAttrib::reserve(size_type)` operation. +- Before running deferred receive finalizers, compute the final receive capacity once: + `localAfterDestroy + totalRecvs`. +- Reserve every particle attribute once to that final capacity. +- Change receive-side `ParticleAttrib::deserialize(offset, nrecvs)` to call `reserve(offset + nrecvs)` instead of unconditional `resize`. +- Keep `particleDeserialize` split into: + - `particleDeserResize`: one-time attribute reserve before finalizers + - `particleDeserCopy`: archive unpack / copy into particle attributes + +LUMI validation: + +| Ranks | Metric | Before pre-reserve | After pre-reserve | Improvement | +| ---: | --- | ---: | ---: | ---: | +| 32 | `updateParticle` wall max | 8.08981 | 0.338958 | 23.9x | +| 32 | `particleDeserialize` wall max | 7.88320 | 0.0106297 | 741x | +| 128 | `updateParticle` wall max | 1.59376 | 0.214328 | 7.4x | +| 128 | `particleDeserialize` wall max | 1.40305 | 0.0132888 | 106x | + +Post-fix timer split: + +| Ranks | `particleDeserialize` wall max | `particleDeserResize` wall max | `particleDeserCopy` wall max | +| ---: | ---: | ---: | ---: | +| 32 | 0.0106297 | 0.0000129 | 0.0105913 | +| 128 | 0.0132888 | 0.0000196 | 0.0132491 | + +Interpretation: + +- The repeated preserving resize was the dominant regression. +- After pre-reserving, deserialize time is small and almost entirely the actual archive copy. +- `updateParticle` is again balanced across ranks in the 128-rank run: + before `max/avg/min = 1.594 / 0.864 / 0.811`, + after `max/avg/min = 0.214 / 0.211 / 0.207`. +- This fix belongs in PR 2 because it is specific to the communication/particle-update receive path and independent of higher-order scatter/gather, FFT, NUFFT, and PIF. From ecb4edb238d648e3d82cde2a45ef305a3f4cd0a6 Mon Sep 17 00:00:00 2001 From: Andreas Adelmann Date: Tue, 16 Jun 2026 21:52:54 +0200 Subject: [PATCH 09/14] remouve douplicate --- src/LinearSolvers/PCG.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/LinearSolvers/PCG.h b/src/LinearSolvers/PCG.h index 74d33c5c9..d2c826f28 100644 --- a/src/LinearSolvers/PCG.h +++ b/src/LinearSolvers/PCG.h @@ -336,13 +336,6 @@ namespace ippl { FieldRHS>() , preconditioner_m(nullptr) {}; - void initializeFields(mesh_type& mesh, layout_type& layout) override { - CG::initializeFields(mesh, layout); - s.initialize(mesh, layout); - pcond_out.initialize(mesh, layout); - } - /* * Extends the CG workspace with the preconditioner result buffers s and * pcond_out so operator() does not allocate per solve. pcond_out keeps From 72f6e8c3ba5267a29cd56b652e59cf551b76ddb9 Mon Sep 17 00:00:00 2001 From: Andreas Adelmann Date: Tue, 16 Jun 2026 22:17:02 +0200 Subject: [PATCH 10/14] Fix ParticleSendRecv host mirror extent after migration ParticleSendRecv timed out under CTest with two MPI ranks, but the timeout was only a secondary symptom. One rank threw before the final MPI barrier with: Kokkos::deep_copy extents of views don't match: ParticleAttrib::dview_mirror(64) ParticleAttrib::dview(56) and the peer rank then waited until CTest killed the test. The failing path created a host mirror for expectedRank after bunch->update(), then resized that mirror to expectedRank.size() before copying from expectedRank.getView(). On this branch, particle attributes separate capacity from the live particle count: size() reports the backing capacity, while getView() is trimmed to the live particle range. After migration those values can differ across ranks. Remove the explicit resize to expectedRank.size(). getHostMirror() already returns a mirror compatible with the live view, so the deep_copy now uses matching extents after particle migration. Validated with: cmake --build build --target ParticleSendRecv -j 8 ctest --test-dir build -R ParticleSendRecv --output-on-failure -V The CTest entry runs ParticleSendRecv with mpiexec -n 2 and now passes all 12 typed cases. --- unit_tests/Particle/ParticleSendRecv.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/unit_tests/Particle/ParticleSendRecv.cpp b/unit_tests/Particle/ParticleSendRecv.cpp index 28e8ee0ca..1d84cad41 100644 --- a/unit_tests/Particle/ParticleSendRecv.cpp +++ b/unit_tests/Particle/ParticleSendRecv.cpp @@ -149,7 +149,6 @@ TYPED_TEST(ParticleSendRecv, SendAndRecieve) { typename TestFixture::rank_type::view_type::host_mirror_type ER_host = bunch->expectedRank.getHostMirror(); - Kokkos::resize(ER_host, bunch->expectedRank.size()); Kokkos::deep_copy(ER_host, bunch->expectedRank.getView()); for (size_t i = 0; i < bunch->getLocalNum(); ++i) { From 62946ada1e3a685c85dcfa3af6f9cad2ffc81be8 Mon Sep 17 00:00:00 2001 From: Andreas Adelmann Date: Tue, 16 Jun 2026 22:38:36 +0200 Subject: [PATCH 11/14] Fix benchmarkParticleUpdate host mirror resize after migration benchmarkParticleUpdate failed under the two-rank CTest configuration with a Kokkos deep_copy extent mismatch: ParticleAttrib::dview_mirror(5000) ParticleAttrib::dview(4921) This is the same capacity-vs-live-count issue fixed for ParticleSendRecv. After particle migration, ParticleAttrib::size() reports the backing capacity, while getView() exposes only the live particle range. The benchmark resized the host mirrors for P and R to P->P.size() and P->R.size(), then copied from P->P.getView() / P->R.getView(). When the local particle count differed from the reserved capacity, the destination mirror and source view extents no longer matched. Resize the host mirrors to P->getLocalNum() instead. This matches the live extent used by getView() and keeps the benchmark valid after uneven particle migration across ranks. Validated with: cmake --build build --target benchmarkParticleUpdate -j 8 ctest --test-dir build -R benchmarkParticleUpdate --output-on-failure -V ctest --test-dir build -R 'ParticleSendRecv|benchmarkParticleUpdate' --output-on-failure Both ParticleSendRecv and benchmarkParticleUpdate pass with the configured two-rank MPI launch. --- test/particle/benchmarkParticleUpdate.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/particle/benchmarkParticleUpdate.cpp b/test/particle/benchmarkParticleUpdate.cpp index c4d57700d..e1a654d99 100644 --- a/test/particle/benchmarkParticleUpdate.cpp +++ b/test/particle/benchmarkParticleUpdate.cpp @@ -223,9 +223,9 @@ int main(int argc, char* argv[]) { IpplTimings::startTimer(RandPTimer); std::mt19937_64 engP; engP.seed(42 + 10 * it + 100 * ippl::Comm->rank()); - Kokkos::resize(P_host, P->P.size()); + Kokkos::resize(P_host, P->getLocalNum()); double sum_coord = 0.0; - Kokkos::resize(R_host, P->R.size()); + Kokkos::resize(R_host, P->getLocalNum()); Kokkos::deep_copy(R_host, P->R.getView()); for (unsigned long int i = 0; i < P->getLocalNum(); i++) { for (int d = 0; d < 3; d++) { From a13c1a451de48a95a47f927f36b78f33b611144e Mon Sep 17 00:00:00 2001 From: Andreas Adelmann Date: Tue, 16 Jun 2026 23:15:36 +0200 Subject: [PATCH 12/14] add doc --- PR501_SPLIT_MAP.md | 418 ------------------------------------ src/Communicate/Archive.hpp | 8 + 2 files changed, 8 insertions(+), 418 deletions(-) delete mode 100644 PR501_SPLIT_MAP.md diff --git a/PR501_SPLIT_MAP.md b/PR501_SPLIT_MAP.md deleted file mode 100644 index aa893c3a3..000000000 --- a/PR501_SPLIT_MAP.md +++ /dev/null @@ -1,418 +0,0 @@ -# PR 501 Split Map - -Source PR: https://github.com/IPPL-framework/ippl/pull/501 - -Local mapping branch: `pr-501-map` - -Compared against: `origin/master` at `da43fd66a18848b65dcd82af90001f5b64a798ab` - -Overall size: - -- 41 commits -- 149 files changed -- 23,231 insertions -- 2,205 deletions - -## Subsystem Size - -| Subsystem | Files | Additions | Deletions | -| --- | ---: | ---: | ---: | -| Interpolation / Autotune | 33 | 8,138 | 20 | -| FFT / NUFFT | 24 | 6,571 | 814 | -| Particle | 21 | 3,841 | 757 | -| Alpine / PIF examples | 12 | 1,923 | 37 | -| Utility | 11 | 1,424 | 45 | -| Communicate | 14 | 626 | 156 | -| Solvers | 6 | 394 | 253 | -| Field / Halo / Layout | 11 | 116 | 87 | -| Build / CMake | 4 | 117 | 12 | -| Types | 2 | 35 | 4 | -| FEM | 3 | 12 | 13 | -| Other | 8 | 34 | 7 | - -The review load is dominated by three areas: - -- Interpolation / Autotune: new scatter/gather dispatch, binning, tiling, tuning cache. -- FFT / NUFFT: backend/transform refactor plus native NUFFT and pruned transforms. -- Particle: new spatial update path, send/recv serialization hooks, sorting/buffer reuse. - -## Commit Shape - -The branch starts with well-labeled subsystem commits: - -| Commit | Area | Notes | -| --- | --- | --- | -| `7cff674a` | Build / dependencies | Adds `IPPL_ENABLE_FINUFFT`, `IPPL_ENABLE_CUFFTMP`, autotune preset plumbing. | -| `13950c4f` | FFT / NUFFT | Large FFT backend/transform split plus native NUFFT engine. | -| `dbbe59a4` | Interpolation | Scatter/gather redesign, binning, tiling, autotune runtime cache. | -| `2878b90b` | Particle / halo | Rewrites `ParticleSpatialLayout`, adds sort buffers, threads `nghost` through field layout/halo. | -| `9507a796` | Utility | Timing overhaul, `BufferView`, `Tuning`, `ParallelDispatch`, type utilities. | -| `f2820064` | Communicate | Serialization hooks, shared buffer handler. | -| `63d6ccf2` | Integration glue | Field/solver/type updates for new FFT and particle layout APIs. | -| `99d21ba7` | Alpine PIF | Adds ElectrostaticPIF examples. | -| `f0560f76` | Unit tests | Adds NUFFT, interpolation, binning, particle update tests. | -| `927cac7b` | Integration tests | Updates particle benchmark and adds scaling benchmark. | - -After that, the branch has many fixup/regression commits. Several are descriptive and likely fold into the earlier subsystem commits; some are separable: - -| Commit | Area | Split relevance | -| --- | --- | --- | -| `40438d17` | PCG / preconditioner | Good candidate for an independent PR: hoists per-iteration allocations out of solver loop. | -| `be5f794c` | PCG / PoissonCG | Follow-up to the PCG allocation PR: pass `Field` by reference through `OperatorF`. | -| `95762403` | PIF examples | Consolidates pruned variants into parameterized source. Belongs with PIF examples. | -| `0e7bc58d`, `6c1c48ad` | FFT / NUFFT | FINUFFT guards for CUDA + 2D unit-test build. Belong with NUFFT/FINUFFT PR. | -| `cec1ca61` | Cleanup | Doxygen, formatting, size-type consistency. Should be folded into relevant split PRs or kept as a final cleanup PR. | - -## Dependency Observations - -### NUFFT Depends On Interpolation - -`src/FFT/NUFFT/NativeNUFFT.h` includes and uses: - -- `Interpolation/Scatter/ScatterConfig.h` -- `Interpolation/Gather/GatherConfig.h` -- scatter/gather kernels through the new interpolation dispatch - -That means native NUFFT cannot be cleanly reviewed before the higher-order scatter/gather layer. - -### PIF Depends On NUFFT - -The Alpine PIF examples use: - -- `ippl::FFT` -- `scatterPIFNUFFT` -- `gatherPIFNUFFT` - -So PIF examples should come after the NUFFT transform API. - -### Particle Update And Communicate Are Coupled - -The particle update rewrite uses: - -- `ParticleAttrib::serialize` / `deserialize` -- shared `BufferHandler` -- direct archive serialization -- reusable send/recv buffers - -This suggests communication and particle update may be one PR unless a clean lower-level communication PR is extracted first. - -### `nghost` / Halo Changes Cross Particle And Field - -`ParticleSpatialLayout` changes also thread `nghost` through: - -- `FieldLayout` -- `HaloCells` -- field layout APIs - -This is small in line count but high in integration risk. It should be explicitly called out in whichever PR contains particle update. - -### PCG Allocation Fix Looks Separately Reviewable - -The late commits: - -- `40438d17` -- `be5f794c` - -only touch: - -- `src/LinearSolvers/PCG.h` -- `src/LinearSolvers/Preconditioner.h` -- `src/PoissonSolvers/PoissonCG.h` - -This can likely become an independent performance/regression PR, separate from PIF. - -## Candidate Split - -### PR 1: PCG Allocation / Solver Performance - -Scope: - -- `src/LinearSolvers/PCG.h` -- `src/LinearSolvers/Preconditioner.h` -- `src/PoissonSolvers/PoissonCG.h` - -Candidate commits: - -- `40438d17` -- `be5f794c` - -Reason: - -- Smallest coherent split. -- Directly addresses the reported PCG slowdown / repeated `cudaMalloc` concern. -- Does not conceptually depend on PIF, NUFFT, or interpolation. - -Risk: - -- Need to verify solver convergence and exact iteration behavior. - -### PR 2: Communication And Particle Update Infrastructure - -Scope: - -- `src/Communicate/*` -- `src/Particle/ParticleSpatialLayout*` -- `src/Particle/ParticleAttrib*` -- `src/Particle/ParticleBase*` -- `src/Particle/ParticleSort.h` -- `src/Particle/SortBuffer.h` -- `src/FieldLayout/FieldLayout*` -- `src/Field/HaloCells*` -- particle update tests - -Candidate commits: - -- `2878b90b` -- `f2820064` -- relevant parts of `63d6ccf2` -- relevant parts of `f0560f76` -- relevant fixup commits after `728af370` - -Reason: - -- Captures the PIC scaling improvement that is not PIF-specific. -- Reviewable independently from FFT/NUFFT algorithms. - -Risk: - -- This is still a substantial behavioral change. -- Needs multi-rank CPU/GPU particle update and `ParticleSendRecv` coverage. - -### PR 3: Higher-Order Scatter/Gather And Autotune - -Scope: - -- `src/Interpolation/*` -- `cmake/AutoTunePresets.cmake` -- `cmake/IpplAutoTunePresets.h.in` -- `cmake/auto_tune/*` -- interpolation tests - -Candidate commits: - -- `7cff674a` only for autotune preset plumbing, not FINUFFT/CUFFTMP if separable. -- `dbbe59a4` -- interpolation portions of `f0560f76` -- `dacdcd9a` if routing legacy CIC through the new framework is included. - -Reason: - -- This is the algorithmic layer native NUFFT requires. -- Can be validated without PIF examples. - -Risk: - -- `dacdcd9a` changes existing CIC scatter/gather behavior by routing legacy paths through the new framework. That may be better as a second PR after the new interpolation layer lands. - -### PR 4: FFT Backend / Transform Refactor - -Scope: - -- `src/FFT/Backend/*` -- `src/FFT/Transform/CC.h` -- `src/FFT/Transform/RC.h` -- `src/FFT/Transform/Trig.h` -- `src/FFT/Transform/PrunedCC.h` -- `src/FFT/Transform/PrunedRC.h` -- `src/FFT/Transform/Common.h` -- `src/FFT/Traits.h` -- existing FFT test updates - -Candidate commits: - -- FFT refactor parts of `13950c4f` -- FFT portions of `f0560f76` -- FFT portions of later build fixes - -Reason: - -- Gives reviewers the backend/transform API without also reviewing NUFFT and PIF. - -Risk: - -- Current commit `13950c4f` mixes backend refactor with native NUFFT files. This split likely requires path-based extraction or commit surgery. - -### PR 5: Native NUFFT And FINUFFT/cuFINUFFT Integration - -Scope: - -- `src/FFT/NUFFT/*` -- `src/FFT/Transform/NUFFT.*` -- FINUFFT/CUFFTMP build switches if not already merged -- NUFFT tests - -Candidate commits: - -- NUFFT portions of `13950c4f` -- FINUFFT/CUFFTMP portions of `7cff674a` -- NUFFT portions of `f0560f76` -- `0e7bc58d` -- `6c1c48ad` - -Reason: - -- NUFFT depends on PR 3 scatter/gather and PR 4 transform structure. - -Risk: - -- Native NUFFT depends on new scatter/gather. -- FINUFFT path has Dim-guard complexity and external dependency complexity. - -### PR 6: PIF / Alpine Examples - -Scope: - -- `alpine/ElectrostaticPIF/*` -- Alpine manager wiring -- PIF-specific particle attrib convenience APIs if not included in NUFFT PR - -Candidate commits: - -- `99d21ba7` -- `95762403` -- PIF/example fixups - -Reason: - -- Examples should be reviewed after core APIs are accepted. - -Risk: - -- PIF examples currently depend on the full stack: particle update, scatter/gather, FFT transform refactor, NUFFT. - -# LUMI Results - -| Benchmark | Problem size | Nodes | Ranks | master | pr501-pcg | pr501-com | pr501-fft | pr501-hosg | pr501-nufft | -| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | -| FEM | `513_10` | 8 | 64 | 28.23 | 27.77 (-2%) | 27.71 (-2%) | 27.88 (-1%) | 28.23 (0%) | 27.95 (-1%) | -| FFT | `512_10` | 4 | 32 | 4.39 | 4.41 (0%) | 11.54 (+163%) | 11.53 (+162%) | 11.59 (+164%) | 11.53 (+162%) | -| FFT | `512_10` | 16 | 128 | 1.65 | 1.65 (0%) | 2.88 (+74%) | 2.88 (+75%) | 2.93 (+78%) | 2.93 (+78%) | -| PCG | `512_10` | 1 | 8 | 72.31 | 70.01 (-3%) | | 76.99 (+6%) | | 76.75 (+6%) | -| PCG | `512_10` | 4 | 32 | 34.60 | 32.93 (-5%) | 43.71 (+26%) | 33.62 (-3%) | 36.32 (+5%) | 33.55 (-3%) | -| PCG | `512_10` | 64 | 512 | 25.00 | 23.85 (-5%) | 22.89 (-8%) | 22.40 (-10%) | 22.74 (-9%) | 22.58 (-10%) | - - -## Update Timer Investigation - -Observed issue: - -- On LUMI, the `update` / `updateParticle` timer can increase strongly when moving from `pr501-pcg` to `pr501-communication-particle-update` / `pr501-com`. -- A first code inspection did not find an FFT solver change in this branch step: `pr501-pcg..pr501-communication-particle-update` has no changes under `src/FFT` or `src/PoissonSolvers`. -- The branch step does change particle migration, communication archives, field layout / halo `nghost` plumbing, and timing infrastructure. - -Most likely immediate explanation: - -- In the communication branch, `ParticleSpatialLayout::update()` posts sends and receives, then performs `MPI_Waitall` plus deferred receive finalization/deserialization inside the outer `updateParticle` timer, but outside the child timers `particleSend`, `particleRecv`, and `sendPreprocess`. -- This can make `updateParticle` look much larger while the child timers look artificially small. -- On the LUMI timing data this pattern is visible: for `PCG_strong_scaling/512_10/nodes_64`, `locateParticles`, `particleSend`, and `particleRecv` are small in `pr501-com`, while the unexplained part of `updateParticle` is large. - -Secondary suspect: - -- `Archive` contiguous scalar serialization/deserialization changed from `Kokkos::deep_copy` on unmanaged byte views to custom byte-copy kernels plus fences. -- On GPU, especially with GPU-aware MPI, that may be slower or introduce extra synchronization compared with the previous optimized device copy path. -- This would affect particle migration directly, because `ParticleBase::sendToRank()` serializes every active particle attribute before `MPI_Isend`, and receive finalizers deserialize after the wait. - -Local Mac OpenMP check: - -- Built clean comparison worktrees for: - - `pr501-pcg` - - `pr501-communication-particle-update` -- Configuration: - - `IPPL_PLATFORMS=OPENMP` - - `IPPL_ENABLE_ALPINE=ON` - - `IPPL_ENABLE_UNIT_TESTS=ON` - - `IPPL_ENABLE_FFT=ON` - - Kokkos `5.0.0` - - Homebrew LLVM `clang++` and `libomp` -- AppleClang did not find OpenMP automatically; explicit `libomp` include/library paths were required. - -Mac OpenMP miniapp runs: - -```bash -mpiexec -x OMP_NUM_THREADS=4 -x OMP_PROC_BIND=false -n 2 \ - ./LandauDamping 32 32 32 20000 5 FFT 0.01 LeapFrog --overallocate 2.0 --info 5 - -mpiexec -x OMP_NUM_THREADS=2 -x OMP_PROC_BIND=false -n 4 \ - ./LandauDamping 32 32 32 20000 5 FFT 0.01 LeapFrog --overallocate 2.0 --info 5 -``` - -Mac OpenMP wall-max timings: - -| Ranks | Branch | `update` | `updateParticle` | `locateParticles` | `sendPreprocess` | `particleSend` | `particleRecv` | -| ---: | --- | ---: | ---: | ---: | ---: | ---: | ---: | -| 2 | `pr501-pcg` | 0.0911 | 0.1090 | 0.0906 | 0.0187 | 0.00431 | 0.00451 | -| 2 | `pr501-com` | 0.0250 | 0.0266 | 0.00577 | 0.000863 | 0.00150 | 0.000064 | -| 4 | `pr501-pcg` | 0.1550 | 0.1923 | 0.1303 | 0.0391 | 0.0180 | 0.00923 | -| 4 | `pr501-com` | 0.0495 | 0.0534 | 0.00872 | 0.00677 | 0.00291 | 0.000131 | - -Initial conclusion: - -- The Mac OpenMP run does not reproduce the LUMI slowdown. The communication branch is faster locally for this small CPU/OpenMP test. -- That points away from a generic algorithmic slowdown in the particle update rewrite. -- The remaining likely causes are GPU/MPI-specific behavior on LUMI: GPU-aware MPI wait behavior, archive device-buffer serialization/deserialization, or timer attribution around deferred receive finalization. - -Recommended next diagnostic patch: - -- Add or move timers around: - - `MPI_Waitall` as `particleWait`, or charge it back into `particleSend`. - - deferred receive finalizers as `particleDeserialize`, or charge them into `particleRecv`. - - optionally `Archive::serialize(hash)` and `Archive::deserialize(offset)` if the wait/deserialization split is still unclear. -- Re-run the LUMI cases before changing algorithms, so the slowdown is attributed to wait time, packing/unpacking, or actual communication. - -Diagnostic patch applied locally: - -- `ParticleSpatialLayout::update()` now splits the previously hidden tail of `updateParticle` into: - - `particleWait`: combined send/receive `MPI_Waitall` - - `particleFreeBuffers`: communicator buffer release - - `particleDeserialize`: deferred receive finalizers / unpacking into particle attributes -- Expected interpretation on LUMI: - - If `particleWait` dominates, the issue is likely GPU-aware MPI progress / synchronization or a load imbalance exposed by the nonblocking exchange. - - If `particleDeserialize` dominates, inspect `Archive::deserialize(offset)` and the custom byte-copy kernels. - - If neither dominates but `updateParticle` remains high, there is still another untimed section in the particle update path. - -### Receive-Side Pre-Reserve Fix - -Root cause: - -- The LUMI runs showed that the slowdown was not in `MPI_Waitall` and not in communicator buffer release. -- The hidden cost was almost entirely in deferred receive finalization, reported by the new `particleDeserialize` timer. -- Each receive finalizer called `ParticleAttrib::deserialize(offset, nrecvs)`. -- That path called `ParticleAttrib::resize(offset + nrecvs)` for every source rank and every attribute. -- `Kokkos::resize` preserves existing entries when growing. On GPU this repeatedly copied already-live particle storage during one update step. -- The cost was amplified when a rank received particles from many source ranks, even when the actual receive-count imbalance was modest. - -Fix: - -- Add a grow-only `ParticleAttrib::reserve(size_type)` operation. -- Before running deferred receive finalizers, compute the final receive capacity once: - `localAfterDestroy + totalRecvs`. -- Reserve every particle attribute once to that final capacity. -- Change receive-side `ParticleAttrib::deserialize(offset, nrecvs)` to call `reserve(offset + nrecvs)` instead of unconditional `resize`. -- Keep `particleDeserialize` split into: - - `particleDeserResize`: one-time attribute reserve before finalizers - - `particleDeserCopy`: archive unpack / copy into particle attributes - -LUMI validation: - -| Ranks | Metric | Before pre-reserve | After pre-reserve | Improvement | -| ---: | --- | ---: | ---: | ---: | -| 32 | `updateParticle` wall max | 8.08981 | 0.338958 | 23.9x | -| 32 | `particleDeserialize` wall max | 7.88320 | 0.0106297 | 741x | -| 128 | `updateParticle` wall max | 1.59376 | 0.214328 | 7.4x | -| 128 | `particleDeserialize` wall max | 1.40305 | 0.0132888 | 106x | - -Post-fix timer split: - -| Ranks | `particleDeserialize` wall max | `particleDeserResize` wall max | `particleDeserCopy` wall max | -| ---: | ---: | ---: | ---: | -| 32 | 0.0106297 | 0.0000129 | 0.0105913 | -| 128 | 0.0132888 | 0.0000196 | 0.0132491 | - -Interpretation: - -- The repeated preserving resize was the dominant regression. -- After pre-reserving, deserialize time is small and almost entirely the actual archive copy. -- `updateParticle` is again balanced across ranks in the 128-rank run: - before `max/avg/min = 1.594 / 0.864 / 0.811`, - after `max/avg/min = 0.214 / 0.211 / 0.207`. -- This fix belongs in PR 2 because it is specific to the communication/particle-update receive path and independent of higher-order scatter/gather, FFT, NUFFT, and PIF. diff --git a/src/Communicate/Archive.hpp b/src/Communicate/Archive.hpp index bfba0815c..cff0482f0 100644 --- a/src/Communicate/Archive.hpp +++ b/src/Communicate/Archive.hpp @@ -18,6 +18,14 @@ namespace ippl { } } + /* + * Packs a sparse selection of scalar particle-attribute entries into + * the archive buffer. `hash(i)` gives the source index in `view_data` + * for the i-th item being sent; the functor copies that value as raw + * bytes to `buf + wpos + i * sizeof(T)`. This is used by + * Archive::serialize(view, hash, nsends) during particle migration, + * where only particles listed in the send hash are serialized. + */ template struct SerializeHashFunctor { const T* view_data; From 23a56431271e130375cf887351fe1ce95ccbbdf0 Mon Sep 17 00:00:00 2001 From: Andreas Adelmann Date: Wed, 17 Jun 2026 10:45:12 +0200 Subject: [PATCH 13/14] remove circular dependece --- src/Particle/ParticleSpatialLayout.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp index e0590974c..cbb4c760b 100644 --- a/src/Particle/ParticleSpatialLayout.hpp +++ b/src/Particle/ParticleSpatialLayout.hpp @@ -24,8 +24,6 @@ #include #include -#include "Ippl.h" - #include "Utility/IpplTimings.h" #include "Utility/ParallelDispatch.h" From 7872582fc19373722e3d6453452ff36adaf29693 Mon Sep 17 00:00:00 2001 From: Andreas Adelmann Date: Wed, 17 Jun 2026 11:20:48 +0200 Subject: [PATCH 14/14] improve code doc --- src/LinearSolvers/PCG.h | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/LinearSolvers/PCG.h b/src/LinearSolvers/PCG.h index d2c826f28..049fc6810 100644 --- a/src/LinearSolvers/PCG.h +++ b/src/LinearSolvers/PCG.h @@ -337,13 +337,16 @@ namespace ippl { , preconditioner_m(nullptr) {}; /* - * Extends the CG workspace with the preconditioner result buffers s and - * pcond_out so operator() does not allocate per solve. pcond_out keeps - * its default NoBcFace BCs: the preconditioner's internal operator - * chain therefore never triggers PeriodicFace::apply MPI calls -- if it - * did, the global MPI sequence would diverge from the master code path - * (where the preconditioner returned a fresh NoBcFace field) and - * intermittent multi-rank halo deadlocks would follow. + * Allocate the extra PCG work fields once, together with the CG + * work fields r, d, and q. + * + * s and pcond_out are scratch buffers for M^{-1} r. They intentionally + * keep the default NoBcFace boundary conditions. Only the search + * direction d gets the physical BCs after the preconditioner result has + * been copied into it. If these scratch fields inherited periodic BCs, + * assignments inside the preconditioner would call PeriodicFace::apply + * and add halo MPI exchanges that did not exist when the preconditioner + * returned fresh temporary NoBcFace fields. */ void initializeFields(mesh_type& mesh, layout_type& layout) override { CG> preconditioner_m; /* - * Extends the CG workspace with the preconditioner result buffers s and - * pcond_out so operator() does not allocate per solve. - * Preconditioner result buffers, allocated once via initializeFields() - * pcond_out keeps its default NoBcFace BCs: the preconditioner's internal operator - * chain therefore never triggers PeriodicFace::apply MPI calls -- if it - * did, the global MPI sequence would diverge from the master code path - * (where the preconditioner returned a fresh NoBcFace field) and - * intermittent multi-rank halo deadlocks would follow. + * Persistent preconditioner output buffers. They are allocated in + * initializeFields() and reused on every solve to avoid per-iteration + * Field allocation. + * + * These buffers are scratch storage, not physical solution fields, so + * they must keep NoBcFace BCs. The physically meaningful BCs are applied + * only to d after copying pcond_out into it. */ lhs_type s;