diff --git a/src/Communicate/Archive.h b/src/Communicate/Archive.h index 6ec74f946..2e8561e36 100644 --- a/src/Communicate/Archive.h +++ b/src/Communicate/Archive.h @@ -9,6 +9,9 @@ // that they have type char and thus contain raw bytes, unlike other typed buffers // such as detail::FieldBufferData used by HaloCells. // +// On CUDA/HIP the internal buffer is allocated directly via cudaMalloc/hipMalloc +// so that the pointer is page-aligned (4K) and compatible with MPI IPC. +// #ifndef IPPL_ARCHIVE_H #define IPPL_ARCHIVE_H @@ -32,6 +35,7 @@ namespace ippl { using pointer_type = typename buffer_type::pointer_type; Archive(size_type size = 0); + ~Archive(); /*! * Serialize. @@ -40,6 +44,10 @@ namespace ippl { template void serialize(const Kokkos::View& view, size_type nsends); + template + void serialize(const Kokkos::View& view, const HashView& hash, + size_type nsends); + /*! * Serialize vector attributes * @@ -52,6 +60,10 @@ namespace ippl { void serialize(const Kokkos::View*, ViewArgs...>& view, size_type nsends); + template + void serialize(const Kokkos::View*, ViewArgs...>& view, + const HashView& hash, size_type nsends); + /*! * Deserialize. * @param view to put data to @@ -59,6 +71,14 @@ namespace ippl { template void deserialize(Kokkos::View& view, size_type nrecvs); + template + void deserialize(Kokkos::View& view, size_type offset, + size_type nrecvs); + + template + void deserialize(Kokkos::View*, ViewArgs...>& view, size_type offset, + size_type nrecvs); + /*! * Deserialize vector attributes * @@ -71,33 +91,50 @@ namespace ippl { void deserialize(Kokkos::View*, ViewArgs...>& view, size_type nrecvs); /*! - * @returns a pointer to the data of the buffer + * @returns a pointer to the data of the buffer. + * On GPU this is a page-aligned device pointer from cudaMalloc/hipMalloc. */ - pointer_type getBuffer() { return buffer_m.data(); } + pointer_type getBuffer() { return bufferData(); } /*! - * @returns the size of the buffer + * @returns the number of bytes written so far */ size_type getSize() const { return writepos_m; } - size_type getBufferSize() const { return buffer_m.size(); } - - void resizeBuffer(size_type size) { Kokkos::resize(buffer_m, size); } + /*! + * @returns the total capacity of the buffer in bytes + */ + size_type getBufferSize() const { return bufferSize(); } - void reallocBuffer(size_type size) { Kokkos::realloc(buffer_m, size); } + void resizeBuffer(size_type size); + void reallocBuffer(size_type size); void resetWritePos() { writepos_m = 0; } void resetReadPos() { readpos_m = 0; } - ~Archive() = default; - private: //! write position for serialization size_type writepos_m; //! read position for deserialization size_type readpos_m; - //! serialized data + +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + //! Raw device pointer from cudaMalloc/hipMalloc (page-aligned, IPC-safe) + pointer_type buffer_ptr_m = nullptr; + size_type buffer_size_m = 0; + + pointer_type bufferData() const { return buffer_ptr_m; } + size_type bufferSize() const { return buffer_size_m; } + + void gpuAlloc(size_type size); + void gpuFree(); +#else + //! serialized data (standard Kokkos view on CPU) buffer_type buffer_m; + + pointer_type bufferData() const { return buffer_m.data(); } + size_type bufferSize() const { return buffer_m.size(); } +#endif }; } // namespace detail } // namespace ippl diff --git a/src/Communicate/Archive.hpp b/src/Communicate/Archive.hpp index a55c33c7d..cff0482f0 100644 --- a/src/Communicate/Archive.hpp +++ b/src/Communicate/Archive.hpp @@ -4,6 +4,12 @@ // #include "Archive.h" +#if defined(KOKKOS_ENABLE_CUDA) +#include +#elif defined(KOKKOS_ENABLE_HIP) +#include +#endif + namespace ippl { namespace detail { KOKKOS_INLINE_FUNCTION void copyBytes(char* dst, const char* src, size_t size) { @@ -12,117 +18,372 @@ namespace ippl { } } + /* + * Packs a sparse selection of scalar particle-attribute entries into + * the archive buffer. `hash(i)` gives the source index in `view_data` + * for the i-th item being sent; the functor copies that value as raw + * bytes to `buf + wpos + i * sizeof(T)`. This is used by + * Archive::serialize(view, hash, nsends) during particle migration, + * where only particles listed in the send hash are serialized. + */ + template + struct SerializeHashFunctor { + const T* view_data; + HashView hash; + BufferPtr buf; + size_t elem_size; + size_t wpos; + + KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const { + const char* src = reinterpret_cast(view_data + hash(i)); + char* dst = buf + i * elem_size + wpos; + copyBytes(dst, src, elem_size); + } + }; + + template + struct SerializeHashVectorFunctor { + const Vector* view_data; + HashView hash; + BufferPtr buf; + size_t elem_size; + size_t wpos; + + KOKKOS_INLINE_FUNCTION void operator()(const size_t i, const size_t d) const { + const Vector* vec = view_data + hash(i); + const T value = (*vec)[d]; + const char* src = reinterpret_cast(&value); + char* dst = buf + (Dim * i + d) * elem_size + wpos; + copyBytes(dst, src, elem_size); + } + }; + + // ================================================================= + // GPU buffer management + // ================================================================= + +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + + template + void Archive::gpuAlloc(size_type size) { + if (size == 0) return; +#if defined(KOKKOS_ENABLE_HIP) + // HSA IPC (used by Cray MPICH for large-message GPU transfers) + // requires allocation sizes to be multiples of the GPU page + // granularity (64 KB on MI250X / MI300X). Without this, + // hsa_amd_ipc_memory_attach fails with INVALID_ARGUMENT. + static constexpr size_type kGranularity = 65536; + size = ((size + kGranularity - 1) / kGranularity) * kGranularity; +#endif + void* ptr = nullptr; +#if defined(KOKKOS_ENABLE_CUDA) + (void)cudaMalloc(&ptr, size); +#else + (void)hipMalloc(&ptr, size); +#endif + buffer_ptr_m = static_cast(ptr); + buffer_size_m = size; + } + + template + void Archive::gpuFree() { + if (buffer_ptr_m) { +#if defined(KOKKOS_ENABLE_CUDA) + (void)cudaFree(buffer_ptr_m); +#else + (void)hipFree(buffer_ptr_m); +#endif + buffer_ptr_m = nullptr; + buffer_size_m = 0; + } + } + + template + Archive::Archive(size_type size) + : writepos_m(0) + , readpos_m(0) { + gpuAlloc(size); + } + + template + Archive::~Archive() { + gpuFree(); + } + + template + void Archive::resizeBuffer(size_type size) { + if (size <= buffer_size_m) return; + +#if defined(KOKKOS_ENABLE_HIP) + static constexpr size_type kGranularity = 65536; + size = ((size + kGranularity - 1) / kGranularity) * kGranularity; +#endif + pointer_type new_ptr = nullptr; + void* vptr = nullptr; +#if defined(KOKKOS_ENABLE_CUDA) + (void)cudaMalloc(&vptr, size); +#else + (void)hipMalloc(&vptr, size); +#endif + new_ptr = static_cast(vptr); + + if (buffer_ptr_m && buffer_size_m > 0) { +#if defined(KOKKOS_ENABLE_CUDA) + (void)cudaMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, cudaMemcpyDeviceToDevice); + (void)cudaFree(buffer_ptr_m); +#else + (void)hipMemcpy(new_ptr, buffer_ptr_m, buffer_size_m, hipMemcpyDeviceToDevice); + (void)hipFree(buffer_ptr_m); +#endif + } + + buffer_ptr_m = new_ptr; + buffer_size_m = size; + } + + template + void Archive::reallocBuffer(size_type size) { + gpuFree(); + gpuAlloc(size); + } + +#else // CPU path + template Archive::Archive(size_type size) : writepos_m(0) , readpos_m(0) , buffer_m("buffer", size) {} - // ----------------------------------- - // Scalar serialize + template + Archive::~Archive() = default; + + template + void Archive::resizeBuffer(size_type size) { + Kokkos::resize(buffer_m, size); + } + + template + void Archive::reallocBuffer(size_type size) { + Kokkos::realloc(buffer_m, size); + } + +#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP + + // ================================================================= + // Serialize — scalar + // ================================================================= + template template void Archive::serialize(const Kokkos::View& view, size_type nsends) { - constexpr size_t size = sizeof(T); - char* dst_ptr = (char*)(buffer_m.data()) + writepos_m; - char* src_ptr = (char*)(view.data()); - assert(writepos_m + (nsends * size) <= buffer_m.size()); - // construct temp views of the src/dst buffers of the correct size (bytes) - using src_view_type = - Kokkos::View::memory_space, - Kokkos::MemoryTraits>; - using dst_view_type = - Kokkos::View>; - src_view_type src_view(src_ptr, size * nsends); - dst_view_type dst_view(dst_ptr, size * nsends); - Kokkos::deep_copy(dst_view, src_view); + using exec_space = typename Kokkos::View::execution_space; + using policy_type = Kokkos::RangePolicy; + + size_t size = sizeof(T); + auto base = bufferData(); + auto writepos = writepos_m; + Kokkos::parallel_for( + "Archive::serialize()", policy_type(0, nsends), KOKKOS_LAMBDA(const size_type i) { + const char* src = reinterpret_cast(view.data() + i); + char* dst = base + i * size + writepos; + copyBytes(dst, src, size); + }); Kokkos::fence(); - writepos_m += (nsends * size); + writepos_m += size * nsends; } - // ----------------------------------- - // Vector serialize + // ================================================================= + // Serialize — scalar with hash + // ================================================================= + + template + template + void Archive::serialize(const Kokkos::View& view, + const HashView& hash, size_type nsends) { + using exec_space = HashView::execution_space; + using policy_type = Kokkos::RangePolicy; + using BufferPtr = pointer_type; + + SerializeHashFunctor f{view.data(), hash, bufferData(), + sizeof(T), writepos_m}; + + Kokkos::parallel_for("Archive::serialize(hash)", policy_type(0, nsends), f); + Kokkos::fence(); + writepos_m += sizeof(T) * nsends; + } + + // ================================================================= + // Serialize — vector + // ================================================================= + template template void Archive::serialize( const Kokkos::View*, ViewArgs...>& view, size_type nsends) { - constexpr size_t size = sizeof(T); - char* dst_ptr = (char*)(buffer_m.data()); - ippl::Vector* src_ptr = view.data(); - auto wp = writepos_m; - // The Kokkos range policies expect int64 - // so we have to explicitly specify size_type (uint64) using exec_space = typename Kokkos::View::execution_space; + + size_t size = sizeof(T); + auto base = bufferData(); + auto writepos = writepos_m; + // Default index type for range policies is int64, + // so we have to explicitly specify size_type (uint64) using mdrange_t = Kokkos::MDRangePolicy, Kokkos::IndexType, exec_space>; Kokkos::parallel_for( - "Archive::serialize()", mdrange_t({0, 0}, {(long int)nsends, Dim}), + "Archive::serialize()", + // The constructor for Kokkos range policies always + // expects int64 regardless of index type provided + // by template parameters, so the typecast is necessary + // to avoid compiler warnings + mdrange_t({0, 0}, {(long int)nsends, Dim}), KOKKOS_LAMBDA(const size_type i, const size_t d) { - const char* src = reinterpret_cast(&src_ptr[i][d]); - char* dst = dst_ptr + (Dim * i + d) * size + wp; + const T value = view.data()[i][d]; + const char* src = reinterpret_cast(&value); + char* dst = base + (Dim * i + d) * size + writepos; copyBytes(dst, src, size); }); + Kokkos::fence(); + writepos_m += Dim * size * nsends; + } + + // ================================================================= + // Serialize — vector with hash + // ================================================================= + template + template + void Archive::serialize( + const Kokkos::View*, ViewArgs...>& view, const HashView& hash, + size_type nsends) { + using exec_space = typename HashView::execution_space; + size_t size = sizeof(T); + using BufferPtr = pointer_type; + using mdrange_t = + Kokkos::MDRangePolicy, Kokkos::IndexType, exec_space>; + + SerializeHashVectorFunctor f{ + view.data(), hash, bufferData(), size, writepos_m}; + + Kokkos::parallel_for("Archive::serialize(hash, vector)", + mdrange_t({0, 0}, {(long int)nsends, Dim}), f); Kokkos::fence(); writepos_m += Dim * size * nsends; } - // ----------------------------------- - // Scalar Deserialize + // ================================================================= + // Deserialize — scalar + // ================================================================= + template template void Archive::deserialize(Kokkos::View& view, size_type nrecvs) { - // if we have to enlarge the destination view + using exec_space = typename Kokkos::View::execution_space; + using policy_type = Kokkos::RangePolicy; + + size_t size = sizeof(T); if (nrecvs > view.extent(0)) { Kokkos::realloc(view, nrecvs); } - // - constexpr size_t size = sizeof(T); - char* src_ptr = (char*)(buffer_m.data()) + readpos_m; - char* dst_ptr = (char*)(view.data()); - assert(readpos_m + (nrecvs * size) <= buffer_m.size()); - // construct temp views of the src/dst buffers of the correct size (bytes) - using src_view_type = - Kokkos::View>; - using dst_view_type = - Kokkos::View::memory_space, - Kokkos::MemoryTraits>; - src_view_type src_view(src_ptr, size * nrecvs); - dst_view_type dst_view(dst_ptr, size * nrecvs); - Kokkos::deep_copy(dst_view, src_view); + auto base = bufferData(); + auto readpos = readpos_m; + Kokkos::parallel_for( + "Archive::deserialize()", policy_type(0, nrecvs), KOKKOS_LAMBDA(const size_type i) { + const char* src = base + i * size + readpos; + char* dst = reinterpret_cast(view.data() + i); + copyBytes(dst, src, size); + }); + // Wait for deserialization kernel to complete + // (as with serialization kernels) Kokkos::fence(); - readpos_m += (nrecvs * size); + readpos_m += size * nrecvs; } - // ----------------------------------- - // Vector Deserialize + // ================================================================= + // Deserialize — vector + // ================================================================= + template template void Archive::deserialize(Kokkos::View*, ViewArgs...>& view, - size_type nrecvs) - { - // if we have to enlarge the destination view + size_type nrecvs) { + using exec_space = typename Kokkos::View::execution_space; + + size_t size = sizeof(T); if (nrecvs > view.extent(0)) { Kokkos::realloc(view, nrecvs); } - // - constexpr size_t size = sizeof(T); - char* src_ptr = (char*)(buffer_m.data()); - ippl::Vector* dst_ptr = view.data(); - auto rp = readpos_m; - using exec_space = typename Kokkos::View::execution_space; using mdrange_t = Kokkos::MDRangePolicy, Kokkos::IndexType, exec_space>; + auto base = bufferData(); + auto readpos = readpos_m; Kokkos::parallel_for( "Archive::deserialize()", mdrange_t({0, 0}, {(long int)nrecvs, Dim}), KOKKOS_LAMBDA(const size_type i, const size_t d) { - const char* src = src_ptr + (Dim * i + d) * size + rp; - char* dst = reinterpret_cast(&dst_ptr[i][d]); + const char* src = base + (Dim * i + d) * size + readpos; + T value; + char* dst = reinterpret_cast(&value); + copyBytes(dst, src, size); + view.data()[i](d) = value; + }); + Kokkos::fence(); + readpos_m += Dim * size * nrecvs; + } + + // ================================================================= + // Deserialize — scalar with offset + // ================================================================= + + template + template + void Archive::deserialize(Kokkos::View& view, + size_type offset, size_type nrecvs) { + using exec_space = typename Kokkos::View::execution_space; + using policy_type = Kokkos::RangePolicy; + size_t size = sizeof(T); + if (offset + nrecvs > view.extent(0)) { + Kokkos::resize(view, offset + nrecvs); + } + auto base = bufferData(); + auto readpos = readpos_m; + Kokkos::parallel_for( + "Archive::deserialize(offset)", policy_type(0, nrecvs), + KOKKOS_LAMBDA(const size_type i) { + const char* src = base + i * size + readpos; + char* dst = reinterpret_cast(view.data() + offset + i); + copyBytes(dst, src, size); + }); + Kokkos::fence(); + readpos_m += size * nrecvs; + } + + // ================================================================= + // Deserialize — vector with offset + // ================================================================= + + template + template + void Archive::deserialize(Kokkos::View*, ViewArgs...>& view, + size_type offset, size_type nrecvs) { + using exec_space = typename Kokkos::View::execution_space; + size_t size = sizeof(T); + if (offset + nrecvs > view.extent(0)) { + Kokkos::resize(view, offset + nrecvs); + } + using mdrange_t = + Kokkos::MDRangePolicy, Kokkos::IndexType, exec_space>; + auto base = bufferData(); + auto readpos = readpos_m; + Kokkos::parallel_for( + "Archive::deserialize(offset, vector)", mdrange_t({0, 0}, {(long int)nrecvs, Dim}), + KOKKOS_LAMBDA(const size_type i, const size_t d) { + const char* src = base + (Dim * i + d) * size + readpos; + T value; + char* dst = reinterpret_cast(&value); copyBytes(dst, src, size); + view.data()[offset + i](d) = value; }); Kokkos::fence(); readpos_m += Dim * size * nrecvs; diff --git a/src/Communicate/BufferHandler.hpp b/src/Communicate/BufferHandler.hpp index c6d57f0a3..e27369c06 100644 --- a/src/Communicate/BufferHandler.hpp +++ b/src/Communicate/BufferHandler.hpp @@ -1,6 +1,8 @@ #ifndef IPPL_BUFFER_HANDLER_HPP #define IPPL_BUFFER_HANDLER_HPP +#include + namespace ippl { template @@ -11,6 +13,15 @@ namespace ippl { DefaultBufferHandler::getBuffer(size_type size, double overallocation) { size_type requiredSize = static_cast(size * overallocation); + // Round up to page granularity to avoid 0-byte or tiny allocations + // and to align with the GPU-aware MPI registration cache. + constexpr size_type PAGE_SIZE = 4096; + if (requiredSize < PAGE_SIZE) { + requiredSize = PAGE_SIZE; + } else { + requiredSize = ((requiredSize + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE; + } + auto freeBuffer = findFreeBuffer(requiredSize); if (freeBuffer != nullptr) { return getFreeBuffer(freeBuffer); @@ -120,17 +131,12 @@ namespace ippl { template typename DefaultBufferHandler::buffer_type DefaultBufferHandler::reallocateLargestFreeBuffer(size_type requiredSize) { - auto largest_it = std::prev(free_buffers.end()); - buffer_type buffer = *largest_it; - - freeSize_m -= buffer->getBufferSize(); - usedSize_m += requiredSize; - - free_buffers.erase(buffer); - buffer->reallocBuffer(requiredSize); - - used_buffers.insert(buffer); - return buffer; + // Always allocate a + // fresh buffer instead of reallocating the largest free one: a + // free + alloc cycle can release the device pointer that a GPU-aware + // MPI's registration cache still holds. Keeping the old buffers in the free + // pool and returning a new buffer at a new address sidesteps this. + return allocateNewBuffer(requiredSize); } template @@ -145,4 +151,4 @@ namespace ippl { } // namespace ippl -#endif +#endif \ No newline at end of file diff --git a/src/Communicate/Buffers.cpp b/src/Communicate/Buffers.cpp index 026633fec..e99100bc2 100644 --- a/src/Communicate/Buffers.cpp +++ b/src/Communicate/Buffers.cpp @@ -32,13 +32,13 @@ namespace ippl { } void Communicator::deleteAllBuffers() { - buffer_handlers_m->forAll([](BufferHandler&& bh) { + getBufferHandler().forAll([](BufferHandler&& bh) { bh.deleteAllBuffers(); }); } void Communicator::freeAllBuffers() { - buffer_handlers_m->forAll([](BufferHandler&& bh) { + getBufferHandler().forAll([](BufferHandler&& bh) { bh.freeAllBuffers(); }); } diff --git a/src/Communicate/Buffers.hpp b/src/Communicate/Buffers.hpp index 98c0b513f..597461b4d 100644 --- a/src/Communicate/Buffers.hpp +++ b/src/Communicate/Buffers.hpp @@ -20,21 +20,23 @@ // exchanging particle data between ranks. // +#include "Utility/IpplTimings.h" namespace ippl { namespace mpi { template Communicator::buffer_type Communicator::getBuffer(size_type size, double overallocation) { - auto& buffer_handler = buffer_handlers_m->get(); + auto& buffer_handler = getBufferHandler().get(); - return buffer_handler.getBuffer(size * sizeof(T), - std::max(overallocation, defaultOveralloc_m)); + auto buf = buffer_handler.getBuffer(size * sizeof(T), + std::max(overallocation, defaultOveralloc_m)); + return buf; } template void Communicator::freeBuffer(Communicator::buffer_type buffer) { - auto& buffer_handler = buffer_handlers_m->get(); + auto& buffer_handler = getBufferHandler().get(); buffer_handler.freeBuffer(buffer); } diff --git a/src/Communicate/Communicator.cpp b/src/Communicate/Communicator.cpp index 2c33f939f..e1a8b777b 100644 --- a/src/Communicate/Communicator.cpp +++ b/src/Communicate/Communicator.cpp @@ -3,25 +3,27 @@ namespace ippl::mpi { + namespace { + // Populate rank_m / size_m from the live communicator. + void cacheRankAndSize(const MPI_Comm& comm, int& rank, int& size) { + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &size); + } + } // namespace + Communicator::Communicator() - : buffer_handlers_m(get_buffer_handler_instance()) - , comm_m(new MPI_Comm(MPI_COMM_WORLD)) { - MPI_Comm_rank(*comm_m, &rank_m); - MPI_Comm_size(*comm_m, &size_m); + : comm_m(new MPI_Comm(MPI_COMM_WORLD)) { + cacheRankAndSize(*comm_m, rank_m, size_m); } Communicator::Communicator(MPI_Comm comm) { - buffer_handlers_m = get_buffer_handler_instance(); - comm_m = std::make_shared(comm); - MPI_Comm_rank(*comm_m, &rank_m); - MPI_Comm_size(*comm_m, &size_m); + comm_m = std::make_shared(comm); + cacheRankAndSize(*comm_m, rank_m, size_m); } Communicator& Communicator::operator=(MPI_Comm comm) { - buffer_handlers_m = get_buffer_handler_instance(); - comm_m = std::make_shared(comm); - MPI_Comm_rank(*comm_m, &rank_m); - MPI_Comm_size(*comm_m, &size_m); + comm_m = std::make_shared(comm); + cacheRankAndSize(*comm_m, rank_m, size_m); return *this; } @@ -41,14 +43,4 @@ namespace ippl::mpi { return (flag != 0); } - // --------------------------------------- - // singleton access to buffer manager - // --------------------------------------- - std::shared_ptr Communicator::get_buffer_handler_instance() { - static std::shared_ptr comm_buff_handler_ptr{nullptr}; - if (comm_buff_handler_ptr == nullptr) { - comm_buff_handler_ptr = std::make_shared(); - } - return comm_buff_handler_ptr; - } } // namespace ippl::mpi diff --git a/src/Communicate/Communicator.h b/src/Communicate/Communicator.h index 9819ab3bd..7685d20d5 100644 --- a/src/Communicate/Communicator.h +++ b/src/Communicate/Communicator.h @@ -15,6 +15,7 @@ //////////////////////////////////////////////// // For message size check; see below + #include #include @@ -27,7 +28,6 @@ namespace ippl { namespace mpi { - class Communicator : public TagMaker { public: Communicator(); @@ -157,16 +157,21 @@ namespace ippl { const MPI_Comm& getCommunicator() const noexcept { return *comm_m; } + // MPI uses int for byte counts; messages exceeding INT_MAX must + // be split. This shared check aborts the run with a single + // diagnostic instead of silently truncating. + void assertMessageSize(size_type msize) const { + if (msize > static_cast(INT_MAX)) { + std::cerr << "Communicator: message size " << msize + << " bytes exceeds INT_MAX (" << INT_MAX << ")\n"; + MPI_Abort(*comm_m, -1); + } + } + template void recv(int src, int tag, Buffer& buffer, Archive& ar, size_type msize, size_type nrecvs) { - // Temporary fix. MPI communication seems to have problems when the - // count argument exceeds the range of int, so large messages should - // be split into smaller messages - if (msize > INT_MAX) { - std::cerr << "Message size exceeds range of int" << std::endl; - this->abort(); - } + assertMessageSize(msize); MPI_Status status; MPI_Recv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &status); @@ -176,20 +181,27 @@ namespace ippl { template void isend(int dest, int tag, Buffer& buffer, Archive& ar, MPI_Request& request, size_type nsends) { - if (ar.getSize() > INT_MAX) { - std::cerr << "Message size exceeds range of int" << std::endl; - this->abort(); - } + assertMessageSize(ar.getSize()); buffer.serialize(ar, nsends); MPI_Isend(ar.getBuffer(), ar.getSize(), MPI_BYTE, dest, tag, *comm_m, &request); } + template + void isend(int dest, int tag, Archive& ar, MPI_Request& request) { + assertMessageSize(ar.getSize()); + MPI_Isend(ar.getBuffer(), ar.getSize(), MPI_BYTE, dest, tag, *comm_m, &request); + } + + template + void recv(int src, int tag, Archive& ar, size_type msize) { + assertMessageSize(msize); + MPI_Status status; + MPI_Recv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &status); + } + template void irecv(int src, int tag, Archive& ar, MPI_Request& request, size_type msize) { - if (msize > INT_MAX) { - std::cerr << "Message size exceeds range of int" << std::endl; - this->abort(); - } + assertMessageSize(msize); MPI_Irecv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &request); } @@ -201,7 +213,11 @@ namespace ippl { std::vector gatherLogsFromAllRanks(const std::vector& localLogs); void writeLogsToFile(const std::vector& allLogs, const std::string& filename); - std::shared_ptr buffer_handlers_m; + static buffer_handler_type& getBufferHandler() { + static buffer_handler_type handler; + return handler; + } + double defaultOveralloc_m = 1.0; ///////////////////////////////////////////////////////////////////////////////////// @@ -210,12 +226,10 @@ namespace ippl { std::shared_ptr comm_m; int size_m; int rank_m; - - public: - std::shared_ptr get_buffer_handler_instance(); }; } // namespace mpi + } // namespace ippl #include "Communicate/Collectives.hpp" diff --git a/src/Communicate/CommunicatorLogging.cpp b/src/Communicate/CommunicatorLogging.cpp index 541ff4074..39c0197e8 100644 --- a/src/Communicate/CommunicatorLogging.cpp +++ b/src/Communicate/CommunicatorLogging.cpp @@ -7,8 +7,16 @@ #include "Communicate/Communicator.h" #include "Communicate/LogEntry.h" +#include "Communicate/LoggingBufferHandler.h" namespace ippl::mpi { + + template + struct is_a_logger : std::false_type {}; + + template + struct is_a_logger> : std::true_type {}; + void Communicator::printLogs(const std::string& filename) { std::vector localLogs = gatherLocalLogs(); @@ -24,20 +32,17 @@ namespace ippl::mpi { } } - template - struct is_a_logger : std::false_type {}; - - template - struct is_a_logger > : std::true_type {}; - std::vector Communicator::gatherLocalLogs() { std::vector localLogs; - if constexpr (is_a_logger::value) { - buffer_handlers_m->forAll([&](auto& loggingHandler) { - const auto& logs = loggingHandler.getLogs(); + + getBufferHandler().forAll([&](auto& handler) { + using handler_t = std::decay_t; + if constexpr (is_a_logger::value) { + const auto& logs = handler.getLogs(); localLogs.insert(localLogs.end(), logs.begin(), logs.end()); - }); - } + } + }); + return localLogs; } @@ -88,11 +93,9 @@ namespace ippl::mpi { size_t offset = 0; while (offset < buffer.size()) { - LogEntry logEntry = LogEntry::deserialize(buffer, offset); - - logs.push_back(logEntry); - - offset += logEntry.serialize().size(); + // deserializeAdvance walks offset past the consumed bytes - + // avoids the O(N^2) blowup of re-serializing each entry + logs.push_back(LogEntry::deserializeAdvance(buffer, offset)); } return logs; } diff --git a/src/Communicate/Environment.cpp b/src/Communicate/Environment.cpp index 8e19df365..706c821f1 100644 --- a/src/Communicate/Environment.cpp +++ b/src/Communicate/Environment.cpp @@ -9,9 +9,23 @@ namespace ippl { namespace mpi { Environment::Environment(int& argc, char**& argv, const MPI_Comm& comm) - : comm_m(comm) { + : comm_m(comm) + , threadMultiple_m(false) { if (!initialized()) { - MPI_Init(&argc, &argv); + int provided = MPI_THREAD_SINGLE; + int rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); + if (rc != MPI_SUCCESS) { + std::cerr << "MPI_Init_thread failed (rc=" << rc << ")" << std::endl; + std::exit(EXIT_FAILURE); + } + threadMultiple_m = (provided >= MPI_THREAD_MULTIPLE); + if (!threadMultiple_m) { + int rank = 0; + MPI_Comm_rank(comm_m, &rank); + if (rank == 0) { + std::cerr << "MPI doesn't support MPI_THREAD_MULTIPLE!" << std::endl; + } + } } } diff --git a/src/Communicate/Environment.h b/src/Communicate/Environment.h index fd03a9ee4..bad195df4 100644 --- a/src/Communicate/Environment.h +++ b/src/Communicate/Environment.h @@ -23,10 +23,13 @@ namespace ippl { static bool finalized(); + bool threadMultiple() noexcept { return threadMultiple_m; } + void abort(int errorcode = -1) noexcept { MPI_Abort(comm_m, errorcode); } private: MPI_Comm comm_m; + bool threadMultiple_m; }; } // namespace mpi } // namespace ippl diff --git a/src/Communicate/LogEntry.cpp b/src/Communicate/LogEntry.cpp index 444221d6d..66539e879 100644 --- a/src/Communicate/LogEntry.cpp +++ b/src/Communicate/LogEntry.cpp @@ -40,28 +40,31 @@ namespace ippl { return buffer; } - LogEntry LogEntry::deserialize(const std::vector& buffer, size_t offset) { + LogEntry LogEntry::deserializeAdvance(const std::vector& buffer, size_t& offset) { LogEntry entry; - size_t current_pos = offset; - entry.methodName = deserializeString(buffer, current_pos); - entry.usedSize = deserializeBasicType(buffer, current_pos); - entry.freeSize = deserializeBasicType(buffer, current_pos); - entry.memorySpace = deserializeString(buffer, current_pos); - entry.rank = deserializeBasicType(buffer, current_pos); + entry.methodName = deserializeString(buffer, offset); + entry.usedSize = deserializeBasicType(buffer, offset); + entry.freeSize = deserializeBasicType(buffer, offset); + entry.memorySpace = deserializeString(buffer, offset); + entry.rank = deserializeBasicType(buffer, offset); - auto duration = deserializeBasicType(buffer, current_pos); + auto duration = deserializeBasicType(buffer, offset); entry.timestamp = std::chrono::time_point( std::chrono::high_resolution_clock::duration(duration)); - size_t mapSize = deserializeBasicType(buffer, current_pos); + size_t mapSize = deserializeBasicType(buffer, offset); for (size_t i = 0; i < mapSize; ++i) { - std::string key = deserializeString(buffer, current_pos); - std::string value = deserializeString(buffer, current_pos); + std::string key = deserializeString(buffer, offset); + std::string value = deserializeString(buffer, offset); entry.parameters[key] = value; } return entry; } + LogEntry LogEntry::deserialize(const std::vector& buffer, size_t offset) { + return deserializeAdvance(buffer, offset); + } + } // namespace ippl diff --git a/src/Communicate/LogEntry.h b/src/Communicate/LogEntry.h index a853c3431..1929915c3 100644 --- a/src/Communicate/LogEntry.h +++ b/src/Communicate/LogEntry.h @@ -20,6 +20,11 @@ namespace ippl { std::vector serialize() const; static LogEntry deserialize(const std::vector& buffer, size_t offset = 0); + + /// Variant of deserialize() that advances the caller's offset past + /// the just-read entry. Lets callers walk a byte-stream of entries + /// without re-serializing each one to compute its size. + static LogEntry deserializeAdvance(const std::vector& buffer, size_t& offset); }; template diff --git a/src/Communicate/LoggingBufferHandler.h b/src/Communicate/LoggingBufferHandler.h index f6a0bd5d6..95fb418cd 100644 --- a/src/Communicate/LoggingBufferHandler.h +++ b/src/Communicate/LoggingBufferHandler.h @@ -1,7 +1,6 @@ #ifndef IPPL_LOGGING_BUFFER_HANDLER_H #define IPPL_LOGGING_BUFFER_HANDLER_H -#include #include #include #include diff --git a/src/Field/BareField.hpp b/src/Field/BareField.hpp index 695576af3..14a36eb45 100644 --- a/src/Field/BareField.hpp +++ b/src/Field/BareField.hpp @@ -146,7 +146,7 @@ namespace ippl { template void BareField::fillHalo() { if (layout_m->comm.size() > 1) { - halo_m.fillHalo(dview_m, layout_m); + halo_m.fillHalo(dview_m, layout_m, nghost_m); } if (layout_m->isAllPeriodic_m) { using Op = typename detail::HaloCells::assign; @@ -157,7 +157,7 @@ namespace ippl { template void BareField::accumulateHalo() { if (layout_m->comm.size() > 1) { - halo_m.accumulateHalo(dview_m, layout_m); + halo_m.accumulateHalo(dview_m, layout_m, nghost_m); } if (layout_m->isAllPeriodic_m) { using Op = typename detail::HaloCells::rhs_plus_assign; diff --git a/src/Field/HaloCells.h b/src/Field/HaloCells.h index c4d87b9bd..6e7606083 100644 --- a/src/Field/HaloCells.h +++ b/src/Field/HaloCells.h @@ -57,7 +57,7 @@ namespace ippl { * @param view the original field data * @param layout the field layout storing the domain decomposition */ - void accumulateHalo(view_type& view, Layout_t* layout); + void accumulateHalo(view_type& view, Layout_t* layout, int nghost); /*! * Send halo data to internal cells for only the physical cells @@ -75,7 +75,7 @@ namespace ippl { * @param view the original field data * @param layout the field layout storing the domain decomposition */ - void fillHalo(view_type&, Layout_t* layout); + void fillHalo(view_type&, Layout_t* layout, int nghost); /*! * Pack the field data to be sent into a contiguous array. @@ -139,7 +139,8 @@ namespace ippl { * unpack function call */ template - void exchangeBoundaries(view_type& view, Layout_t* layout, SendOrder order, int nghost = 1); + void exchangeBoundaries(view_type& view, Layout_t* layout, SendOrder order, + int nghost); /*! * Extract the subview of the original data. This does not copy. diff --git a/src/Field/HaloCells.hpp b/src/Field/HaloCells.hpp index eb56642ce..d99ea638f 100644 --- a/src/Field/HaloCells.hpp +++ b/src/Field/HaloCells.hpp @@ -7,6 +7,7 @@ #include #include "Utility/IpplException.h" +#include "Utility/ParallelDispatch.h" #include "Communicate/Communicator.h" @@ -16,8 +17,8 @@ namespace ippl { HaloCells::HaloCells() {} template - void HaloCells::accumulateHalo(view_type& view, Layout_t* layout) { - exchangeBoundaries(view, layout, HALO_TO_INTERNAL); + void HaloCells::accumulateHalo(view_type& view, Layout_t* layout, int nghost) { + exchangeBoundaries(view, layout, HALO_TO_INTERNAL, nghost); } template @@ -25,8 +26,8 @@ namespace ippl { exchangeBoundaries(view, layout, HALO_TO_INTERNAL_NOGHOST, nghost); } template - void HaloCells::fillHalo(view_type& view, Layout_t* layout) { - exchangeBoundaries(view, layout, INTERNAL_TO_HALO); + void HaloCells::fillHalo(view_type& view, Layout_t* layout, int nghost) { + exchangeBoundaries(view, layout, INTERNAL_TO_HALO, nghost); } template @@ -45,9 +46,9 @@ namespace ippl { auto ldom = layout->getLocalNDIndex(); for (const auto& axis : ldom) { if ((axis.length() == 1) && (Dim != 1)) { - throw std::runtime_error( - "HaloCells: Cannot do neighbour exchange when domain decomposition " - "contains planes!"); + throw IpplException( + "HaloCells::exchangeBoundaries", + "Cannot do neighbour exchange when domain decomposition contains planes."); } } @@ -62,7 +63,7 @@ namespace ippl { totalRequests += componentNeighbors.size(); } - int me=Comm->rank(); + int me = Comm->rank(); using memory_space = typename view_type::memory_space; using buffer_type = mpi::Communicator::buffer_type; diff --git a/src/FieldLayout/FieldLayout.h b/src/FieldLayout/FieldLayout.h index 780040e8b..c8f1c1e2b 100644 --- a/src/FieldLayout/FieldLayout.h +++ b/src/FieldLayout/FieldLayout.h @@ -201,7 +201,7 @@ namespace ippl { FieldLayout(const mpi::Communicator& = MPI_COMM_WORLD); FieldLayout(mpi::Communicator, const NDIndex& domain, std::array decomp, - bool isAllPeriodic = false); + bool isAllPeriodic = false, int nghost = 1); // Destructor: Everything deletes itself automatically ... the base // class destructors inform all the FieldLayoutUser's we're going away. @@ -213,38 +213,29 @@ namespace ippl { // FieldLayout constructors: void initialize(const NDIndex& domain, std::array decomp, - bool isAllPeriodic = false); + bool isAllPeriodic = false, int nghost = 1); // Return the domain. const NDIndex& getDomain() const { return gDomain_m; } - // Compare FieldLayouts to see if they represent the same domain; if - // dimensionalities are different, the NDIndex operator==() will return - // false: + // Compare FieldLayouts. Different dimensionalities or different global + // domains are not equal; same global domain but different per-rank + // local-domain decompositions are also not equal. template bool operator==(const FieldLayout& x) const { - - // Throw exception if the domains are not the same - if (gDomain_m != x.getDomain()) { - throw std::runtime_error("FieldLayout: only FieldLayouts with the same global domain should be compared"); - } - - return gDomain_m == x.getDomain(); - } - - bool operator==(const FieldLayout& x) const { - - // Throw exception if the domains are not the same - if (gDomain_m != x.getDomain()) { - throw std::runtime_error("FieldLayout: only FieldLayouts with the same global domain should be compared"); - } - - for (unsigned int i = 0; i < Dim; ++i) { - if (hLocalDomains_m(comm.rank())[i] != x.getLocalNDIndex()[i]) { + if constexpr (Dim != Dim2) { + return false; + } else { + if (gDomain_m != x.getDomain()) { return false; } + for (unsigned int i = 0; i < Dim; ++i) { + if (hLocalDomains_m(comm.rank())[i] != x.getLocalNDIndex()[i]) { + return false; + } + } + return true; } - return true; } // for the requested dimension, report if the distribution is @@ -330,7 +321,7 @@ namespace ippl { * Finds all neighboring ranks based on the field layout * @param nghost number of ghost cells (default 1) */ - void findNeighbors(int nghost = 1); + void findNeighbors(int nghost); /*! * Adds a neighbor to the neighbor list @@ -389,6 +380,9 @@ namespace ippl { // Minimum width of all the local domains for each dimension unsigned int minWidth_m[Dim]; + // Nghost needed for computing send/receive ranges + int nghost_m; + void calcWidths(); }; diff --git a/src/FieldLayout/FieldLayout.hpp b/src/FieldLayout/FieldLayout.hpp index 2bcb1bcd1..ad6ae5a53 100644 --- a/src/FieldLayout/FieldLayout.hpp +++ b/src/FieldLayout/FieldLayout.hpp @@ -37,7 +37,8 @@ namespace ippl { FieldLayout::FieldLayout(const mpi::Communicator& communicator) : comm(communicator) , dLocalDomains_m("local domains (device)", 0) - , hLocalDomains_m(Kokkos::create_mirror_view(dLocalDomains_m)) { + , hLocalDomains_m(Kokkos::create_mirror_view(dLocalDomains_m)) + , nghost_m(1) { for (unsigned int d = 0; d < Dim; ++d) { minWidth_m[d] = 0; } @@ -45,9 +46,9 @@ namespace ippl { template FieldLayout::FieldLayout(mpi::Communicator communicator, const NDIndex& domain, - std::array isParallel, bool isAllPeriodic) + std::array isParallel, bool isAllPeriodic, int nghost) : FieldLayout(communicator) { - initialize(domain, isParallel, isAllPeriodic); + initialize(domain, isParallel, isAllPeriodic, nghost); } template @@ -60,7 +61,7 @@ namespace ippl { hLocalDomains_m(i) = domains[i]; } - findNeighbors(); + findNeighbors(nghost_m); Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m); @@ -69,7 +70,7 @@ namespace ippl { template void FieldLayout::initialize(const NDIndex& domain, std::array isParallel, - bool isAllPeriodic) { + bool isAllPeriodic, int nghost) { int nRanks = comm.size(); gDomain_m = domain; @@ -78,11 +79,17 @@ namespace ippl { isParallelDim_m = isParallel; + nghost_m = nghost; + if (nRanks < 2) { Kokkos::resize(dLocalDomains_m, nRanks); Kokkos::resize(hLocalDomains_m, nRanks); hLocalDomains_m(0) = domain; Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m); + // Even on a single rank we must populate minWidth_m so that + // getDistribution(d) returns the right answer; without this the + // serial-build path silently reports every dim as parallel. + calcWidths(); return; } @@ -95,8 +102,11 @@ namespace ippl { } if (totparelems < nRanks) { - throw std::runtime_error("FieldLayout:initialize: domain can only be partitioned in to " - + std::to_string(totparelems) + " local domains, but there are " + std::to_string(nRanks) + " ranks, decrease the number of ranks or increase the domain."); + throw std::runtime_error( + "FieldLayout:initialize: domain can only be partitioned in to " + + std::to_string(totparelems) + " local domains, but there are " + + std::to_string(nRanks) + + " ranks, decrease the number of ranks or increase the domain."); } Kokkos::resize(dLocalDomains_m, nRanks); @@ -105,7 +115,7 @@ namespace ippl { detail::Partitioner partitioner; partitioner.split(domain, hLocalDomains_m, isParallel, nRanks); - findNeighbors(); + findNeighbors(nghost); Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m); @@ -288,7 +298,7 @@ namespace ippl { // 0 - touching the lower axis value // 1 - touching the upper axis value // 2 - parallel to the axis - if (intersect[d].length() == 1) { + if (intersect[d].length() == static_cast(nghost)) { if (gnd[d].first() != intersect[d].first()) { index += digit; } diff --git a/src/FieldLayout/SubFieldLayout.hpp b/src/FieldLayout/SubFieldLayout.hpp index 53ee24c11..50a83f9e2 100644 --- a/src/FieldLayout/SubFieldLayout.hpp +++ b/src/FieldLayout/SubFieldLayout.hpp @@ -102,7 +102,7 @@ namespace ippl { } } - this->findNeighbors(); + this->findNeighbors(this->nghost_m); Kokkos::deep_copy(this->dLocalDomains_m, this->hLocalDomains_m); diff --git a/src/Ippl.h b/src/Ippl.h index 174c72a29..a606c655c 100644 --- a/src/Ippl.h +++ b/src/Ippl.h @@ -10,7 +10,6 @@ #include "Types/IpplTypes.h" #include "Utility/Inform.h" -#include "Utility/ParallelDispatch.h" #include "Communicate/Communicator.h" #include "Communicate/Environment.h" @@ -46,6 +45,8 @@ namespace ippl { } // namespace detail } // namespace ippl +#include "Utility/ParallelDispatch.h" + // FIMXE remove (only for backwards compatibility) #include "IpplCore.h" diff --git a/src/LinearSolvers/PCG.h b/src/LinearSolvers/PCG.h index 3fdf17742..07a29426a 100644 --- a/src/LinearSolvers/PCG.h +++ b/src/LinearSolvers/PCG.h @@ -356,6 +356,18 @@ namespace ippl { FieldRHS>() , preconditioner_m(nullptr) {}; + /* + * Allocate the extra PCG work fields once, together with the CG + * work fields r, d, and q. + * + * s and pcond_out are scratch buffers for M^{-1} r. They intentionally + * keep the default NoBcFace boundary conditions. Only the search + * direction d gets the physical BCs after the preconditioner result has + * been copied into it. If these scratch fields inherited periodic BCs, + * assignments inside the preconditioner would call PeriodicFace::apply + * and add halo MPI exchanges that did not exist when the preconditioner + * returned fresh temporary NoBcFace fields. + */ void initializeFields(mesh_type& mesh, layout_type& layout) override { CG::initializeFields(mesh, layout); @@ -535,14 +547,13 @@ namespace ippl { std::unique_ptr> preconditioner_m; /* - * Extends the CG workspace with the preconditioner result buffers s and - * pcond_out so operator() does not allocate per solve. - * Preconditioner result buffers, allocated once via initializeFields() - * pcond_out keeps its default NoBcFace BCs: the preconditioner's internal operator - * chain therefore never triggers PeriodicFace::apply MPI calls -- if it - * did, the global MPI sequence would diverge from the master code path - * (where the preconditioner returned a fresh NoBcFace field) and - * intermittent multi-rank halo deadlocks would follow. + * Persistent preconditioner output buffers. They are allocated in + * initializeFields() and reused on every solve to avoid per-iteration + * Field allocation. + * + * These buffers are scratch storage, not physical solution fields, so + * they must keep NoBcFace BCs. The physically meaningful BCs are applied + * only to d after copying pcond_out into it. */ lhs_type s; diff --git a/src/Particle/ParticleAttrib.h b/src/Particle/ParticleAttrib.h index 3d41b9029..bbdce337a 100644 --- a/src/Particle/ParticleAttrib.h +++ b/src/Particle/ParticleAttrib.h @@ -20,8 +20,12 @@ #include "Expression/IpplExpressions.h" +#ifdef IPPL_ENABLE_FFT +#include "FFT/FFT.h" +#endif #include "Interpolation/CIC.h" #include "Particle/ParticleAttribBase.h" +#include "Particle/SortBuffer.h" namespace ippl { @@ -58,6 +62,8 @@ namespace ippl { // discarded. void alloc(size_type) override; + void reserve(size_type) override; + /*! * Particle deletion function. Partition the particles into a valid region * and an invalid region. @@ -73,32 +79,46 @@ namespace ippl { void unpack(size_type) override; void serialize(detail::Archive& ar, size_type nsends) override { - ar.serialize(buf_m, nsends); + ar.serialize(dview_m, nsends); + } + + void serialize(detail::Archive& ar, const hash_type& hash, + size_type nsends) override { + ar.serialize(dview_m, hash, nsends); } void deserialize(detail::Archive& ar, size_type nrecvs) override { ar.deserialize(buf_m, nrecvs); } - virtual ~ParticleAttrib() = default; - + void deserialize(detail::Archive& ar, size_type offset, + size_type nrecvs) override { + this->reserve(offset + nrecvs); + ar.deserialize(dview_m, offset, nrecvs); + } + + KOKKOS_INLINE_FUNCTION virtual ~ParticleAttrib() = default; + size_type size() const override { return dview_m.extent(0); } - + size_type packedSize(const size_type count) const override { return count * sizeof(value_type); } - - void resize(size_type n) { Kokkos::resize(dview_m, n); } /*! - * @brief Reallocate the underlying view with a new size. + * @brief Resize the underlying view, preserving existing entries on grow. * - * This function reallocates the device view to a new size. Existing data is - * discarded and should not be relied upon after this call. Note that this function does not - * apply overallocation. For use from outside, call `ParticleAttrib::alloc(size_type)` - * instead. + * Capacity-only operation; does not change the live particle count. + * Overallocation is the caller's responsibility (`alloc()` / `create()` + * apply it when sizing this view). + */ + void resize(size_type n) { Kokkos::resize(dview_m, n); } + + /*! + * @brief Reallocate the underlying view, discarding existing entries. * - * @param n The new size to allocate in the internal view. + * Capacity-only operation. Does not apply the default overallocation + * factor — call `alloc()` instead from the outside. */ void realloc(size_type n) { Kokkos::realloc(dview_m, n); } @@ -112,11 +132,22 @@ namespace ippl { KOKKOS_INLINE_FUNCTION T& operator()(const size_t i) const { return dview_m(i); } - view_type& getView() { return dview_m; } - - const view_type& getView() const { return dview_m; } + /*! + * Returns a subview of the underlying storage covering only the live + * particle range [0, size()). + */ + view_type getView() { + return Kokkos::subview(dview_m, + Kokkos::make_pair(size_type(0), + static_cast(*(this->localNum_mp)))); + } + const view_type getView() const { + return Kokkos::subview(dview_m, + Kokkos::make_pair(size_type(0), + static_cast(*(this->localNum_mp)))); + } - host_mirror_type getHostMirror() const { return Kokkos::create_mirror(dview_m); } + host_mirror_type getHostMirror() const { return Kokkos::create_mirror(getView()); } void set_name(const std::string& name_) override { size_t len = name_.size(); @@ -227,9 +258,21 @@ namespace ippl { void internalCopy(const hash_type& indices) override; private: - view_type dview_m; - view_type buf_m; + view_type dview_m{"ParticleAttrib::dview", 0}; + view_type buf_m{"ParticleAttrib::buf", 0}; }; + + namespace detail { + template + struct AttribTraits; + + template + struct AttribTraits> { + using value_type = T; + using view_type = + std::decay_t>().getView())>; + }; + } // namespace detail } // namespace ippl #include "Particle/ParticleAttrib.hpp" diff --git a/src/Particle/ParticleAttrib.hpp b/src/Particle/ParticleAttrib.hpp index 1422bb9dd..95a4a2321 100644 --- a/src/Particle/ParticleAttrib.hpp +++ b/src/Particle/ParticleAttrib.hpp @@ -17,8 +17,15 @@ #include "Communicate/DataTypes.h" +#include "Utility/BufferView.h" #include "Utility/IpplTimings.h" +#ifdef IPPL_ENABLE_FFT +#include "FFT/FFT.h" +#endif +#include "Particle/ParticleSort.h" +#include "Particle/SortBuffer.h" + namespace ippl { template @@ -42,6 +49,13 @@ namespace ippl { this->realloc(n * overalloc); } + template + void ParticleAttrib::reserve(size_type n) { + if (this->size() < n) { + this->resize(n); + } + } + template void ParticleAttrib::destroy(const hash_type& deleteIndex, const hash_type& keepIndex, @@ -71,17 +85,12 @@ namespace ippl { Kokkos::parallel_for( "ParticleAttrib::pack()", policy_type(0, size), KOKKOS_LAMBDA(const size_t i) { buf(i) = dview(hash(i)); }); - Kokkos::fence(); } template void ParticleAttrib::unpack(size_type nrecvs) { - auto size = dview_m.extent(0); size_type required = *(this->localNum_mp) + nrecvs; - if (size < required) { - int overalloc = Comm->getDefaultOverallocation(); - this->resize(required * overalloc); - } + this->resize(required); size_type count = *(this->localNum_mp); auto buf = buf_m; @@ -94,7 +103,6 @@ namespace ippl { } template - // KOKKOS_INLINE_FUNCTION ParticleAttrib& ParticleAttrib::operator=(T x) { auto dview = dview_m; using policy_type = Kokkos::RangePolicy; @@ -106,7 +114,6 @@ namespace ippl { template template - // KOKKOS_INLINE_FUNCTION ParticleAttrib& ParticleAttrib::operator=( detail::Expression const& expr) { using capture_type = detail::CapturedExpression; @@ -147,11 +154,11 @@ namespace ippl { const NDIndex& lDom = layout.getLocalNDIndex(); const int nghost = f.getNghost(); - //using policy_type = Kokkos::RangePolicy; const bool useHashView = hash_array.extent(0) > 0; if (useHashView && (iteration_policy.end() > hash_array.extent(0))) { Inform m("scatter"); - m << "Hash array was passed to scatter, but size does not match iteration policy." << endl; + m << "Hash array was passed to scatter, but size does not match iteration policy." + << endl; ippl::Comm->abort(); } auto dview = dview_m; @@ -159,10 +166,8 @@ namespace ippl { Kokkos::parallel_for( "ParticleAttrib::scatter", iteration_policy, KOKKOS_LAMBDA(const size_t idx) { - // map index to possible hash_map size_t mapped_idx = useHashView ? hash_array(idx) : idx; - // find nearest grid point vector_type l = (ppview(mapped_idx) - origin) * invdx + 0.5; Vector index = l; Vector whi = l - index; @@ -170,7 +175,6 @@ namespace ippl { Vector args = index - lDom.first() + nghost; - // scatter const value_type& val = dview(mapped_idx); detail::scatterToField(std::make_index_sequence<1 << Field::dim>{}, view, wlo, whi, args, val); @@ -219,7 +223,6 @@ namespace ippl { Kokkos::parallel_for( "ParticleAttrib::gather", policy_type(0, *(this->localNum_mp)), KOKKOS_LAMBDA(const size_t idx) { - // find nearest grid point vector_type l = (ppview(idx) - origin) * invdx + 0.5; Vector index = l; Vector whi = l - index; @@ -227,9 +230,8 @@ namespace ippl { Vector args = index - lDom.first() + nghost; - // gather - value_type gathered = detail::gatherFromField(std::make_index_sequence<1 << Field::dim>{}, - view, wlo, whi, args); + value_type gathered = detail::gatherFromField( + std::make_index_sequence<1 << Field::dim>{}, view, wlo, whi, args); if (addToAttribute) { dview(idx) += gathered; } else { @@ -240,10 +242,8 @@ namespace ippl { } template - void ParticleAttrib::applyPermutation( - const hash_type& permutation) { - - const auto view = this->getView(); + void ParticleAttrib::applyPermutation(const hash_type& permutation) { + const auto view = this->getView(); // trimmed to localNum_mp const auto size = this->getParticleCount(); view_type temp("copy", size); @@ -255,31 +255,28 @@ namespace ippl { Kokkos::fence(); - Kokkos::deep_copy(Kokkos::subview(view, Kokkos::make_pair(0, size)), temp); + Kokkos::deep_copy(view, temp); } - template - void ParticleAttrib::internalCopy( - const hash_type &indices) { + template + void ParticleAttrib::internalCopy(const hash_type& indices) { auto copySize = indices.size(); - create(copySize); - auto view = this->getView(); - const auto size = this->getParticleCount(); + // Snapshot the current count BEFORE create() increments localNum_mp. + const size_type oldSize = *(this->localNum_mp); - using policy_type = Kokkos::RangePolicy; + create(copySize); // localNum_mp becomes oldSize + copySize + + auto view = this->getView(); Kokkos::parallel_for( - "Copy to temp", policy_type(0, copySize), - KOKKOS_LAMBDA(const size_type &i) { - view(size + i) = view(i); - }); + "internalCopy", Kokkos::RangePolicy(0, copySize), + KOKKOS_LAMBDA(const size_type& i) { view(oldSize + i) = view(indices(i)); }); Kokkos::fence(); } /* - * Non-class function - * + * Non-class functions */ /** @@ -287,7 +284,7 @@ namespace ippl { * * This overload preserves legacy functionality by providing a default iteration policy. * It calls the member scatter() with a default Kokkos::RangePolicy. - * + * * @note The default behaviour is to scatter all particles without any custom index mapping. * * @tparam Attrib1 The type of the particle attribute. @@ -298,19 +295,21 @@ namespace ippl { * @param f The field onto which the data is scattered. * @param pp The ParticleAttrib representing particle positions. */ - template > + template > inline void scatter(const Attrib1& attrib, Field& f, const Attrib2& pp) { - attrib.scatter(f, pp, policy_type(0, attrib.getParticleCount())); + attrib.scatter(f, pp, policy_type(0, attrib.getParticleCount())); } /** - * @brief Non-class interface for scattering with a custom iteration policy and optional index array. + * @brief Non-class interface for scattering with a custom iteration policy and optional index + * array. * * This overload allows the caller to specify a custom `Kokkos::range_policy` and an optional * `ippl::hash_type` array. It forwards the parameters to the member scatter() function. - * - * @note See ParticleAttrib::scatter() for more information on the custom iteration functionality. + * + * @note See ParticleAttrib::scatter() for more information on the custom iteration + * functionality. * * @tparam Attrib1 The type of the particle attribute. * @tparam Field The type of the field. @@ -322,33 +321,34 @@ namespace ippl { * @param iteration_policy A custom `Kokkos::range_policy` defining the iteration range. * @param hash_array An optional `ippl::hash_type` array for index mapping. */ - template > - inline void scatter(const Attrib1& attrib, Field& f, const Attrib2& pp, - policy_type iteration_policy, typename Attrib1::hash_type hash_array = {}) { + template > + inline void scatter(const Attrib1& attrib, Field& f, const Attrib2& pp, + policy_type iteration_policy, + typename Attrib1::hash_type hash_array = {}) { attrib.scatter(f, pp, iteration_policy, hash_array); } /** * @brief Non-class interface for gathering field data into a particle attribute. - * + * * This interface calls the member ParticleAttrib::gather() function with the provided * parameters and preserving legacy behavior by assigning `addToAttribute` a default value. - * + * * @note See ParticleAttrib::gather() for more information on the behavior of `addToAttribute`. - * + * * @tparam Attrib1 The type of the particle attribute. * @tparam Field The type of the field. * @tparam Attrib2 The type of the particle position attribute. * @param attrib The particle attribute to gather data into. * @param f The field from which data is gathered. * @param pp The ParticleAttrib representing particle positions. - * @param addToAttribute If true, the gathered field value is added to the current attribute value; - * otherwise, the attribute value is overwritten. + * @param addToAttribute If true, the gathered field value is added to the current attribute + * value; otherwise, the attribute value is overwritten. */ template - inline void gather(Attrib1& attrib, Field& f, const Attrib2& pp, - const bool addToAttribute = false) { + inline void gather(Attrib1& attrib, Field& f, const Attrib2& pp, + const bool addToAttribute = false) { attrib.gather(f, pp, addToAttribute); } @@ -374,4 +374,5 @@ namespace ippl { DefineParticleReduction(Max, max, if (myVal > valL) valL = myVal, std::greater) DefineParticleReduction(Min, min, if (myVal < valL) valL = myVal, std::less) DefineParticleReduction(Prod, prod, valL *= myVal, std::multiplies) + } // namespace ippl diff --git a/src/Particle/ParticleAttribBase.h b/src/Particle/ParticleAttribBase.h index 4dea4ac8e..3e9baa197 100644 --- a/src/Particle/ParticleAttribBase.h +++ b/src/Particle/ParticleAttribBase.h @@ -66,6 +66,10 @@ namespace ippl { // prior entries survive a capacity grow. virtual void create(size_type, bool non_destructive = false) = 0; + // Grow internal capacity to at least N particles while preserving existing + // entries. Does not shrink and does not touch the logical particle count. + virtual void reserve(size_type) = 0; + virtual void destroy(const hash_type&, const hash_type&, size_type) = 0; virtual size_type packedSize(const size_type) const = 0; @@ -73,19 +77,22 @@ namespace ippl { virtual void unpack(size_type) = 0; - virtual void serialize(Archive& ar, size_type nsends) = 0; - + virtual void serialize(Archive& ar, size_type nsends) = 0; + virtual void serialize(detail::Archive& ar, const hash_type& hash, + size_type nsends) = 0; virtual void deserialize(Archive& ar, size_type nrecvs) = 0; + virtual void deserialize(detail::Archive& ar, size_type offset, + size_type nrecvs) = 0; virtual size_type size() const = 0; - virtual ~ParticleAttribBase() = default; + KOKKOS_INLINE_FUNCTION virtual ~ParticleAttribBase() = default; void setParticleCount(size_type& num) { localNum_mp = # } size_type getParticleCount() const { return *localNum_mp; } virtual void applyPermutation(const hash_type&) = 0; - virtual void internalCopy(const hash_type&) = 0; + virtual void internalCopy(const hash_type&) = 0; protected: const size_type* localNum_mp; diff --git a/src/Particle/ParticleBase.h b/src/Particle/ParticleBase.h index 9fbed2faa..9793da2a9 100644 --- a/src/Particle/ParticleBase.h +++ b/src/Particle/ParticleBase.h @@ -299,9 +299,18 @@ namespace ippl { /* This function does not alter the totalNum_m member function. It should only be called * during the update function where we know the number of particles remains the same. */ + template + void internalDestroy(const F& invalid_functor, const size_type destroyNum); template void internalDestroy(const Kokkos::View& invalid, - const size_type destroyNum); + const size_type destroyNum) { + using view_type = Kokkos::View; + using memory_space = typename view_type::memory_space; + using execution_space = typename view_type::execution_space; + internalDestroy( + KOKKOS_LAMBDA(size_t i) { return invalid(i); }, destroyNum); + } /*! * Sends particles to another rank @@ -316,6 +325,9 @@ namespace ippl { void sendToRank(int rank, int tag, std::vector& requests, const HashType& hash); + template + MPI_Request sendToRank(int rank, int tag, const HashType& hash); + /*! * Receives particles from another rank * @param rank the source rank @@ -324,6 +336,9 @@ namespace ippl { */ void recvFromRank(int rank, int tag, size_type nRecvs); + std::pair> postRecvFromRank(int rank, int tag, + size_type nRecvs); + /*! * Serialize to do MPI calls. * @param ar archive diff --git a/src/Particle/ParticleBase.hpp b/src/Particle/ParticleBase.hpp index a71a917d6..fbbe72638 100644 --- a/src/Particle/ParticleBase.hpp +++ b/src/Particle/ParticleBase.hpp @@ -62,8 +62,6 @@ namespace ippl { addAttribute(ID); } addAttribute(R); - ID.set_name("ID"); - R.set_name("position"); } template @@ -175,9 +173,9 @@ namespace ippl { } template - template - void ParticleBase::internalDestroy( - const Kokkos::View& invalid, const size_type destroyNum) { + template + void ParticleBase::internalDestroy(const F& invalid_functor, + const size_type destroyNum) { PAssert(destroyNum <= localNum_m); // If there aren't any particles to delete, do nothing @@ -193,19 +191,18 @@ namespace ippl { return; } - using view_type = Kokkos::View; - using memory_space = typename view_type::memory_space; - using execution_space = typename view_type::execution_space; - using policy_type = Kokkos::RangePolicy; - auto& locDeleteIndex = deleteIndex_m.get(); - auto& locKeepIndex = keepIndex_m.get(); + using policy_type = Kokkos::RangePolicy; + auto& locDeleteIndex = deleteIndex_m.get(); + auto& locKeepIndex = keepIndex_m.get(); - // Resize buffers, if necessary + // Resize buffers per-memory-space. The previous code mistakenly used + // the outer `memory_space` template parameter inside the per-space + // lambda, leaving every other space's del/keep buffers undersized. detail::runForAllSpaces([&]() { - if (attributes_m.template get().size() > 0) { + if (attributes_m.template get().size() > 0) { int overalloc = Comm->getDefaultOverallocation(); - auto& del = deleteIndex_m.get(); - auto& keep = keepIndex_m.get(); + auto& del = deleteIndex_m.template get(); + auto& keep = keepIndex_m.template get(); if (del.size() < destroyNum) { Kokkos::realloc(del, destroyNum * overalloc); Kokkos::realloc(keep, destroyNum * overalloc); @@ -220,10 +217,10 @@ namespace ippl { Kokkos::parallel_scan( "Scan in ParticleBase::destroy()", policy_type(0, localNum_m - destroyNum), KOKKOS_LAMBDA(const size_t i, int& idx, const bool final) { - if (final && invalid(i)) { + if (final && invalid_functor(i)) { locDeleteIndex(idx) = i; } - if (invalid(i)) { + if (invalid_functor(i)) { idx += 1; } }); @@ -245,10 +242,10 @@ namespace ippl { "Second scan in ParticleBase::destroy()", Kokkos::RangePolicy(localNum_m - destroyNum, localNum_m), KOKKOS_LAMBDA(const size_t i, int& idx, const bool final) { - if (final && !invalid(i)) { + if (final && !invalid_functor(i)) { locKeepIndex(idx) = i; } - if (!invalid(i)) { + if (!invalid_functor(i)) { idx += 1; } }); @@ -280,27 +277,39 @@ namespace ippl { template template - void ParticleBase::sendToRank(int rank, int tag, - std::vector& requests, - const HashType& hash) { - size_type nSends = hash.size(); - requests.resize(requests.size() + 1); + MPI_Request ParticleBase::sendToRank(int rank, int tag, const HashType& hash) { + size_type nSends = hash.size(); + MPI_Request request = MPI_REQUEST_NULL; auto hashes = hash_container_type(hash, [&]() { return attributes_m.template get().size() > 0; }); - pack(hashes); detail::runForAllSpaces([&]() { size_type bufSize = packedSize(nSends); if (bufSize == 0) { return; } - auto buf = Comm->getBuffer(bufSize); - Comm->isend(rank, tag++, *this, *buf, requests.back(), nSends); + forAllAttributes([&](Attribute& att) { + att->serialize(*buf, hashes.template get(), nSends); + }); + + Comm->isend(rank, tag++, *buf, request); buf->resetWritePos(); }); + return request; + } + + template + template + void ParticleBase::sendToRank(int rank, int tag, + std::vector& requests, + const HashType& hash) { + auto hashes = hash_container_type(hash, [&]() { + return attributes_m.template get().size() > 0; + }); + requests.push_back(sendToRank(rank, tag, hash)); } template @@ -313,10 +322,46 @@ namespace ippl { auto buf = Comm->getBuffer(bufSize); - Comm->recv(rank, tag++, *this, *buf, bufSize, nRecvs); + Comm->recv(rank, tag++, *buf, bufSize); + forAllAttributes([&](Attribute& att) { + att->deserialize(*buf, localNum_m, nRecvs); + }); + buf->resetReadPos(); }); - unpack(nRecvs); + localNum_m += nRecvs; + } + + template + std::pair> + ParticleBase::postRecvFromRank(int rank, int tag, size_type nRecvs) { + MPI_Request request = MPI_REQUEST_NULL; + + // Collect (buf, nRecvs) per memory space for deferred deserialization + auto deferred = std::make_shared>>(); + + detail::runForAllSpaces([&]() { + size_type bufSize = packedSize(nRecvs); + if (bufSize == 0) + return; + + auto buf = Comm->getBuffer(bufSize); + Comm->irecv(rank, tag++, *buf, request, bufSize); + + deferred->push_back([this, buf, nRecvs](size_type offset) { + forAllAttributes([&](Attribute& att) { + att->deserialize(*buf, offset, nRecvs); + }); + buf->resetReadPos(); + }); + }); + + // Finalize takes the offset (localNum_m after destruction) and advances it + return {request, [this, deferred, nRecvs](size_type offset) { + for (auto& fn : *deferred) + fn(offset); + localNum_m += nRecvs; + }}; } template diff --git a/src/Particle/ParticleSort.h b/src/Particle/ParticleSort.h new file mode 100644 index 000000000..f5eaa3336 --- /dev/null +++ b/src/Particle/ParticleSort.h @@ -0,0 +1,331 @@ +#ifndef IPPL_PARTICLE_SORT_H +#define IPPL_PARTICLE_SORT_H + +#include + +#include +#include +#include +#include +#include + +#include "Particle/SortBuffer.h" + +#ifdef KOKKOS_ENABLE_CUDA +#include +#endif + +#ifdef KOKKOS_ENABLE_HIP +#include +#endif + +namespace ippl { + namespace detail { + + /** + * @brief Validate that all permutation indices are in bounds + */ + template + void validatePermutation(const IndexView& permute, size_t n, + const char* label = "sortParticles") { + using size_type = size_t; + + size_type num_invalid = 0; + Kokkos::parallel_reduce( + label, Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(size_type i, size_type & count) { + if (permute(i) >= n) + count++; + }, + num_invalid); + Kokkos::fence(); + + if (num_invalid > 0) { + size_type first_invalid_idx = n; + Kokkos::parallel_reduce( + "find_first_invalid", Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(size_type i, size_type & first_idx) { + if (permute(i) >= n && i < first_idx) + first_idx = i; + }, + Kokkos::Min(first_invalid_idx)); + Kokkos::fence(); + + auto permute_host = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, permute); + + throw std::runtime_error(std::string(label) + ": Found " + + std::to_string(num_invalid) + + " invalid permutation indices. First invalid: permute[" + + std::to_string(first_invalid_idx) + + "] = " + std::to_string(permute_host(first_invalid_idx)) + + " (n = " + std::to_string(n) + ")"); + } + } + + /** + * @brief Compute Morton code (Z-order curve) for spatial sorting + */ + template + KOKKOS_INLINE_FUNCTION uint64_t computeMortonCode(const Vector& position, + const Vector& origin, + const Vector& invdx, + const Vector& ngrid) { + uint64_t morton = 0; + constexpr int bits_per_dim = 21; // 21*3 = 63 bits, safe for 3D + + uint32_t gridIndices[Dim]; + for (unsigned d = 0; d < Dim; ++d) { + T sx = (position[d] - origin[d]) * invdx[d]; + sx -= ngrid[d] * Kokkos::floor(sx / ngrid[d]); // wrap to [0, ngrid) + gridIndices[d] = static_cast(sx); + } + + for (int bit = 0; bit < bits_per_dim; ++bit) { + for (unsigned d = 0; d < Dim; ++d) { + if (gridIndices[d] & (1u << bit)) { + morton |= (uint64_t(1) << (bit * Dim + d)); + } + } + } + return morton; + } + + /** + * @brief Functor to compute Morton codes for all particles + */ + template + struct ComputeMortonCodesFunctor { + using size_type = size_t; + + PositionView positions; + KeyView keys; + Vector origin; + Vector invdx; + Vector ngrid; + + KOKKOS_INLINE_FUNCTION void operator()(size_type i) const { + keys(i) = computeMortonCode(positions(i), origin, invdx, ngrid); + } + }; + + // ------------------------------------------------------------------- + // Platform-specific sort implementations + // ------------------------------------------------------------------- + + /** + * @brief Sort particles on host using std::sort on (morton, index) pairs + * + * @return Subview of the buffered permute array, valid until next ensureCapacity + */ + template + Kokkos::View sortParticlesHost( + Kokkos::View*, Kokkos::HostSpace> positions, + const Vector& origin, const Vector& invdx, + const Vector& ngrid, size_t n) { + auto& bufs = ippl::detail::getDefaultBinSortBuffers(); + bufs.ensureCapacity(n, /*n_bins_p1=*/1); + + auto& permute = bufs.permute(); + + // Build (key, index) pairs, sort, then extract permutation + std::vector> pairs(n); + for (size_t i = 0; i < n; ++i) { + pairs[i] = {computeMortonCode(positions(i), origin, invdx, ngrid), + i}; + } + std::sort(pairs.begin(), pairs.end()); // pair has lexicographic < by .first + + for (size_t i = 0; i < n; ++i) { + permute(i) = pairs[i].second; + } + + return permute; + } + +#ifdef KOKKOS_ENABLE_CUDA + /** + * @brief Sort particles on CUDA using CUB DeviceRadixSort with buffered temporaries + * + * @return Subview of the buffered permOut array containing sorted particle indices + */ + template + Kokkos::View sortParticlesCuda( + Kokkos::View*, Kokkos::CudaSpace> positions, + const Vector& origin, const Vector& invdx, + const Vector& ngrid, size_t n) { + using size_type = size_t; + auto& bufs = ippl::detail::getDefaultBinSortBuffers(); + bufs.ensureCapacity(n, /*n_bins_p1=*/1); + + // Step 1: compute Morton codes into binKeys, init permute to identity + auto keys = bufs.binKeys(); + auto permute = bufs.permute(); + + Kokkos::parallel_for( + "MortonSort::ComputeKeys", Kokkos::RangePolicy(0, n), + ComputeMortonCodesFunctor{ + positions, keys, origin, invdx, ngrid}); + + Kokkos::parallel_for( + "MortonSort::InitPermute", Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(size_type i) { permute(i) = i; }); + Kokkos::fence(); + + // Step 2: query temp storage, grow buffer if needed, then sort + auto keys_out = bufs.keysOut(); + auto perm_out = bufs.permOut(); + + size_t temp_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, keys.data(), keys_out.data(), + permute.data(), perm_out.data(), static_cast(n)); + + bufs.ensureTempStorage(temp_bytes); + + cub::DeviceRadixSort::SortPairs(bufs.tempStorage().data(), temp_bytes, keys.data(), + keys_out.data(), permute.data(), perm_out.data(), + static_cast(n)); + Kokkos::fence(); + + return bufs.permOut(); + } +#endif + +#ifdef KOKKOS_ENABLE_HIP + /** + * @brief Sort particles on HIP using rocPRIM radix_sort_pairs with buffered temporaries + * + * @return Subview of the buffered permOut array containing sorted particle indices + */ + template + Kokkos::View sortParticlesHip( + Kokkos::View*, Kokkos::HIPSpace> positions, const Vector& origin, + const Vector& invdx, const Vector& ngrid, size_t n) { + using size_type = size_t; + auto& bufs = ippl::detail::getDefaultBinSortBuffers(); + bufs.ensureCapacity(n, /*n_bins_p1=*/1); + + auto keys = bufs.binKeys(); + auto permute = bufs.permute(); + + Kokkos::parallel_for( + "MortonSort::ComputeKeys", Kokkos::RangePolicy(0, n), + ComputeMortonCodesFunctor{ + positions, keys, origin, invdx, ngrid}); + + Kokkos::parallel_for( + "MortonSort::InitPermute", Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(size_type i) { permute(i) = i; }); + Kokkos::fence(); + + auto keys_out = bufs.keysOut(); + auto perm_out = bufs.permOut(); + + size_t temp_bytes = 0; + std::ignore = rocprim::radix_sort_pairs(nullptr, temp_bytes, keys.data(), keys_out.data(), + permute.data(), perm_out.data(), n, 0, sizeof(uint64_t) * 8, + Kokkos::HIP().hip_stream()); + + bufs.ensureTempStorage(temp_bytes); + + std::ignore = rocprim::radix_sort_pairs(bufs.tempStorage().data(), temp_bytes, keys.data(), + keys_out.data(), permute.data(), perm_out.data(), n, 0, + sizeof(uint64_t) * 8, Kokkos::HIP().hip_stream()); + Kokkos::fence(); + + return bufs.permOut(); + } +#endif + + /** + * @brief Generic sort dispatcher — selects CUDA/HIP/host implementation + */ + template + Kokkos::View sortParticles( + Kokkos::View*, typename ExecSpace::memory_space> positions, + const Vector& origin, const Vector& invdx, + const Vector& ngrid, size_t n) { + using memory_space = typename ExecSpace::memory_space; + +#ifdef KOKKOS_ENABLE_CUDA + if constexpr (std::is_same_v) { + return sortParticlesCuda(positions, origin, invdx, ngrid, n); + } +#endif +#ifdef KOKKOS_ENABLE_HIP + if constexpr (std::is_same_v) { + return sortParticlesHip(positions, origin, invdx, ngrid, n); + } +#endif + if constexpr (std::is_same_v) { + return sortParticlesHost(positions, origin, invdx, ngrid, n); + } else { + // Generic fallback: sort on host, copy result to device buffer + auto positions_host = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, positions); + + Vector ngrid_host; + for (unsigned d = 0; d < Dim; ++d) + ngrid_host[d] = ngrid[d]; + + auto permute_host = + sortParticlesHost(positions_host, origin, invdx, ngrid_host, n); + + auto& bufs = ippl::detail::getDefaultBinSortBuffers(); + bufs.ensureCapacity(n, /*n_bins_p1=*/1); + + Kokkos::deep_copy(Kokkos::subview(bufs.permute(), std::make_pair(size_t(0), n)), + Kokkos::subview(permute_host, std::make_pair(size_t(0), n))); + + return bufs.permute(); + } + } + + // ------------------------------------------------------------------- + // Permutation application helpers + // ------------------------------------------------------------------- + + template + struct ApplyPermutationFunctor { + SrcView src; + DstView dst; + IndexView permute; + KOKKOS_INLINE_FUNCTION void operator()(size_t i) const { dst(i) = src(permute(i)); } + }; + + template + struct ApplyInversePermutationFunctor { + SrcView src; + DstView dst; + IndexView permute; + KOKKOS_INLINE_FUNCTION void operator()(size_t i) const { dst(permute(i)) = src(i); } + }; + + /** + * @brief Reorder data in-place according to permutation + */ + template + void applyPermutation(DataView& data, const IndexView& permute, size_t n) { + DataView temp("temp", n); + Kokkos::parallel_for( + "apply_permutation", Kokkos::RangePolicy(0, n), + ApplyPermutationFunctor{data, temp, permute}); + Kokkos::deep_copy(Kokkos::subview(data, Kokkos::make_pair(size_t(0), n)), temp); + } + + /** + * @brief Scatter src → dst using inverse permutation (dst[permute[i]] = src[i]) + */ + template + void applyInversePermutation(const DataView& src, DataView& dst, const IndexView& permute, + size_t n) { + Kokkos::parallel_for( + "apply_inverse_permutation", Kokkos::RangePolicy(0, n), + ApplyInversePermutationFunctor{src, dst, permute}); + Kokkos::fence(); + } + + } // namespace detail +} // namespace ippl + +#endif // IPPL_PARTICLE_SORT_H \ No newline at end of file diff --git a/src/Particle/ParticleSpatialLayout.h b/src/Particle/ParticleSpatialLayout.h index 568b30753..ed596e180 100644 --- a/src/Particle/ParticleSpatialLayout.h +++ b/src/Particle/ParticleSpatialLayout.h @@ -35,6 +35,19 @@ namespace ippl { + // Controls how the send-count exchange is performed before the particle + // data transfer. All paths assume GPU-aware MPI (device pointers passed + // directly to Isend/Irecv/Alltoall). + enum class CountExchange { + // One-sided RMA: each sender writes its count directly into the + // receiver's window. + RMA, + // Two-sided point-to-point: per-rank Isend/Irecv over device pointers. + P2P, + // Collective Alltoall over device pointers. + Alltoall + }; + /*! * ParticleSpatialLayout class definition. * @tparam T value type @@ -59,9 +72,9 @@ namespace ippl { using size_type = detail::size_type; - public: // constructor: this one also takes a Mesh - ParticleSpatialLayout(FieldLayout&, Mesh&, bool fem = false); + ParticleSpatialLayout(FieldLayout&, Mesh&, bool fem = false, + CountExchange mode = CountExchange::RMA); ParticleSpatialLayout() : detail::ParticleLayout() {} @@ -73,21 +86,33 @@ namespace ippl { template void update(ParticleContainer& pc); - const RegionLayout_t& getRegionLayout() const { return rlayout_m; } + const RegionLayout_t& getRegionLayout() const { return *rlayout_m; } protected: //! The RegionLayout which determines where our particles go. - RegionLayout_t rlayout_m; + std::shared_ptr rlayout_m; //! The FieldLayout containing information on nearest neighbors FieldLayout_t& flayout_m; + //! How counts are exchanged + CountExchange countExchangeMode_; + + // + // RMA Path + // + // Vector keeping track of the recieves from all ranks std::vector nRecvs_m; // MPI RMA window for one-sided communication mpi::rma::Window window_m; + // + // P2P GPU Path + // + locate_type recvCounts_d_; // [nranks] + //! Type of the Kokkos view containing the local regions. using region_view_type = typename RegionLayout_t::view_type; //! Type of a single Region object. @@ -99,6 +124,10 @@ namespace ippl { KOKKOS_INLINE_FUNCTION constexpr static bool positionInRegion( const std::index_sequence&, const vector_type& pos, const region_type& region); + template + KOKKOS_INLINE_FUNCTION constexpr static bool positionInRegionInclusive( + const std::index_sequence&, const vector_type& pos, const region_type& region); + /*! * Evaluates the total number of MPI ranks sharing the spatial nearest neighbors. * @param neighbors structure containing, for every spatial direction, a list of @@ -109,35 +138,59 @@ namespace ippl { public: /*! - * For each particle in the bunch, determine the rank on which it should - * be stored based on its location - * @tparam ParticleContainer the particle container type - * @param pc the particle container - * @param ranks the integer view in which to store the destination ranks - * @param invalid the boolean view in which to store whether each particle - * is invalidated, i.e. needs to be sent to another rank - * @param nSends_dview the view storing per-rank send counts - * @param sends_dview the view storing per-rank send offsets - * @return The total number of invalidated particles + * For each local particle, determine its destination rank, write the + * indices of leaving particles into `sendIds_d_`, populate the + * per-rank send-count and offset buffers, build a compacted list of + * destination ranks, and mark each particle's "is leaving" status in + * `leaving_d_` so the destroy pass doesn't have to recompute it. + * @return number of leaving particles on this rank */ template - std::pair locateParticles(const ParticleContainer& pc, - locate_type& ranks, bool_type& invalid, - locate_type& nSends_dview, - locate_type& sends_dview) const; - - /*! - * @param rank we sent to - * @param ranks a container specifying where a particle at the i-th index should go. - * @param hash a mapping to fill the send buffer contiguously - */ - void fillHash(int rank, const locate_type& ranks, hash_type& hash); - - /*! - * @param rank we sent to - * @param ranks a container specifying where a particle at the i-th index should go. - */ - size_t numberOfSends(int rank, const locate_type& ranks); + size_t locateParticlesPacked(const ParticleContainer& pc); + + private: + // Fixed-size scratch + locate_type rankSendCount_d_; // [nRanks] + locate_type sendOffsets_d_; // [nRanks+1] + hash_type sendIds_d_; // [capacity >= max nInvalid seen] + locate_type cursor_d_; // [nRanks] per-rank insertion cursor + locate_type destRanks_d_; // [nRanks] (compacted list) + bool_type leaving_d_; // [capacity >= max nLocal seen] mask + + // Single scalar on device to count destinations + Kokkos::View nDest_d_; + + // Neigbour cache + locate_type neighbors_d_; // [neighborSize] cached device neighbors list + std::vector neighbors_host_; // flat host copy + bool neighbors_dirty_ = true; + size_t neighbors_capacity_ = 0; + size_type neighbors_used_ = 0; + + // Host mirror buffers + using host_mem_space = Kokkos::HostSpace; + using locate_host_type = typename detail::ViewType::view_type; + + locate_host_type rankSendCount_h_; // [nRanks] (mirror) + locate_host_type sendOffsets_h_; // [nRanks+1] (mirror) + locate_host_type destRanks_h_; // [nRanks] (mirror) + + // Host-side destination list + std::vector destinationRanks_host_; + + // capacities + size_t sendIds_capacity_ = 0; + size_t leaving_capacity_ = 0; + int nRanks_ = 0; + + void initScratch(int nRanks); + void ensureSendCapacity(size_t nInvalid); + void ensureLeavingCapacity(size_t nLocal); + void ensureNeighborsCached(); + + void countExchangeRMA(); + void countExchangeP2P(); + void countExchangeAlltoall(); }; } // namespace ippl diff --git a/src/Particle/ParticleSpatialLayout.hpp b/src/Particle/ParticleSpatialLayout.hpp index b47529aae..cbb4c760b 100644 --- a/src/Particle/ParticleSpatialLayout.hpp +++ b/src/Particle/ParticleSpatialLayout.hpp @@ -19,60 +19,95 @@ // frequency of load balancing (N), or may supply a function to // determine if load balancing should be done or not. // +#include #include #include #include #include "Utility/IpplTimings.h" +#include "Utility/ParallelDispatch.h" #include "Communicate/Window.h" namespace ippl { - /*! - * We need this struct since Kokkos parallel_scan only accepts - * one variable of type ReturnType where to perform the reduction operation. - * For more details, see - * https://kokkos.github.io/kokkos-core-wiki/API/core/parallel-dispatch/parallel_scan.html. - */ - struct increment_type { - size_t count[2]; + template + ParticleSpatialLayout::ParticleSpatialLayout(FieldLayout& fl, + Mesh& mesh, bool fem, + CountExchange mode) + : rlayout_m(std::make_shared(fl, mesh, fem)) + , flayout_m(fl) + , countExchangeMode_(mode) { + const int nRanks = Comm->size(); + nRanks_ = nRanks; + initScratch(nRanks); - KOKKOS_FUNCTION void init() { - count[0] = 0; - count[1] = 0; + if (mode == CountExchange::RMA && nRanks > 1) { + nRecvs_m.resize(Comm->size()); + window_m.create(*Comm, nRecvs_m.begin(), nRecvs_m.end()); } + } - KOKKOS_INLINE_FUNCTION increment_type& operator+=(bool* values) { - count[0] += values[0]; - count[1] += values[1]; - return *this; - } + template + void ParticleSpatialLayout::updateLayout(FieldLayout& fl, + Mesh& mesh) { + rlayout_m->changeDomain(fl, mesh); + neighbors_dirty_ = true; + } + + template + void ParticleSpatialLayout::countExchangeRMA() { + std::fill(nRecvs_m.begin(), nRecvs_m.end(), 0); + window_m.fence(0); - KOKKOS_INLINE_FUNCTION increment_type& operator+=(increment_type values) { - count[0] += values.count[0]; - count[1] += values.count[1]; - return *this; + for (int rank : destinationRanks_host_) { + if (rank == Comm->rank()) + continue; + const int* src = &rankSendCount_h_(rank); + window_m.put(src, rank, Comm->rank()); } - }; + + window_m.fence(0); + } template - ParticleSpatialLayout::ParticleSpatialLayout(FieldLayout& fl, - Mesh& mesh, bool fem) - : rlayout_m(fl, mesh, fem) - , flayout_m(fl) - { - nRecvs_m.resize(Comm->size()); - if (Comm->size() > 1) { - window_m.create(*Comm, nRecvs_m.begin(), nRecvs_m.end()); + void ParticleSpatialLayout::countExchangeP2P() { + const int myRank = Comm->rank(); + const int tag = Comm->next_tag(mpi::tag::P_SPATIAL_LAYOUT, mpi::tag::P_LAYOUT_CYCLE); + + // Zero the receive-count buffer on the device + Kokkos::deep_copy(position_execution_space{}, recvCounts_d_, 0); + Kokkos::fence(); + + // Device pointer to send-count array + int* d_sendCounts = rankSendCount_d_.data(); + int* d_recvCounts = recvCounts_d_.data(); + + std::vector reqs; + reqs.reserve(2 * std::max(0, nRanks_ - 1)); + + // Post receives from all ranks (except self) + for (int r = 0; r < nRanks_; ++r) { + if (r == myRank) + continue; + MPI_Irecv(d_recvCounts + r, 1, MPI_INT, r, tag, Comm->getCommunicator(), + &reqs.emplace_back()); } + + for (int r = 0; r < nRanks_; ++r) { + if (r == myRank) + continue; + MPI_Isend(d_sendCounts + r, 1, MPI_INT, r, tag, Comm->getCommunicator(), + &reqs.emplace_back()); + } + + MPI_Waitall(static_cast(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE); } template - void ParticleSpatialLayout::updateLayout(FieldLayout& fl, - Mesh& mesh) { - // flayout_m = fl; - rlayout_m.changeDomain(fl, mesh); + void ParticleSpatialLayout::countExchangeAlltoall() { + MPI_Alltoall(rankSendCount_d_.data(), 1, MPI_INT, recvCounts_d_.data(), 1, MPI_INT, + Comm->getCommunicator()); } template @@ -81,18 +116,20 @@ namespace ippl { /* Apply Boundary Conditions */ static IpplTimings::TimerRef ParticleBCTimer = IpplTimings::getTimer("particleBC"); IpplTimings::startTimer(ParticleBCTimer); - this->applyBC(pc.R, rlayout_m.getDomain()); + this->applyBC(pc.R, rlayout_m->getDomain()); IpplTimings::stopTimer(ParticleBCTimer); /* Update Timer for the rest of the function */ static IpplTimings::TimerRef ParticleUpdateTimer = IpplTimings::getTimer("updateParticle"); IpplTimings::startTimer(ParticleUpdateTimer); - int nRanks = Comm->size(); - if (nRanks < 2) { + if (nRanks_ < 2) { + IpplTimings::stopTimer(ParticleUpdateTimer); return; } + ensureNeighborsCached(); + /* particle MPI exchange: * 1. figure out which particles need to go where -> locateParticles(...) * 2. fill send buffer and send particles @@ -105,69 +142,45 @@ namespace ippl { static IpplTimings::TimerRef locateTimer = IpplTimings::getTimer("locateParticles"); IpplTimings::startTimer(locateTimer); - /* Rank-local number of particles */ - size_type localnum = pc.getLocalNum(); - - /* The indices correspond to the indices of the local particles, - * the values correspond to the ranks to which the particles need to be sent - */ - locate_type particleRanks("particles' MPI ranks", localnum); + const size_type nInvalid = locateParticlesPacked(pc); - /* The indices are the indices of the particles, - * the boolean values describe whether the particle has left the current rank - * 0 --> particle valid (inside current rank) - * 1 --> particle invalid (left rank) - */ - bool_type invalidParticles("validity of particles", localnum); + // Copy metadata to host + size_type nDest = 0; + Kokkos::deep_copy(position_execution_space{}, nDest, nDest_d_); - /* The indices are the MPI ranks, - * the values are the number of particles are sent to that rank from myrank - */ - locate_type rankSendCount_dview("rankSendCount Device", nRanks); - - /* The indices have no particluar meaning, - * the values are the MPI ranks to which we need to send - */ - locate_type destinationRanks_dview("destinationRanks Device", nRanks); - - /* nInvalid is the number of invalid particles - * nDestinationRanks is the number of MPI ranks we need to send to - */ - auto [nInvalid, nDestinationRanks] = locateParticles( - pc, particleRanks, invalidParticles, rankSendCount_dview, destinationRanks_dview); + // destRanks prefix + if (nDest > 0) { + Kokkos::deep_copy( + position_execution_space{}, + Kokkos::subview(destRanks_h_, std::make_pair(size_t(0), size_t(nDest))), + Kokkos::subview(destRanks_d_, std::make_pair(size_t(0), size_t(nDest)))); + } - /* Host space copy of rankSendCount_dview */ - auto rankSendCount_hview = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rankSendCount_dview); + // counts + offsets + Kokkos::deep_copy(position_execution_space{}, rankSendCount_h_, rankSendCount_d_); + Kokkos::deep_copy(position_execution_space{}, sendOffsets_h_, sendOffsets_d_); - /* Host Space copy of destinationRanks_dview */ - auto destinationRanks_hview = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), destinationRanks_dview); + // Build host destination list without allocation + destinationRanks_host_.clear(); + for (size_t i = 0; i < (size_t)nDest; ++i) + destinationRanks_host_.push_back(destRanks_h_(i)); IpplTimings::stopTimer(locateTimer); // 2. fill send buffer and send particles =============================================== // - // 2.1 Remote Memory Access window for one-sided communication + // 2.1 Count Exchange static IpplTimings::TimerRef preprocTimer = IpplTimings::getTimer("sendPreprocess"); IpplTimings::startTimer(preprocTimer); - std::fill(nRecvs_m.begin(), nRecvs_m.end(), 0); - - window_m.fence(0); - - // Prepare RMA window for the ranks we need to send to - for (size_t ridx = 0; ridx < nDestinationRanks; ridx++) { - int rank = destinationRanks_hview[ridx]; - if (rank == Comm->rank()) { - // we do not need to send to ourselves - continue; - } - const int* src_ptr = &rankSendCount_hview(rank); - window_m.put(src_ptr, rank, Comm->rank()); + if (countExchangeMode_ == CountExchange::RMA) { + countExchangeRMA(); + } else if (countExchangeMode_ == CountExchange::P2P) { + countExchangeP2P(); + } else { + countExchangeAlltoall(); } - window_m.fence(0); IpplTimings::stopTimer(preprocTimer); @@ -176,55 +189,136 @@ namespace ippl { static IpplTimings::TimerRef sendTimer = IpplTimings::getTimer("particleSend"); IpplTimings::startTimer(sendTimer); - std::vector requests(0); + std::vector requests; + requests.reserve(destinationRanks_host_.size()); int tag = Comm->next_tag(mpi::tag::P_SPATIAL_LAYOUT, mpi::tag::P_LAYOUT_CYCLE); - for (size_t ridx = 0; ridx < nDestinationRanks; ridx++) { - int rank = destinationRanks_hview[ridx]; - if (rank == Comm->rank()) { + for (int rank : destinationRanks_host_) { + if (rank == Comm->rank()) continue; - } - hash_type hash("hash", rankSendCount_hview(rank)); - fillHash(rank, particleRanks, hash); - pc.sendToRank(rank, tag, requests, hash); + const size_type count = static_cast(rankSendCount_h_(rank)); + if (count == 0) + continue; + const size_type begin = static_cast(sendOffsets_h_(rank)); + auto ids_sub = + Kokkos::subview(sendIds_d_, std::make_pair((size_t)begin, (size_t)(begin + count))); + requests.push_back(pc.sendToRank(rank, tag, ids_sub)); } IpplTimings::stopTimer(sendTimer); + // 2.3 Post receives + + static IpplTimings::TimerRef recvTimer = IpplTimings::getTimer("particleRecv"); + IpplTimings::startTimer(recvTimer); + + std::vector> recvList; + size_type totalRecvs = 0; + + if (countExchangeMode_ == CountExchange::RMA) { + for (int rank = 0; rank < nRanks_; ++rank) { + if (nRecvs_m[rank] > 0) { + recvList.push_back({rank, nRecvs_m[rank]}); + totalRecvs += nRecvs_m[rank]; + } + } + } else { + auto recvCounts_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), recvCounts_d_); + for (int rank = 0; rank < nRanks_; ++rank) { + if (recvCounts_h(rank) > 0) { + recvList.push_back({rank, recvCounts_h(rank)}); + totalRecvs += static_cast(recvCounts_h(rank)); + } + } + } + + std::vector recvRequests(recvList.size(), MPI_REQUEST_NULL); + std::vector> finalizers(recvList.size()); + + for (size_t i = 0; i < recvList.size(); ++i) { + auto [rank, count] = recvList[i]; + auto [req, fin] = pc.postRecvFromRank(rank, tag, count); + recvRequests[i] = req; + finalizers[i] = std::move(fin); + } + + IpplTimings::stopTimer(recvTimer); + // 3. Internal destruction of invalid particles ======================================= // static IpplTimings::TimerRef destroyTimer = IpplTimings::getTimer("particleDestroy"); IpplTimings::startTimer(destroyTimer); - pc.internalDestroy(invalidParticles, nInvalid); + // locateParticlesPacked already wrote the per-particle "is leaving" + // mask into leaving_d_. Reuse it instead of re-running the full + // region search inside the destroy predicate. + auto leaving = leaving_d_; + pc.template internalDestroy( + KOKKOS_LAMBDA(size_t i) { return leaving(i); }, nInvalid); Kokkos::fence(); + const size_type localAfterDestroy = pc.getLocalNum(); IpplTimings::stopTimer(destroyTimer); - // 4. Receive Particles ================================================================ // + static IpplTimings::TimerRef waitTimer = IpplTimings::getTimer("particleWait"); + IpplTimings::startTimer(waitTimer); - static IpplTimings::TimerRef recvTimer = IpplTimings::getTimer("particleRecv"); - IpplTimings::startTimer(recvTimer); + requests.insert(requests.end(), recvRequests.begin(), recvRequests.end()); + if (!requests.empty()) { + MPI_Waitall(static_cast(requests.size()), requests.data(), MPI_STATUSES_IGNORE); + } - for (int rank = 0; rank < nRanks; ++rank) { - if (nRecvs_m[rank] > 0) { - pc.recvFromRank(rank, tag, nRecvs_m[rank]); - } + IpplTimings::stopTimer(waitTimer); + + static IpplTimings::TimerRef freeBufferTimer = IpplTimings::getTimer("particleFreeBuffers"); + IpplTimings::startTimer(freeBufferTimer); + + Comm->freeAllBuffers(); + + IpplTimings::stopTimer(freeBufferTimer); + + // 5. Deserialize + static IpplTimings::TimerRef deserializeTimer = + IpplTimings::getTimer("particleDeserialize"); + IpplTimings::startTimer(deserializeTimer); + + static IpplTimings::TimerRef deserializeResizeTimer = + IpplTimings::getTimer("particleDeserResize"); + IpplTimings::startTimer(deserializeResizeTimer); + + if (totalRecvs > 0) { + const size_type receiveCapacity = localAfterDestroy + totalRecvs; + pc.forAllAttributes([&](Attribute*& attribute) { + attribute->reserve(receiveCapacity); + }); } - IpplTimings::stopTimer(recvTimer); - IpplTimings::startTimer(sendTimer); + IpplTimings::stopTimer(deserializeResizeTimer); + + static IpplTimings::TimerRef deserializeCopyTimer = + IpplTimings::getTimer("particleDeserCopy"); + IpplTimings::startTimer(deserializeCopyTimer); - if (requests.size() > 0) { - MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); + for (auto& finalize : finalizers) { + finalize(pc.getLocalNum()); } - IpplTimings::stopTimer(sendTimer); - Comm->freeAllBuffers(); + + IpplTimings::stopTimer(deserializeCopyTimer); + IpplTimings::stopTimer(deserializeTimer); IpplTimings::stopTimer(ParticleUpdateTimer); } + template + template + KOKKOS_INLINE_FUNCTION constexpr bool + ParticleSpatialLayout::positionInRegionInclusive( + const std::index_sequence&, const vector_type& pos, const region_type& region) { + return ((pos[Idx] >= region[Idx].min()) && ...) && ((pos[Idx] <= region[Idx].max()) && ...); + }; + template template KOKKOS_INLINE_FUNCTION constexpr bool @@ -248,220 +342,204 @@ namespace ippl { return totalSize; } - /** - * @brief This function determines to which rank particles need to be sent after the iteration - * step. It starts by first scanning direct rank neighbors, and only does a global scan if there - * are still unfound particles. It then calculates how many particles need to be sent to each - * rank and how many ranks are sent to in total. - * - * @param pc Particle Container - * @param ranks A vector the length of the number of particles on the current rank, where - * each value refers to the new rank of the particle - * @param invalid A vector marking the particles that need to be sent away, and thus - * locally deleted - * @param nSends_dview Device view the length of number of ranks, where each value determines - * the number of particles sent to that rank from the current rank - * @param sends_dview Device view for the number of ranks that are sent to from current rank - * - * @return tuple with the number of particles sent away and the number of ranks sent to - */ template template - std::pair - ParticleSpatialLayout::locateParticles( - const ParticleContainer& pc, locate_type& ranks, bool_type& invalid, - locate_type& nSends_dview, locate_type& sends_dview) const { - auto positions = pc.R.getView(); - region_view_type Regions = rlayout_m.getdLocalRegions(); + size_t ParticleSpatialLayout::locateParticlesPacked( + const ParticleContainer& pc) { + const int nRanks = Comm->size(); + const size_type myRank = Comm->rank(); - using policy_type = Kokkos::RangePolicy; - using mdrange_type = Kokkos::MDRangePolicy, position_execution_space>; + ensureNeighborsCached(); - size_type myRank = Comm->rank(); - - const auto is = std::make_index_sequence{}; - - const neighbor_list& neighbors = flayout_m.getNeighbors(); - - /// outsideIds: Container of particle IDs that travelled outside of the neighborhood. - locate_type outsideIds("Particles outside of neighborhood", size_type(pc.getLocalNum())); - - /// outsideCount: Tracks the number of particles that travelled outside of the neighborhood. - size_type outsideCount = 0; - /// invalidCount: Tracks the number of particles that need to be sent to other ranks. - size_type invalidCount = 0; + auto positions = pc.R.getView(); + region_view_type Regions = rlayout_m->getdLocalRegions(); + const auto is = std::make_index_sequence{}; - /// neighborSize: Size of a neighborhood in D dimentions. - const size_type neighborSize = getNeighborSize(neighbors); + using exec_space = position_execution_space; + using policy_type = Kokkos::RangePolicy; - /// neighbors_view: Kokkos view with the IDs of the neighboring MPI ranks. - locate_type neighbors_view("Nearest neighbors IDs", neighborSize); + // Reset small device buffers + Kokkos::deep_copy(rankSendCount_d_, size_type(0)); + Kokkos::deep_copy(cursor_d_, size_type(0)); + Kokkos::deep_copy(nDest_d_, size_type(0)); - /* red_val: Used to reduce both the number of invalid particles and the number of particles - * outside of the neighborhood (Kokkos::parallel_scan doesn't allow multiple reduction - * values, so we use the helper class increment_type). First element updates InvalidCount, - * second one updates outsideCount. - */ - increment_type red_val; - red_val.init(); + const size_type neighbors_used = neighbors_used_; + auto& neighbours_d = neighbors_d_; - auto neighbors_mirror = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), neighbors_view); + // Destination rank computation (no per-particle storage) + auto destRankOf = KOKKOS_LAMBDA(const size_t i)->size_type { + if (positionInRegion(is, positions(i), Regions(myRank))) + return myRank; - size_t k = 0; + for (int j = 0; j < static_cast(neighbors_used); ++j) { + const int r = neighbours_d(j); + if (positionInRegion(is, positions(i), Regions(r))) + return r; + } - for (const auto& componentNeighbors : neighbors) { - for (size_t j = 0; j < componentNeighbors.size(); ++j) { - neighbors_mirror(k) = componentNeighbors[j]; - // std::cout << "Neighbor: " << neighbors_mirror(k) << std::endl; - k++; + // slow-path: global scan + for (int r = 0; r < static_cast(Regions.extent(0)); ++r) { + if (positionInRegion(is, positions(i), Regions(r))) + return r; } - } - Kokkos::deep_copy(neighbors_view, neighbors_mirror); + // Inclusive fallback: catches particles sitting exactly on a region + // lower boundary that the strict > check above missed (e.g. (0,0,0)) + for (int r = 0; r < static_cast(Regions.extent(0)); ++r) { + if (positionInRegionInclusive(is, positions(i), Regions(r))) + return r; + } + return myRank; // truly outside all regions - applyBC should have prevented this + }; - /*! Begin Kokkos loop: - * Step 1: search in current rank - * Step 2: search in neighbors - * Step 3: save information on whether the particle was located - * Step 4: run additional loop on non-located particles - */ - static IpplTimings::TimerRef neighborSearch = IpplTimings::getTimer("neighborSearch"); - IpplTimings::startTimer(neighborSearch); + // Make sure the leaving-mask buffer is large enough; it's reused across + // updates to avoid reallocation. + ensureLeavingCapacity(pc.getLocalNum()); + auto& leaving_d = leaving_d_; - Kokkos::parallel_scan( - "ParticleSpatialLayout::locateParticles()", - policy_type(0, ranks.extent(0)), - KOKKOS_LAMBDA(const size_type i, increment_type& val, const bool final) { - /* Step 1 - * inCurr: True if the particle hasn't left the current MPI rank. - * inNeighbor: True if the particle is found in a neighboring rank. - * found: True either if inCurr = True or inNeighbor = True. - * increment: Helper variable to update red_val. - */ - bool inCurr = false; - bool inNeighbor = false; - bool found = false; - bool increment[2]; - - inCurr = positionInRegion(is, positions(i), Regions(myRank)); - - ranks(i) = inCurr * myRank; - invalid(i) = !inCurr; - found = inCurr || found; - - /// Step 2 - for (size_t j = 0; j < neighbors_view.extent(0); ++j) { - size_type rank = neighbors_view(j); - - inNeighbor = positionInRegion(is, positions(i), Regions(rank)); - - ranks(i) = !(inNeighbor)*ranks(i) + inNeighbor * rank; - found = inNeighbor || found; - } - /// Step 3 - /* isOut: When the last thread has finished the search, checks whether the particle - * has been found either in the current rank or in a neighboring one. Used to avoid - * race conditions when updating outsideIds. - */ - if (final && !found) { - outsideIds(val.count[1]) = i; - } - // outsideIds(val.count[1]) = i * isOut; - increment[0] = invalid(i); - increment[1] = !found; - val += increment; + // Pass 1: compute send counts + nInvalid + per-particle "is leaving" + size_type nInvalid = 0; + auto& rankSendCount_d = rankSendCount_d_; + Kokkos::parallel_reduce( + "PSL::packed_count", policy_type(0, pc.getLocalNum()), + KOKKOS_LAMBDA(const size_t i, size_type& inval) { + const size_type dest = destRankOf(i); + const bool leaves = (dest != myRank); + leaving_d(i) = leaves; + inval += leaves; + if (leaves) + Kokkos::atomic_fetch_add(&rankSendCount_d(dest), size_type(1)); }, - red_val); - + nInvalid); Kokkos::fence(); - invalidCount = red_val.count[0]; - outsideCount = red_val.count[1]; - - IpplTimings::stopTimer(neighborSearch); - - /// Step 4 - static IpplTimings::TimerRef nonNeighboringParticles = - IpplTimings::getTimer("nonNeighboringParticles"); - IpplTimings::startTimer(nonNeighboringParticles); - if (outsideCount > 0) { - Kokkos::parallel_for( - "ParticleSpatialLayout::leftParticles()", - mdrange_type({0, 0}, {outsideCount, Regions.extent(0)}), - KOKKOS_LAMBDA(const size_t i, const size_type j) { - /// pID: (local) ID of the particle that is currently being searched. - size_type pId = outsideIds(i); - - /// inRegion: Checks whether particle pID is inside region j. - bool inRegion = positionInRegion(is, positions(pId), Regions(j)); - if (inRegion) { - ranks(pId) = j; - } - }); - Kokkos::fence(); - } - IpplTimings::stopTimer(nonNeighboringParticles); + // Ensure sendIds capacity after we know nInvalid + ensureSendCapacity(nInvalid); - Kokkos::parallel_for( - "Calculate nSends", policy_type(0, ranks.extent(0)), - KOKKOS_LAMBDA(const size_t i) { - size_type rank = ranks(i); - Kokkos::atomic_fetch_add(&nSends_dview(rank), 1); + // Exclusive scan to offsets (length nRanks+1) + auto& sendOffsets_d = sendOffsets_d_; + Kokkos::parallel_scan( + "PSL::packed_offsets", policy_type(0, (size_t)nRanks + 1), + KOKKOS_LAMBDA(const size_t r, size_type& upd, const bool final) { + if (final) + sendOffsets_d(r) = upd; + if (r < (size_t)nRanks) + upd += rankSendCount_d(r); }); + Kokkos::fence(); + + // Pass 2: fill packed send IDs into sendIds_d_ (prefix [0, nInvalid)) + auto& cursor_d = cursor_d_; + auto& sendIds_d = sendIds_d_; + Kokkos::parallel_for( + "PSL::packed_fill", policy_type(0, pc.getLocalNum()), KOKKOS_LAMBDA(const size_t i) { + const size_type dest = destRankOf(i); + if (dest == myRank) + return; + + const size_type pos = Kokkos::atomic_fetch_add(&cursor_d(dest), size_type(1)); + const size_type base = sendOffsets_d(dest); - // Number of Ranks we need to send to - Kokkos::View rankSends( - "Number of Ranks we need to send to"); + sendIds_d(base + pos) = static_cast(i); + }); + Kokkos::fence(); + // Build destination rank list on device (compact), store length in nDest_d_ + auto& destRanks_d = destRanks_d_; + auto& nDest_d = nDest_d_; Kokkos::parallel_for( - "Calculate sends", policy_type(0, nSends_dview.extent(0)), - KOKKOS_LAMBDA(const size_t rank) { - if (nSends_dview(rank) != 0) { - size_type index = Kokkos::atomic_fetch_add(&rankSends(), 1); - sends_dview(index) = rank; + "PSL::packed_destRanks", policy_type(0, (size_t)nRanks), KOKKOS_LAMBDA(const size_t r) { + if ((size_type)r == myRank) + return; + if (rankSendCount_d(r) > 0) { + const size_type idx = Kokkos::atomic_fetch_add(&nDest_d(), size_type(1)); + destRanks_d(idx) = static_cast(r); } }); - size_type temp; - Kokkos::deep_copy(temp, rankSends); + Kokkos::fence(); - return {invalidCount, temp}; + return nInvalid; } template - void ParticleSpatialLayout::fillHash(int rank, - const locate_type& ranks, - hash_type& hash) { - /* Compute the prefix sum and fill the hash - */ - using policy_type = Kokkos::RangePolicy; - Kokkos::parallel_scan( - "ParticleSpatialLayout::fillHash()", policy_type(0, ranks.extent(0)), - KOKKOS_LAMBDA(const size_t i, int& idx, const bool final) { - if (final) { - if (rank == ranks(i)) { - hash(idx) = i; - } - } + void ParticleSpatialLayout::initScratch(int nRanks) { + Kokkos::realloc(rankSendCount_d_, nRanks); + Kokkos::realloc(sendOffsets_d_, nRanks + 1); + Kokkos::realloc(cursor_d_, nRanks); + Kokkos::realloc(destRanks_d_, nRanks); + Kokkos::realloc(recvCounts_d_, nRanks); + + // scalar counter + nDest_d_ = Kokkos::View("nDest_d"); + + // Host mirrors + Kokkos::realloc(rankSendCount_h_, nRanks); + Kokkos::realloc(sendOffsets_h_, nRanks + 1); + Kokkos::realloc(destRanks_h_, nRanks); + + destinationRanks_host_.clear(); + destinationRanks_host_.reserve(nRanks); + } - if (rank == ranks(i)) { - idx += 1; - } - }); - Kokkos::fence(); + template + void ParticleSpatialLayout::ensureSendCapacity(size_t nInvalid) { + if (nInvalid <= sendIds_capacity_) + return; + + // grow geometrically + size_t newCap = sendIds_capacity_ ? sendIds_capacity_ : size_t(1024); + while (newCap < nInvalid) + newCap *= 2; + + sendIds_capacity_ = newCap; + Kokkos::realloc(sendIds_d_, newCap); } template - size_t ParticleSpatialLayout::numberOfSends( - int rank, const locate_type& ranks) { - size_t nSends = 0; - using policy_type = Kokkos::RangePolicy; - Kokkos::parallel_reduce( - "ParticleSpatialLayout::numberOfSends()", policy_type(0, ranks.extent(0)), - KOKKOS_LAMBDA(const size_t i, size_t& num) { num += size_t(rank == ranks(i)); }, - nSends); - Kokkos::fence(); - return nSends; + void ParticleSpatialLayout::ensureLeavingCapacity(size_t nLocal) { + if (nLocal <= leaving_capacity_) + return; + + size_t newCap = leaving_capacity_ ? leaving_capacity_ : size_t(1024); + while (newCap < nLocal) + newCap *= 2; + + leaving_capacity_ = newCap; + Kokkos::realloc(leaving_d_, newCap); + } + + template + void ParticleSpatialLayout::ensureNeighborsCached() { + if (!neighbors_dirty_) + return; + + const neighbor_list& neighbors = flayout_m.getNeighbors(); + const size_type neighborSize = getNeighborSize(neighbors); + neighbors_used_ = neighborSize; + + if (neighborSize > neighbors_capacity_) { + neighbors_capacity_ = neighborSize; + Kokkos::realloc(neighbors_d_, neighbors_capacity_); + } + + neighbors_host_.clear(); + neighbors_host_.reserve(neighborSize); + + auto neighbors_h = Kokkos::create_mirror_view(neighbors_d_); + size_t k = 0; + for (const auto& comp : neighbors) { + for (size_t j = 0; j < comp.size(); ++j) { + neighbors_h(k) = comp[j]; + neighbors_host_.push_back(comp[j]); + ++k; + } + } + + Kokkos::deep_copy( + Kokkos::subview(neighbors_d_, std::make_pair(size_t(0), size_t(neighborSize))), + Kokkos::subview(neighbors_h, std::make_pair(size_t(0), size_t(neighborSize)))); + + neighbors_dirty_ = false; } } // namespace ippl diff --git a/src/Particle/SortBuffer.h b/src/Particle/SortBuffer.h new file mode 100644 index 000000000..977db9f59 --- /dev/null +++ b/src/Particle/SortBuffer.h @@ -0,0 +1,185 @@ +#ifndef IPPL_SORT_BUFFER_H +#define IPPL_SORT_BUFFER_H + +#include + +#include +#include +#include + +namespace ippl { + namespace detail { + + /** + * @brief Persistent scratch buffers for bin-sort-based particle binning. + * + * Grows on demand, never shrinks. Call finalizeBinSortBuffers() before + * Kokkos::finalize() to release device memory safely. + * + * Buffer layout + * bin_keys uint64_t[n_particles] sort keys (bin index per particle) + * permute size_t [n_particles] identity → sorted particle order + * bin_offsets size_t [n_bins + 1] start of each bin in permute + * keys_out uint64_t[n_particles] CUB radix-sort output (CUDA only) + * perm_out size_t [n_particles] CUB radix-sort output (CUDA only) + * temp_storage char [temp_bytes] CUB device-temp (CUDA only) + */ + template + class BinSortBuffers { + public: + using memory_space = MemorySpace; + using size_type = size_t; + + BinSortBuffers() = default; + BinSortBuffers(const BinSortBuffers&) = delete; + BinSortBuffers& operator=(const BinSortBuffers&) = delete; + BinSortBuffers(BinSortBuffers&&) = default; + BinSortBuffers& operator=(BinSortBuffers&&) = default; + + /** + * @brief Ensure particle-sized buffers are large enough. + * + * @param n_particles Number of particles. + * @param n_bins_p1 Number of bins + 1 (size needed for bin_offsets). + * @param growth Over-allocation factor (default 1.2). + */ + void ensureCapacity(size_type n_particles, size_type n_bins_p1, double growth = 1.2) { + if (n_particles > particle_capacity_) { + const size_type cap = + std::max(static_cast(n_particles * growth), n_particles); + + bin_keys_ = Kokkos::View("bin_keys", cap); + permute_ = Kokkos::View("permute", cap); + keys_out_ = Kokkos::View("keys_out", cap); + perm_out_ = Kokkos::View("perm_out", cap); + + particle_capacity_ = cap; + } + + if (n_bins_p1 > bin_offset_capacity_) { + const size_type cap = + std::max(static_cast(n_bins_p1 * growth), n_bins_p1); + + bin_offsets_ = Kokkos::View("bin_offsets", cap); + bin_offset_capacity_ = cap; + } + } + + /** + * @brief Ensure CUB/rocPRIM temp-storage buffer is large enough. + * @param bytes Required bytes. + */ + void ensureTempStorage(size_t bytes) { + if (bytes <= temp_capacity_) + return; + const size_t cap = static_cast(bytes * 1.1); + temp_storage_ = Kokkos::View("temp_storage", cap); + temp_capacity_ = cap; + } + + // --- accessors ------------------------------------------------------- + Kokkos::View& binKeys() { return bin_keys_; } + Kokkos::View& permute() { return permute_; } + Kokkos::View& binOffsets() { return bin_offsets_; } + Kokkos::View& keysOut() { return keys_out_; } + Kokkos::View& permOut() { return perm_out_; } + Kokkos::View& tempStorage() { return temp_storage_; } + + size_type particleCapacity() const { return particle_capacity_; } + size_type binOffsetCapacity() const { return bin_offset_capacity_; } + size_t tempCapacity() const { return temp_capacity_; } + + size_t memoryUsage() const { + // Sum the bytes actually held by each view rather than + // assuming all of them are populated — keys_out_ / perm_out_ + // are only allocated on the CUDA fast path. + return bin_keys_.span() * sizeof(uint64_t) + + permute_.span() * sizeof(size_type) + + bin_offsets_.span() * sizeof(size_type) + + keys_out_.span() * sizeof(uint64_t) + + perm_out_.span() * sizeof(size_type) + + temp_storage_.span(); + } + + void clear() { + bin_keys_ = {}; + permute_ = {}; + bin_offsets_ = {}; + keys_out_ = {}; + perm_out_ = {}; + temp_storage_ = {}; + particle_capacity_ = 0; + bin_offset_capacity_ = 0; + temp_capacity_ = 0; + } + + private: + Kokkos::View bin_keys_; + Kokkos::View permute_; + Kokkos::View bin_offsets_; + Kokkos::View keys_out_; + Kokkos::View perm_out_; + Kokkos::View temp_storage_; + + size_type particle_capacity_ = 0; + size_type bin_offset_capacity_ = 0; + size_t temp_capacity_ = 0; + }; + + // --------------------------------------------------------------------------- + // Singleton access + // --------------------------------------------------------------------------- + + template + class BinSortBuffersHolder { + public: + static BinSortBuffersHolder& instance() { + static BinSortBuffersHolder h; + return h; + } + + BinSortBuffers& get() { + if (!buffers_) + buffers_ = std::make_unique>(); + return *buffers_; + } + + void finalize() { + if (buffers_) { + buffers_->clear(); + buffers_.reset(); + } + } + + private: + BinSortBuffersHolder() = default; + ~BinSortBuffersHolder() = default; + BinSortBuffersHolder(const BinSortBuffersHolder&) = delete; + BinSortBuffersHolder& operator=(const BinSortBuffersHolder&) = delete; + + std::unique_ptr> buffers_; + }; + + template + BinSortBuffers& getDefaultBinSortBuffers() { + return BinSortBuffersHolder::instance().get(); + } + + /** + * @brief Release all bin-sort buffers for known memory spaces. + * Call this before Kokkos::finalize(). + */ + inline void finalizeBinSortBuffers() { +#ifdef KOKKOS_ENABLE_CUDA + BinSortBuffersHolder::instance().finalize(); +#endif +#ifdef KOKKOS_ENABLE_HIP + BinSortBuffersHolder::instance().finalize(); +#endif + BinSortBuffersHolder::instance().finalize(); + } + + } // namespace detail +} // namespace ippl + +#endif // IPPL_SORT_BUFFER_H \ No newline at end of file diff --git a/src/Utility/BufferView.h b/src/Utility/BufferView.h new file mode 100644 index 000000000..732bf8375 --- /dev/null +++ b/src/Utility/BufferView.h @@ -0,0 +1,229 @@ +#ifndef IPPL_BUFFERVIEW_H +#define IPPL_BUFFERVIEW_H + +#include +#include +#include +#include + +namespace ippl { + + /** + * @brief Helper to compute aligned offset + */ + [[nodiscard]] constexpr size_t alignUp(size_t offset, size_t alignment) { + return (offset + alignment - 1) & ~(alignment - 1); + } + + /** + * + * @tparam T Element type + * @tparam MemorySpace Kokkos memory space + */ + template + class BufferView { + public: + using view_type = Kokkos::View; + + explicit BufferView(size_t count) + : view_m("BufferView", count) {} + + ~BufferView() = default; + + // Non-copyable + BufferView(const BufferView&) = delete; + BufferView& operator=(const BufferView&) = delete; + + // Movable + BufferView(BufferView&&) = default; + BufferView& operator=(BufferView&&) = default; + + view_type& getView() { return view_m; } + const view_type& getView() const { return view_m; } + + T* data() { return view_m.data(); } + const T* data() const { return view_m.data(); } + size_t size() const { return view_m.extent(0); } + + private: + view_type view_m; + }; + + /** + * @brief Allocates a single buffer and provides multiple aligned views into it. + * + * This class manages its own memory via a Kokkos View, completely independent + * from MPI buffer management. Use this for temporary computation buffers + * that don't need MPI communication. + * + * @tparam MemorySpace The Kokkos memory space for the buffer + */ + template + class MultiViewBuffer { + public: + using buffer_view_type = Kokkos::View; + + static constexpr size_t DEFAULT_ALIGNMENT = 256; + + /** + * @brief Construct without allocation. Call allocate() before getting views. + */ + MultiViewBuffer() + : currentOffset_m(0) {} + + /** + * @brief Construct and allocate buffer of specified size. + * @param totalBytes Total size in bytes + * @param label Optional Kokkos label for debugging + */ + explicit MultiViewBuffer(size_t totalBytes, const std::string& label = "MultiViewBuffer") + : buffer_m(label, totalBytes) + , currentOffset_m(0) {} + + ~MultiViewBuffer() = default; + + // Non-copyable + MultiViewBuffer(const MultiViewBuffer&) = delete; + MultiViewBuffer& operator=(const MultiViewBuffer&) = delete; + + // Movable + MultiViewBuffer(MultiViewBuffer&& other) noexcept + : buffer_m(std::move(other.buffer_m)) + , currentOffset_m(other.currentOffset_m) { + other.currentOffset_m = 0; + } + + MultiViewBuffer& operator=(MultiViewBuffer&& other) noexcept { + if (this != &other) { + buffer_m = std::move(other.buffer_m); + currentOffset_m = other.currentOffset_m; + other.currentOffset_m = 0; + } + return *this; + } + + /** + * @brief Allocate or reallocate the underlying buffer. + * @param totalBytes Total size in bytes + * @param label Optional Kokkos label for debugging + */ + void allocate(size_t totalBytes, const std::string& label = "MultiViewBuffer") { + buffer_m = buffer_view_type(label, totalBytes); + currentOffset_m = 0; + } + + /** + * @brief Get an aligned view of count elements of type T from the buffer. + * + * Views are allocated sequentially from the buffer. Each view starts at + * an address aligned to DEFAULT_ALIGNMENT (256 bytes). + * + * @tparam T Element type for the view + * @param count Number of elements + * @return Unmanaged Kokkos::View pointing into the buffer + */ + template + Kokkos::View> + getView(size_t count) { + using view_type = Kokkos::View>; + + size_t alignedOffset = alignUp(currentOffset_m, DEFAULT_ALIGNMENT); + size_t requiredSize = alignedOffset + count * sizeof(T); + + if (requiredSize > buffer_m.size()) { + throw std::runtime_error( + "MultiViewBuffer: insufficient space. Required: " + + std::to_string(requiredSize) + + ", Available: " + std::to_string(buffer_m.size())); + } + + T* viewPtr = reinterpret_cast(buffer_m.data() + alignedOffset); + currentOffset_m = alignedOffset + count * sizeof(T); + + return view_type(viewPtr, count); + } + + /** + * @brief Reset the allocation offset to reuse the buffer for new views. + * + * Previously obtained views become invalid after this call. + */ + void reset() { currentOffset_m = 0; } + + /** + * @brief Check if buffer has been allocated. + */ + bool isAllocated() const { return buffer_m.size() > 0; } + + /** + * @brief Get the current used size in bytes. + */ + size_t getUsedSize() const { return currentOffset_m; } + + /** + * @brief Get the total available size in bytes. + */ + size_t getTotalSize() const { return buffer_m.size(); } + + /** + * @brief Get remaining available bytes. + */ + size_t getRemainingSize() const { return getTotalSize() - currentOffset_m; } + + /** + * @brief Get raw pointer to buffer start. + */ + char* data() { return buffer_m.data(); } + const char* data() const { return buffer_m.data(); } + + private: + buffer_view_type buffer_m; + size_t currentOffset_m = 0; + }; + + /** + * @brief Helper struct for recursive buffer size computation + */ + template + struct BufferSizeComputer; + + // Base case: single type + template + struct BufferSizeComputer { + static constexpr size_t compute(size_t offset, size_t count) { + return alignUp(offset, MultiViewBuffer<>::DEFAULT_ALIGNMENT) + count * sizeof(T); + } + }; + + // Recursive case: multiple types + template + struct BufferSizeComputer { + template + static constexpr size_t compute(size_t offset, size_t count, Counts... counts) { + static_assert(sizeof...(Rest) == sizeof...(Counts), + "Number of remaining types must match remaining counts"); + size_t alignedOffset = alignUp(offset, MultiViewBuffer<>::DEFAULT_ALIGNMENT); + size_t newOffset = alignedOffset + count * sizeof(T); + return BufferSizeComputer::compute(newOffset, counts...); + } + }; + + /** + * @brief Compute total buffer size needed for multiple views with alignment. + * + * Usage: auto size = computeBufferSize(100, 200, 50); + * + * @tparam Ts Element types for each view + * @param counts Number of elements for each view + * @return Total required buffer size in bytes + */ + template + constexpr size_t computeBufferSize(Counts... counts) { + static_assert(sizeof...(Ts) == sizeof...(Counts), + "Number of types must match number of counts"); + return BufferSizeComputer::compute(0, static_cast(counts)...); + } + +} // namespace ippl + +#endif // IPPL_BUFFERVIEW_H \ No newline at end of file diff --git a/src/Utility/IpplTimings.cpp b/src/Utility/IpplTimings.cpp index e0f65f36f..93fd9447b 100644 --- a/src/Utility/IpplTimings.cpp +++ b/src/Utility/IpplTimings.cpp @@ -28,7 +28,10 @@ #include #include +#include #include +#include +#include #include "Utility/Inform.h" #include "Utility/IpplInfo.h" @@ -53,8 +56,10 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t); } #endif +// Static member initialization Timing* IpplTimings::instance = new Timing(); std::stack IpplTimings::stashedInstance; +const std::vector Timing::emptyMeasurements; Timing::Timing() : TimerList() @@ -113,6 +118,111 @@ void Timing::clearTimer(TimerRef t) { TimerList[t]->clear(); } +// Reset all timers - useful for warmup +void Timing::resetAllTimers() { + for (unsigned int i = 0; i < TimerList.size(); ++i) { + TimerList[i]->clearAll(); + } +} +// Get measurements for a specific timer by reference +const std::vector& Timing::getMeasurements(TimerRef t) const { + if (t >= TimerList.size()) + return emptyMeasurements; + return TimerList[t]->measurements; +} + +// Get measurements for a specific timer by name +const std::vector& Timing::getMeasurements(const std::string& name) const { + TimerMap_t::const_iterator loc = TimerMap.find(name); + if (loc == TimerMap.end()) + return emptyMeasurements; + return loc->second->measurements; +} + +// Get measurement count for a timer +size_t Timing::getMeasurementCount(TimerRef t) const { + if (t >= TimerList.size()) + return 0; + return TimerList[t]->measurements.size(); +} + +// Get all timer names +std::vector Timing::getTimerNames() const { + std::vector names; + names.reserve(TimerList.size()); + for (unsigned int i = 0; i < TimerList.size(); ++i) { + names.push_back(TimerList[i]->name); + } + return names; +} + +// Dump all measurements to CSV (default format) +void Timing::dumpToCSV(const std::string& filename) { + dumpToCSV(filename, ",", true); +} + +// Dump all measurements to CSV with custom options +void Timing::dumpToCSV(const std::string& filename, const std::string& delimiter, + bool includeHeader) { + const int rank = ippl::Comm->rank(); + const int numRanks = ippl::Comm->size(); + + // Each rank serialises its measurements into a local buffer. + std::ostringstream localData; + for (unsigned int i = 0; i < TimerList.size(); ++i) { + TimerInfo* tptr = TimerList[i].get(); + const std::string& timerName = tptr->name; + const std::vector& measurements = tptr->measurements; + for (size_t j = 0; j < measurements.size(); ++j) { + localData << timerName << delimiter << rank << delimiter << j << delimiter + << std::setprecision(12) << measurements[j] << "\n"; + } + } + const std::string localStr = localData.str(); + + // Gather sizes from every rank using a single collective rather than + // O(P) point-to-point pairs. + MPI_Aint localSize = static_cast(localStr.size()); + std::vector sizes(numRanks, 0); + MPI_Gather(&localSize, 1, MPI_AINT, sizes.data(), 1, MPI_AINT, 0, + ippl::Comm->getCommunicator()); + + // Compute displacements and total size on rank 0; all ranks supply their + // bytes via MPI_Gatherv. + std::vector intSizes(numRanks, 0); + std::vector displs(numRanks, 0); + std::vector all; + if (rank == 0) { + long long total = 0; + for (int r = 0; r < numRanks; ++r) { + // The MPI standard limits Gatherv counts to int. Real CSV blobs + // never exceed INT_MAX in practice; refuse rather than silently + // truncate if they do. + if (sizes[r] > static_cast(std::numeric_limits::max())) { + std::cerr << "Timing::dumpToCSV: rank " << r << " has " << sizes[r] + << " bytes (> INT_MAX); truncating\n"; + } + intSizes[r] = static_cast(std::min( + sizes[r], static_cast(std::numeric_limits::max()))); + displs[r] = static_cast(total); + total += intSizes[r]; + } + all.resize(static_cast(total)); + } + + MPI_Gatherv(localStr.data(), static_cast(localStr.size()), MPI_CHAR, all.data(), + intSizes.data(), displs.data(), MPI_CHAR, 0, ippl::Comm->getCommunicator()); + + if (rank == 0) { + std::ofstream outFile(filename); + if (includeHeader) { + outFile << "timer_name" << delimiter << "rank" << delimiter << "measurement_id" + << delimiter << "duration_seconds" << "\n"; + } + outFile.write(all.data(), static_cast(all.size())); + } +} + // print out the timing results void Timing::print() { if (TimerList.size() < 1) @@ -153,8 +263,55 @@ void Timing::print() { << std::string().assign(20, ' ') << " Wall min = " << std::setw(10) << wallmin << "\n" << "\n"; } + + msg << "---------------------------------------------\n"; + msg << " Measurement counts:\n"; + msg << "---------------------------------------------\n"; + for (unsigned int i = 0; i < TimerList.size(); ++i) { + TimerInfo* tptr = TimerList[i].get(); + size_t lengthName = std::min(tptr->name.length(), 19lu); + msg << tptr->name.substr(0, lengthName) << std::string().assign(20 - lengthName, '.') + << " Count = " << std::setw(10) << tptr->measurements.size() << "\n"; + } + msg << "---------------------------------------------"; msg << endl; + + // Per-occurrence statistics + msg << "---------------------------------------------\n"; + msg << " Per-occurrence statistics:\n"; + msg << "---------------------------------------------\n"; + for (unsigned int i = 0; i < TimerList.size(); ++i) { + TimerInfo* tptr = TimerList[i].get(); + const std::vector& m = tptr->measurements; + size_t count = m.size(); + if (count == 0) continue; + + double sum = 0.0, minVal = m[0], maxVal = m[0]; + for (size_t j = 0; j < count; ++j) { + sum += m[j]; + if (m[j] < minVal) minVal = m[j]; + if (m[j] > maxVal) maxVal = m[j]; + } + double mean = sum / count; + + double varSum = 0.0; + for (size_t j = 0; j < count; ++j) { + double diff = m[j] - mean; + varSum += diff * diff; + } + double stddev = (count > 1) ? std::sqrt(varSum / (count - 1)) : 0.0; + + size_t lengthName = std::min(tptr->name.length(), 19lu); + msg << tptr->name.substr(0, lengthName) << std::string().assign(20 - lengthName, '.') + << " n=" << std::setw(5) << count + << " mean=" << std::setw(10) << mean + << " std=" << std::setw(10) << stddev + << " min=" << std::setw(10) << minVal + << " max=" << std::setw(10) << maxVal + << "\n"; + } + msg << "---------------------------------------------" << endl; } // save the timing results into a file @@ -234,4 +391,4 @@ void IpplTimings::pop() { delete instance; instance = stashedInstance.top(); stashedInstance.pop(); -} +} \ No newline at end of file diff --git a/src/Utility/IpplTimings.h b/src/Utility/IpplTimings.h index aea1332cd..82230faea 100644 --- a/src/Utility/IpplTimings.h +++ b/src/Utility/IpplTimings.h @@ -21,6 +21,14 @@ // 4) print out the results: // IpplTimings::print(); // +// 5) reset all timers (for warmup): +// IpplTimings::resetAllTimers(); +// Clears all accumulated times and individual measurements. +// +// 6) export to CSV for violin plots: +// IpplTimings::dumpToCSV("output.csv"); +// Exports all individual timing measurements for statistical analysis. +// #ifndef IPPL_TIMINGS_H #define IPPL_TIMINGS_H @@ -45,7 +53,8 @@ class IpplTimerInfo { IpplTimerInfo() : name("") , wallTime(0.0) - , indx(std::numeric_limits::max()) { + , indx(std::numeric_limits::max()) + , measurements() { clear(); } @@ -55,18 +64,19 @@ class IpplTimerInfo { // timer operations void start() { if (!running) { - running = true; - t.stop(); - t.clear(); + t.clear(); // running was false, no need to stop first t.start(); + running = true; } } void stop() { if (running) { t.stop(); - running = false; - wallTime += t.elapsed(); + running = false; + const double elapsed = t.elapsed(); + wallTime += elapsed; + measurements.push_back(elapsed); } } @@ -76,6 +86,12 @@ class IpplTimerInfo { running = false; } + void clearAll() { + clear(); + wallTime = 0.0; + measurements.clear(); + } + // the IPPL timer that this object manages Timer t; @@ -90,6 +106,8 @@ class IpplTimerInfo { // an index value for this timer TimerRef indx; + + std::vector measurements; }; struct Timing { @@ -126,6 +144,20 @@ struct Timing { // print the results to a file void print(const std::string& fn, const std::map& problemSize); + void resetAllTimers(); + + // Format: timer_name,rank,measurement_id,duration + void dumpToCSV(const std::string& filename); + + void dumpToCSV(const std::string& filename, const std::string& delimiter, bool includeHeader); + + const std::vector& getMeasurements(TimerRef t) const; + const std::vector& getMeasurements(const std::string& name) const; + + size_t getMeasurementCount(TimerRef t) const; + + std::vector getTimerNames() const; + // type of storage for list of TimerInfo typedef std::vector > TimerList_t; typedef std::map TimerMap_t; @@ -136,6 +168,9 @@ struct Timing { // a map of timers, keyed by string TimerMap_t TimerMap; + + // Returned by reference when a requested timer is not found. + static const std::vector emptyMeasurements; }; class IpplTimings { @@ -173,6 +208,36 @@ class IpplTimings { static void stash(); static void pop(); + // Use this after warmup iterations to get clean timing data + static void resetAllTimers() { instance->resetAllTimers(); } + + // Format: timer_name,rank,measurement_id,duration_seconds + // Perfect for creating violin plots in Python/R + static void dumpToCSV(const std::string& filename) { + instance->dumpToCSV(filename); + } + + static void dumpToCSV(const std::string& filename, + const std::string& delimiter, + bool includeHeader = true) { + instance->dumpToCSV(filename, delimiter, includeHeader); + } + + static const std::vector& getMeasurements(TimerRef t) { + return instance->getMeasurements(t); + } + static const std::vector& getMeasurements(const std::string& name) { + return instance->getMeasurements(name); + } + + static size_t getMeasurementCount(TimerRef t) { + return instance->getMeasurementCount(t); + } + + static std::vector getTimerNames() { + return instance->getTimerNames(); + } + private: // type of storage for list of TimerInfo typedef Timing::TimerList_t TimerList_t; @@ -188,4 +253,4 @@ class IpplTimings { static std::stack stashedInstance; }; -#endif +#endif \ No newline at end of file diff --git a/src/Utility/ParallelDispatch.h b/src/Utility/ParallelDispatch.h index 437c56c22..4e342aa02 100644 --- a/src/Utility/ParallelDispatch.h +++ b/src/Utility/ParallelDispatch.h @@ -7,6 +7,7 @@ #define IPPL_PARALLEL_DISPATCH_H #include +#include "Ippl.h" #include @@ -68,8 +69,8 @@ namespace ippl { } return policy_type(begin, end); } - // Silences incorrect nvcc warning: missing return statement at end of non-void function - throw IpplException("detail::getRangePolicy", "Unreachable state"); + // Silences nvcc "missing return" at end of exhaustive if constexpr. + __builtin_unreachable(); } /*! @@ -94,8 +95,58 @@ namespace ippl { } else { return policy_type(begin, end); } - // Silences incorrect nvcc warning: missing return statement at end of non-void function - throw IpplException("detail::createRangePolicy", "Unreachable state"); + // Silences nvcc "missing return" at end of exhaustive if constexpr. + __builtin_unreachable(); + } + + template + KOKKOS_FORCEINLINE_FUNCTION void for_constexpr(std::integer_sequence, F&& f) { + (f.template operator()(), ...); + } + + template + KOKKOS_FORCEINLINE_FUNCTION auto product_over(std::integer_sequence, F&& f) { + return (f.template operator()() * ...); + } + + template + KOKKOS_FORCEINLINE_FUNCTION auto product_over(F&& f) { + return product_over(std::make_integer_sequence{}, std::forward(f)); + } + + template + KOKKOS_FORCEINLINE_FUNCTION void thread_vector_md_for(const Team& team, const Extents& extents, + F&& f) { + [&](std::integer_sequence) { + Kokkos::parallel_for( + Kokkos::ThreadVectorMDRange, Team>(team, extents[Is]...), + std::forward(f)); + }(std::make_integer_sequence{}); + } + + template + KOKKOS_FORCEINLINE_FUNCTION void thread_vector_stencil_for(const Team& team, F&& f) { + constexpr int TotalCells = [] { + int r = 1; + for (int i = 0; i < Dim; ++i) + r *= W; + return r; + }(); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, TotalCells), [&](const int flat_idx) { + // Compile-time unrollable index decomposition + Kokkos::Array idx; + int tmp = flat_idx; + for (int d = Dim - 1; d >= 0; --d) { + idx[d] = tmp % W; + tmp /= W; + } + + // Call with index pack + [&](std::integer_sequence) { + f(idx[Is]...); + }(std::make_integer_sequence{}); + }); } namespace detail { @@ -121,13 +172,50 @@ namespace ippl { enum e_functor_type { FOR, - REDUCE, - SCAN + REDUCE }; template struct FunctorWrapper; + template + constexpr bool inline isGPUSpace = false; + +#ifdef KOKKOS_ENABLE_CUDA + template <> + constexpr bool inline isGPUSpace = true; +#endif +#ifdef KOKKOS_ENABLE_HIP + template <> + constexpr bool inline isGPUSpace = true; +#endif + + // Dispatches F(i) for i in [0, n) either in parallel (OpenMP host) + // when MPI_THREAD_MULTIPLE is available, or serially otherwise. + // `f(i)` must be synchronous w.r.t. its own work — this dispatcher + // fences after the parallel_for so the caller's next operation sees + // a consistent state. + template + void parallelForMPI(size_t n, F&& f) { + constexpr bool useGPU = isGPUSpace; + const bool threadSafe = Env->threadMultiple(); + + if constexpr (useGPU) { + if (threadSafe) { + Kokkos::parallel_for( + "Parallel dispatch", + Kokkos::RangePolicy(0, n), [&](int i) { + f(i); + }); + Kokkos::fence(); + return; + } + } + for (size_t i = 0; i < n; ++i) { + f(i); + } + } + /*! * Wrapper struct for reduction kernels * Source: @@ -179,9 +267,7 @@ namespace ippl { static constexpr int rank = Kokkos::MDRangePolicy::rank; }; template - concept HasMemberValueType = requires() { - { typename T::value_type() }; - }; + concept HasMemberValueType = requires { typename T::value_type; }; template struct ExtractReducerReturnType { using type = T; @@ -227,6 +313,31 @@ namespace ippl { functor), std::forward(reducer)...); } + + template + KOKKOS_FORCEINLINE_FUNCTION auto get_arg(T first, Rest... rest) { + if constexpr (I == 0) { + return first; + } else { + return get_arg(rest...); + } + } + + template + KOKKOS_FORCEINLINE_FUNCTION decltype(auto) grid_at(Grid& grid, const Base& base, int team_rank, + Stencils... stencil_idx) { + return [&](std::index_sequence) -> decltype(auto) { + return grid((base(team_rank, Is) + get_arg(stencil_idx...))...); + }(std::index_sequence_for{}); + } + + template + KOKKOS_FORCEINLINE_FUNCTION decltype(auto) grid_at_t(Grid& grid, const Base& base, + int team_rank, Stencils& stencil_idx) { + return [&](std::index_sequence) -> decltype(auto) { + return grid((base(team_rank, Is) + stencil_idx[Is])...); + }(std::make_index_sequence{}); + } } // namespace ippl #endif diff --git a/src/Utility/ParameterList.h b/src/Utility/ParameterList.h index d31f46d5d..ad9a2da11 100644 --- a/src/Utility/ParameterList.h +++ b/src/Utility/ParameterList.h @@ -64,6 +64,8 @@ namespace ippl { return std::get(params_m.at(key)); } + bool contains(const std::string& key) const { return params_m.contains(key); } + /*! * Obtain the value of a parameter. If the key is * not contained, the default value is returned. diff --git a/src/Utility/Timer.cpp b/src/Utility/Timer.cpp index d6da687bd..de2b4335e 100644 --- a/src/Utility/Timer.cpp +++ b/src/Utility/Timer.cpp @@ -8,8 +8,17 @@ #include "Timer.h" +#include + bool Timer::enableFences = IPPL_ENABLE_TIMER_FENCES; +namespace { + using clock_type = std::chrono::high_resolution_clock; + inline std::int64_t now_ticks() { + return clock_type::now().time_since_epoch().count(); + } +} // namespace + Timer::Timer() { this->clear(); } @@ -19,16 +28,22 @@ void Timer::clear() { } void Timer::start() { - start_m = std::chrono::high_resolution_clock::now(); + // Fence on start as well: any in-flight kernel from before the timer was + // armed otherwise leaks its tail latency into the measured interval. + if (enableFences) { + Kokkos::fence("Timer Fence (start)"); + } + start_m = now_ticks(); } void Timer::stop() { if (enableFences) { - Kokkos::fence(); + Kokkos::fence("Timer Fence (stop)"); } - stop_m = std::chrono::high_resolution_clock::now(); + stop_m = now_ticks(); - duration_type elapsed = stop_m - start_m; + std::chrono::duration elapsed = + clock_type::duration(stop_m) - clock_type::duration(start_m); elapsed_m += elapsed.count(); } diff --git a/src/Utility/Timer.h b/src/Utility/Timer.h index a23d26f28..653eae588 100644 --- a/src/Utility/Timer.h +++ b/src/Utility/Timer.h @@ -8,17 +8,14 @@ #define IPPL_TIMER_H #ifndef IPPL_ENABLE_TIMER_FENCES -#warning "IPPL timer fences were not set via CMake! Defaulting to no fences." +#pragma message("IPPL_ENABLE_TIMER_FENCES not set by CMake; defaulting to false.") #define IPPL_ENABLE_TIMER_FENCES false #endif -#include +#include class Timer { public: - using timer_type = std::chrono::time_point; - using duration_type = std::chrono::duration; - static bool enableFences; Timer(); @@ -31,7 +28,7 @@ class Timer { private: double elapsed_m; - timer_type start_m, stop_m; + std::int64_t start_m = 0, stop_m = 0; }; #endif diff --git a/src/Utility/Tuning.h b/src/Utility/Tuning.h new file mode 100644 index 000000000..8f5177b69 --- /dev/null +++ b/src/Utility/Tuning.h @@ -0,0 +1,213 @@ +#ifndef IPPL_TUNING_H +#define IPPL_TUNING_H + +#include +#include +#include +#include +#include + +// Kokkos's tuning interface is still exposed only through this internal +// header in 5.0.x. When a public header (Kokkos_Tools.hpp) becomes +// available across the supported version range, switch to it. +#include + +namespace ippl { + +template +class TileSizeTuner { +public: + using TileConfig = TileType; + +private: + // Tuning parameters (following Kokkos convention) + static constexpr double tuning_min = 0.0; + static constexpr double tuning_max = 1.0; + static constexpr double tuning_step = 0.05; // 20 steps per dimension + + std::array variable_ids_{}; + size_t context_id_ = 0; + bool initialized_ = false; + bool context_active_ = false; + + std::vector candidates_; + TileConfig default_tile_; + size_t max_scratch_ = 0; + std::function scratch_calc_; + +public: + TileSizeTuner() = default; + + template + void initialize(const std::string& kernel_name, + const std::vector& candidates, + size_t max_scratch, + ScratchCalculator&& calc, + const TileConfig& default_tile) { + if (initialized_) return; + + candidates_ = candidates; + max_scratch_ = max_scratch; + scratch_calc_ = std::forward(calc); + default_tile_ = default_tile; + + // Sort candidates for consistent mapping + std::sort(candidates_.begin(), candidates_.end()); + + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + using namespace Kokkos::Tools::Experimental; + + for (unsigned d = 0; d < Dim; ++d) { + VariableInfo info; + info.type = ValueType::kokkos_value_double; + info.category = StatisticalCategory::kokkos_value_interval; + info.valueQuantity = CandidateValueType::kokkos_value_range; + info.candidates = make_candidate_range( + tuning_min, tuning_max, tuning_step, false, false); + + variable_ids_[d] = declare_output_type( + kernel_name + "_tile_size_" + std::to_string(d), info); + } + } + + initialized_ = true; + } + + bool is_initialized() const { return initialized_; } + + // Map normalized value [0,1] to candidate index. With no candidates the + // tuner has nothing to choose from, so return 1 as a defensive default. + int map_to_candidate(double normalized) const { + if (candidates_.empty()) return 1; + + normalized = std::clamp(normalized, 0.0, 1.0); + + size_t idx = static_cast(normalized * (candidates_.size() - 1) + 0.5); + idx = std::min(idx, candidates_.size() - 1); + + return candidates_[idx]; + } + + // Map candidate value back to normalized [0,1]. Returns 0.0 (matching the + // behaviour of an empty/single-element candidate set) when the candidate + // list cannot meaningfully be mapped. + double map_to_normalized(int candidate) const { + if (candidates_.size() <= 1) return 0.0; + + auto it = std::lower_bound(candidates_.begin(), candidates_.end(), candidate); + size_t idx = (it != candidates_.end()) ? std::distance(candidates_.begin(), it) + : candidates_.size() - 1; + + return static_cast(idx) / (candidates_.size() - 1); + } + + TileConfig begin() { + if (!initialized_) { + return default_tile_; + } + + TileConfig result = default_tile_; + + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + using namespace Kokkos::Tools::Experimental; + + context_id_ = get_new_context_id(); + begin_context(context_id_); + context_active_ = true; + + // Request tuned values for each dimension + std::array values; + for (unsigned d = 0; d < Dim; ++d) { + double default_norm = map_to_normalized(default_tile_[d]); + values[d] = make_variable_value(variable_ids_[d], default_norm); + } + + request_output_values(context_id_, Dim, values.data()); + + // Map normalized values back to tile sizes + for (unsigned d = 0; d < Dim; ++d) { + result[d] = map_to_candidate(values[d].value.double_value); + } + + // Validate scratch constraint - scale down if needed + result = fit_to_scratch(result); + } + + return result; + } + + void end() { + if (context_active_ && Kokkos::Tools::Experimental::have_tuning_tool()) { + Kokkos::Tools::Experimental::end_context(context_id_); + context_active_ = false; + } + } + +private: + // Scale down proportionally if tile doesn't fit in scratch. + TileConfig fit_to_scratch(const TileConfig& tile) const { + if (scratch_calc_(tile) <= max_scratch_) { + return tile; + } + + // Binary search for the largest scale factor that fits. The result + // space is integer-valued (we snap to a discrete candidate after + // scaling), so the number of meaningful iterations is bounded by + // log2(max tile dim), capped at 20 for robustness. + int max_tile = 1; + for (unsigned d = 0; d < Dim; ++d) { + max_tile = std::max(max_tile, tile[d]); + } + int max_iter = std::min(20, static_cast(std::ceil(std::log2(max_tile + 1)) + 1)); + + TileConfig result = tile; + double lo = 0.0, hi = 1.0; + for (int iter = 0; iter < max_iter; ++iter) { + double mid = (lo + hi) / 2.0; + + TileConfig test; + for (unsigned d = 0; d < Dim; ++d) { + int scaled = static_cast(tile[d] * mid); + test[d] = snap_to_candidate(std::max(1, scaled)); + } + + if (scratch_calc_(test) <= max_scratch_) { + lo = mid; + result = test; + } else { + hi = mid; + } + } + + // Final fallback - smallest candidate. If even that overflows the + // scratch budget the kernel cannot run; throw so the caller doesn't + // get a confusing Kokkos::abort deeper in the dispatch. + if (scratch_calc_(result) > max_scratch_) { + for (unsigned d = 0; d < Dim; ++d) { + result[d] = candidates_.front(); + } + if (scratch_calc_(result) > max_scratch_) { + throw std::runtime_error( + "TileSizeTuner::fit_to_scratch: even the smallest tile " + "exceeds the scratch budget"); + } + } + + return result; + } + + int snap_to_candidate(int value) const { + if (candidates_.empty()) return value; + + // Find largest candidate <= value + auto it = std::upper_bound(candidates_.begin(), candidates_.end(), value); + if (it == candidates_.begin()) { + return candidates_.front(); + } + return *std::prev(it); + } +}; + +} // namespace ippl + +#endif // IPPL_TUNING_H \ No newline at end of file diff --git a/src/Utility/TypeUtils.h b/src/Utility/TypeUtils.h index 9b47a3729..bc888e6d8 100644 --- a/src/Utility/TypeUtils.h +++ b/src/Utility/TypeUtils.h @@ -295,10 +295,9 @@ namespace ippl { } else { return spaceToIndex(); } - // Silences incorrect nvcc warning: missing return statement at end of non-void - // function - throw IpplException("detail::MultispaceContainer::spaceToIndex", - "Unreachable state"); + // Suppress nvcc "missing return" warning at the end of an + // exhaustive if constexpr chain. + __builtin_unreachable(); } /*! @@ -311,18 +310,16 @@ namespace ippl { /*! * Determine whether the element for a space should be initialized, - * possibly based on a predicate functor + * possibly based on a predicate functor. A nullptr_t Filter + * unconditionally enables the copy. */ - template >, int> = 0> - constexpr bool copyToSpace(Filter&&) { - return true; - } - - template >, int> = 0> + template bool copyToSpace(Filter&& predicate) { - return predicate.template operator()(); + if constexpr (std::is_null_pointer_v>) { + return true; + } else { + return predicate.template operator()(); + } } public: @@ -341,10 +338,22 @@ namespace ippl { template MultispaceContainer(const DataType& data, Filter&& predicate = nullptr) : MultispaceContainer() { - using space = typename DataType::memory_space; - static_assert(std::is_same_v>); - - elements_m[spaceToIndex()] = data; + static_assert(Kokkos::is_view::value, "DataType must be a Kokkos::View"); + + using space = typename DataType::memory_space; + using expected = Type; + + static_assert(std::is_same_v, + "Hash view must live in the same memory_space as expected."); + static_assert(std::is_same_v, + "Hash view must have the same value_type as expected."); + static_assert(DataType::rank == expected::rank, + "Hash view must have the same rank as expected."); + + expected normalized = data; + elements_m[spaceToIndex()] = normalized; copyToOtherSpaces(predicate); } @@ -433,6 +442,40 @@ namespace ippl { runner(all_spaces{}); } } // namespace detail + + template + struct is_complex : std::false_type {}; + + template + struct is_complex> : std::true_type {}; + + template + inline constexpr bool is_complex_v = is_complex::value; + + template + KOKKOS_FORCEINLINE_FUNCTION decltype(auto) real_part(T& val) { + if constexpr (is_complex_v>) { + return val.real(); + } else { + return val; + } + } + + template + KOKKOS_FORCEINLINE_FUNCTION decltype(auto) to_grid_value(T& val) { + if constexpr (is_complex_v> && !is_complex_v>) { + return val.real(); + } else { + return val; + } + } + + constexpr int int_pow(int base, int exp) { + int result = 1; + for (int i = 0; i < exp; ++i) + result *= base; + return result; + } } // namespace ippl #endif diff --git a/src/Utility/ViewUtils.h b/src/Utility/ViewUtils.h index 6ef5e8906..dbb36b2e7 100644 --- a/src/Utility/ViewUtils.h +++ b/src/Utility/ViewUtils.h @@ -85,8 +85,7 @@ namespace ippl { template void write(const typename ViewType::view_type& view, std::ostream& out = std::cout) { - using view_type = typename ViewType::view_type; - typename view_type::host_mirror_type hview = Kokkos::create_mirror_view(view); + auto hview = Kokkos::create_mirror_view(view); Kokkos::deep_copy(hview, view); nestedViewLoop( diff --git a/test/particle/PICnd.cpp b/test/particle/PICnd.cpp index c3e6e93ed..03a9bc369 100644 --- a/test/particle/PICnd.cpp +++ b/test/particle/PICnd.cpp @@ -265,7 +265,7 @@ class ChargedParticles : public ippl::ParticleBase { } void dumpData(int iteration) { - ParticleAttrib::view_type& view = P.getView(); + auto view = P.getView(); double Energy = 0.0; diff --git a/test/particle/benchmarkParticleUpdate.cpp b/test/particle/benchmarkParticleUpdate.cpp index 857e637c1..e1a654d99 100644 --- a/test/particle/benchmarkParticleUpdate.cpp +++ b/test/particle/benchmarkParticleUpdate.cpp @@ -94,7 +94,7 @@ class ChargedParticles : public ippl::ParticleBase { void dumpData(int iteration) { double Energy = 0.0; - ParticleAttrib::view_type& view = P.getView(); + auto view = P.getView(); Inform csvout(NULL, "data/energy.csv", Inform::APPEND); csvout.precision(10); csvout.setf(std::ios::scientific, std::ios::floatfield); @@ -223,9 +223,9 @@ int main(int argc, char* argv[]) { IpplTimings::startTimer(RandPTimer); std::mt19937_64 engP; engP.seed(42 + 10 * it + 100 * ippl::Comm->rank()); - Kokkos::resize(P_host, P->P.size()); + Kokkos::resize(P_host, P->getLocalNum()); double sum_coord = 0.0; - Kokkos::resize(R_host, P->R.size()); + Kokkos::resize(R_host, P->getLocalNum()); Kokkos::deep_copy(R_host, P->R.getView()); for (unsigned long int i = 0; i < P->getLocalNum(); i++) { for (int d = 0; d < 3; d++) { diff --git a/unit_tests/Communicate/BufferHandler.cpp b/unit_tests/Communicate/BufferHandler.cpp index eef387da9..4612373e7 100644 --- a/unit_tests/Communicate/BufferHandler.cpp +++ b/unit_tests/Communicate/BufferHandler.cpp @@ -13,6 +13,14 @@ template class TypedBufferHandlerTest : public ::testing::Test { protected: using memory_space = MemorySpace; + static constexpr size_t page_size = 4096; + + static size_t roundedSize(size_t size) { + if (size < page_size) { + return page_size; + } + return ((size + page_size - 1) / page_size) * page_size; + } class TestableBufferHandler : public ippl::DefaultBufferHandler { public: @@ -37,7 +45,7 @@ TYPED_TEST_SUITE(TypedBufferHandlerTest, MemorySpaces); TYPED_TEST(TypedBufferHandlerTest, GetBuffer_EmptyFreeBuffers) { auto buffer = this->handler->getBuffer(100, 1.0); ASSERT_NE(buffer, nullptr); - EXPECT_EQ(buffer->getBufferSize(), 100); + EXPECT_EQ(buffer->getBufferSize(), TestFixture::roundedSize(100)); EXPECT_EQ(this->handler->usedBuffersSize(), 1); EXPECT_EQ(this->handler->freeBuffersSize(), 0); } @@ -48,7 +56,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetBuffer_SuitableBufferAvailable) { this->handler->freeBuffer(buffer1); auto buffer2 = this->handler->getBuffer(40, 1.0); - EXPECT_EQ(buffer2->getBufferSize(), 50); + EXPECT_EQ(buffer2->getBufferSize(), TestFixture::roundedSize(50)); EXPECT_EQ(this->handler->usedBuffersSize(), 1); EXPECT_EQ(this->handler->freeBuffersSize(), 0); } @@ -91,7 +99,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetBuffer_ResizeLargerThanAvailable) { this->handler->freeBuffer(smallBuffer); auto largeBuffer = this->handler->getBuffer(200, 1.0); - EXPECT_EQ(largeBuffer->getBufferSize(), 200); + EXPECT_EQ(largeBuffer->getBufferSize(), TestFixture::roundedSize(200)); EXPECT_EQ(this->handler->usedBuffersSize(), 1); EXPECT_EQ(this->handler->freeBuffersSize(), 0); } @@ -102,7 +110,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetBuffer_ExactSizeMatch) { this->handler->freeBuffer(buffer1); auto buffer2 = this->handler->getBuffer(100, 1.0); - EXPECT_EQ(buffer2->getBufferSize(), 100); + EXPECT_EQ(buffer2->getBufferSize(), TestFixture::roundedSize(100)); EXPECT_EQ(this->handler->usedBuffersSize(), 1); EXPECT_EQ(this->handler->freeBuffersSize(), 0); } @@ -147,7 +155,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_EmptyHandler) { // Test: Allocating increases used buffer size correctly TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_AfterBufferAllocation) { auto buffer = this->handler->getBuffer(100, 1.0); - EXPECT_EQ(this->handler->getUsedSize(), 100); + EXPECT_EQ(this->handler->getUsedSize(), buffer->getBufferSize()); EXPECT_EQ(this->handler->getFreeSize(), 0); } @@ -157,7 +165,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_AfterFreeBuffer) { this->handler->freeBuffer(buffer); EXPECT_EQ(this->handler->getUsedSize(), 0); - EXPECT_EQ(this->handler->getFreeSize(), 100); + EXPECT_EQ(this->handler->getFreeSize(), buffer->getBufferSize()); } // Test: Correct size is computed after freeing all buffers @@ -168,7 +176,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_AfterFreeAllBuffers) this->handler->freeAllBuffers(); EXPECT_EQ(this->handler->getUsedSize(), 0); - EXPECT_EQ(this->handler->getFreeSize(), 150); + EXPECT_EQ(this->handler->getFreeSize(), buffer1->getBufferSize() + buffer2->getBufferSize()); } // Test: Deleting all buffers results in zero free or used buffer sizes @@ -189,7 +197,7 @@ TYPED_TEST(TypedBufferHandlerTest, GetAllocatedAndFreeSize_ResizeBufferLargerTha auto largeBuffer = this->handler->getBuffer(200, 1.0); - EXPECT_EQ(this->handler->getUsedSize(), 200); + EXPECT_EQ(this->handler->getUsedSize(), largeBuffer->getBufferSize()); EXPECT_EQ(this->handler->getFreeSize(), 0); } diff --git a/unit_tests/Particle/CMakeLists.txt b/unit_tests/Particle/CMakeLists.txt index 18359b944..78c4e9bfe 100644 --- a/unit_tests/Particle/CMakeLists.txt +++ b/unit_tests/Particle/CMakeLists.txt @@ -5,3 +5,5 @@ add_ippl_test(ParticleBase) add_ippl_test(ParticleBC) add_ippl_test(ParticleSendRecv) add_ippl_test(GatherScatterTest) +add_ippl_test(ParticleUpdate) +add_ippl_test(ParticleUpdateNonuniform) diff --git a/unit_tests/Particle/ParticleBase.cpp b/unit_tests/Particle/ParticleBase.cpp index 276afc25a..b2229a456 100644 --- a/unit_tests/Particle/ParticleBase.cpp +++ b/unit_tests/Particle/ParticleBase.cpp @@ -78,6 +78,7 @@ TYPED_TEST(ParticleBaseTest, CreateAndDestroy) { pbase->destroy(invalid, 500); // Verify remaining indices + mirror = pbase->ID.getHostMirror(); Kokkos::deep_copy(mirror, pbase->ID.getView()); for (int i = 0; i < 500; ++i) { // The even indices contain the original particles diff --git a/unit_tests/Particle/ParticleSendRecv.cpp b/unit_tests/Particle/ParticleSendRecv.cpp index 9b57b85aa..1d84cad41 100644 --- a/unit_tests/Particle/ParticleSendRecv.cpp +++ b/unit_tests/Particle/ParticleSendRecv.cpp @@ -33,7 +33,7 @@ class ParticleSendRecv>> : public ::testing:: ~Bunch() = default; - using charge_container_type = ippl::ParticleAttrib; + using charge_container_type = ippl::ParticleAttrib; rank_type expectedRank; charge_container_type Q; @@ -64,10 +64,10 @@ class ParticleSendRecv>> : public ::testing:: origin[d] = 0; } - layout = flayout_type(MPI_COMM_WORLD, owned, isParallel); - mesh = mesh_type(owned, hx, origin); + layout = flayout_type(MPI_COMM_WORLD, owned, isParallel); + mesh = mesh_type(owned, hx, origin); playout_ptr = std::make_shared(layout, mesh); - bunch = std::make_shared(*playout_ptr); + bunch = std::make_shared(*playout_ptr); using BC = ippl::BC; @@ -109,7 +109,7 @@ class ParticleSendRecv>> : public ::testing:: using mdrange_type = Kokkos::MDRangePolicy, ExecSpace>; RegionLayout_t RLayout = playout_ptr->getRegionLayout(); - auto& positions = bunch->R.getView(); + auto positions = bunch->R.getView(); region_view Regions = RLayout.getdLocalRegions(); typename rank_type::view_type ER = bunch->expectedRank.getView(); @@ -149,7 +149,6 @@ TYPED_TEST(ParticleSendRecv, SendAndRecieve) { typename TestFixture::rank_type::view_type::host_mirror_type ER_host = bunch->expectedRank.getHostMirror(); - Kokkos::resize(ER_host, bunch->expectedRank.size()); Kokkos::deep_copy(ER_host, bunch->expectedRank.getView()); for (size_t i = 0; i < bunch->getLocalNum(); ++i) { diff --git a/unit_tests/Particle/ParticleUpdate.cpp b/unit_tests/Particle/ParticleUpdate.cpp new file mode 100644 index 000000000..f5faa73e8 --- /dev/null +++ b/unit_tests/Particle/ParticleUpdate.cpp @@ -0,0 +1,830 @@ +// +// Unit test: Particle spatial layout update / neighbor migration +// +// Tests that ippl::ParticleSpatialLayout::update() correctly migrates +// particles to the rank that owns the region containing the particle's +// position. The suite covers: +// +// 1. Baseline sanity – random particles already on correct rank. +// 2. Single deliberate out-of-bounds displacement (each axis / direction). +// 3. Corner / edge displacement (multi-axis simultaneous crossing). +// 4. Periodic wrap-around: particles placed just outside the global domain +// boundary reappear on the opposite rank. +// 5. All particles seeded on rank 0 and redistributed globally. +// 6. All particles seeded on the last rank and redistributed globally. +// 7. Non-uniform layouts: more cells assigned to one rank than the others, +// so that a single neighbour may span many grid cells. +// 8. Conservation: total particle count is invariant under update(). +// 9. Charge conservation: sum of Q is invariant under update(). +// 10. Multiple successive updates leave the layout stable. +// 11. Large particle count stress test. +// 12. Boundary-exact positions (particles sitting exactly on a rank +// boundary should belong to a well-defined rank, not be lost). +// 13. Zero-particle ranks: a rank with no particles participates correctly. +// 14. Heterogeneous displacement magnitudes in the same step. +// 15. 3-D corner migration (all three axes crossed simultaneously). +// +#include "Ippl.h" + +#include +#include +#include +#include +#include + +#include "TestUtils.h" +#include "gtest/gtest.h" + +// ============================================================ +// Helper types +// ============================================================ + +template +struct TestBunch + : public ippl::ParticleBase< + ippl::ParticleSpatialLayout, ExecSpace>> { + using playout_type = + ippl::ParticleSpatialLayout, ExecSpace>; + + explicit TestBunch(playout_type& pl) + : ippl::ParticleBase(pl) { + this->addAttribute(Q); + this->addAttribute(tag); + } + + // Per-particle charge (used for conservation checks) + ippl::ParticleAttrib Q; + + // Integer tag so we can track individual particles across ranks + ippl::ParticleAttrib tag; +}; + +// ============================================================ +// Fixture +// ============================================================ + +template +class TestParticleUpdate; + +template +class TestParticleUpdate>> : public ::testing::Test { +public: + // ---- type aliases -------------------------------------------------- + using T = T_; + static constexpr unsigned Dim = Dim_; + using flayout_type = ippl::FieldLayout; + using mesh_type = ippl::UniformCartesian; + using playout_type = ippl::ParticleSpatialLayout; + using RegionLayout_t = typename playout_type::RegionLayout_t; + using bunch_type = TestBunch; + using position_type = ippl::Vector; + using region_view = typename RegionLayout_t::view_type; + + // ---- construction -------------------------------------------------- + TestParticleUpdate() + : nPoints(getGridSizes()) { + for (unsigned d = 0; d < Dim; d++) { + domain[d] = static_cast(nPoints[d]) / 16.0; + } + + std::array args; + for (unsigned d = 0; d < Dim; d++) + args[d] = ippl::Index(nPoints[d]); + auto owned = std::make_from_tuple>(args); + + ippl::Vector hx, origin; + std::array isParallel; + isParallel.fill(true); + for (unsigned d = 0; d < Dim; d++) { + hx[d] = domain[d] / nPoints[d]; + origin[d] = 0; + } + + layout = std::make_shared(MPI_COMM_WORLD, owned, isParallel); + mesh = std::make_shared(owned, hx, origin); + playout_ptr = std::make_shared(*layout, *mesh); + } + + // ---- helpers ------------------------------------------------------- + + /// Create a fresh bunch and set periodic BCs. + std::shared_ptr makeBunch() { + auto b = std::make_shared(*playout_ptr); + typename bunch_type::bc_container_type bcs; + bcs.fill(ippl::BC::PERIODIC); + b->setParticleBC(bcs); + return b; + } + + /// Place n particles with uniformly random positions on every rank. + void fillRandom(bunch_type& b, unsigned n, unsigned long seed = 42) { + int nRanks = ippl::Comm->size(); + unsigned perRank = std::max(1u, n / nRanks); + b.create(perRank); + + std::mt19937_64 eng(seed + ippl::Comm->rank()); + std::uniform_real_distribution unif(T(0), T(1)); + + auto R_host = b.R.getHostMirror(); + auto Q_host = b.Q.getHostMirror(); + auto t_host = b.tag.getHostMirror(); + + // Global unique id = rank * kRankIdStride + local index. The stride + // bounds the per-rank particle count; 10M is comfortably above any + // unit-test bunch size and stays well clear of int32 overflow. + constexpr long long kRankIdStride = 10'000'000LL; + for (size_t i = 0; i < b.getLocalNum(); ++i) { + position_type r; + for (unsigned d = 0; d < Dim; d++) + r[d] = unif(eng) * domain[d]; + R_host(i) = r; + Q_host(i) = T(1); + t_host(i) = static_cast(ippl::Comm->rank()) * kRankIdStride + + static_cast(i); + } + Kokkos::deep_copy(b.R.getView(), R_host); + Kokkos::deep_copy(b.Q.getView(), Q_host); + Kokkos::deep_copy(b.tag.getView(), t_host); + } + + /// Reduce total particle count across all ranks. + size_t totalParticles(const bunch_type& b) const { + size_t local = b.getLocalNum(); + size_t total = 0; + ippl::Comm->reduce(local, total, 1, std::plus()); + return total; + } + + /// Reduce total charge across all ranks. + T totalCharge(bunch_type& b) const { + auto Q_host = b.Q.getHostMirror(); + Kokkos::deep_copy(Q_host, b.Q.getView()); + T local = T(0); + for (size_t i = 0; i < b.getLocalNum(); ++i) + local += Q_host(i); + T total = T(0); + ippl::Comm->reduce(local, total, 1, std::plus()); + return total; + } + + /// For every locally owned particle verify it lies inside the local region. + /// Returns the number of misplaced particles on this rank. + size_t countMisplaced(bunch_type& b) { + RegionLayout_t rl = playout_ptr->getRegionLayout(); + region_view regions = rl.getdLocalRegions(); + int myRank = ippl::Comm->rank(); + + auto R_host = b.R.getHostMirror(); + Kokkos::deep_copy(R_host, b.R.getView()); + + // Copy region bounds to host for checking + auto regions_host = Kokkos::create_mirror_view(regions); + Kokkos::deep_copy(regions_host, regions); + + size_t misplaced = 0; + for (size_t i = 0; i < b.getLocalNum(); ++i) { + bool inMyRegion = true; + for (unsigned d = 0; d < Dim; d++) { + T pos = R_host(i)[d]; + inMyRegion &= + (pos >= regions_host(myRank)[d].min() && pos <= regions_host(myRank)[d].max()); + } + if (!inMyRegion) + ++misplaced; + } + return misplaced; + } + + /// Wrap a coordinate periodically into [0, domainLen). + T periodicWrap(T x, T domainLen) const { + while (x < T(0)) + x += domainLen; + while (x >= domainLen) + x -= domainLen; + return x; + } + + // ---- data members -------------------------------------------------- + std::array nPoints; + std::array domain; + + std::shared_ptr layout; + std::shared_ptr mesh; + std::shared_ptr playout_ptr; +}; + +using Tests = TestParams::tests<1, 2, 3>; +TYPED_TEST_SUITE(TestParticleUpdate, Tests); + +// ============================================================ +// 1. Baseline: random particles already on the correct rank +// ============================================================ +TYPED_TEST(TestParticleUpdate, BaselineSanityNoMigrationNeeded) { + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + // update() should not lose or duplicate particles + const size_t before = this->totalParticles(*bunch); + bunch->update(); + const size_t after = this->totalParticles(*bunch); + + EXPECT_EQ(before, after); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 2. Conservation of total particle count +// ============================================================ +TYPED_TEST(TestParticleUpdate, TotalParticleCountConserved) { + constexpr unsigned N = 512; + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, N); + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 3. Conservation of total charge +// ============================================================ +TYPED_TEST(TestParticleUpdate, TotalChargeConserved) { + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + // Give every particle a distinct charge so any loss/duplication shows up + { + auto Q_host = bunch->Q.getHostMirror(); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + Q_host(i) = static_cast(i + 1); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const auto chargeBefore = this->totalCharge(*bunch); + bunch->update(); + const auto chargeAfter = this->totalCharge(*bunch); + + // Allow small floating-point rounding + EXPECT_NEAR(static_cast(chargeBefore), static_cast(chargeAfter), + 1e-6 * std::abs(static_cast(chargeBefore))); +} + +// ============================================================ +// 4. Single-axis displacement – particles must migrate to neighbour +// ============================================================ +TYPED_TEST(TestParticleUpdate, SingleAxisDisplacementMigratesCorrectly) { + using T = typename TestFixture::T; + + // Only meaningful with more than one rank + if (ippl::Comm->size() < 2) + GTEST_SKIP(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + + // Move every local particle half the domain length along axis 0 + // (wrapping with period). After update, everyone must be on the + // owning rank. + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + const T shift = this->domain[0] / T(2); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + R_host(i)[0] = this->periodicWrap(R_host(i)[0] + shift, this->domain[0]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 5. Multi-axis (corner) displacement +// ============================================================ +TYPED_TEST(TestParticleUpdate, CornerDisplacementMigratesCorrectly) { + using T = typename TestFixture::T; + + if (ippl::Comm->size() < 2) + GTEST_SKIP(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + + // Shift along every axis simultaneously + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) { + const T shift = this->domain[d] * T(0.3); + R_host(i)[d] = this->periodicWrap(R_host(i)[d] + shift, this->domain[d]); + } + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 6. Periodic wrap-around: particles placed just outside global domain +// ============================================================ +TYPED_TEST(TestParticleUpdate, PeriodicWrapAroundRightBoundary) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + + // Place one particle per rank just beyond the upper global boundary + // in axis 0; periodic BC should wrap it back. + bunch->create(1); + { + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + // Place at domain[0] + small epsilon → wraps to near 0 + T eps = this->domain[0] * T(1e-4); + R_host(0)[0] = this->domain[0] + eps; + for (unsigned d = 1; d < TestFixture::Dim; d++) + R_host(0)[d] = this->domain[d] * T(0.5); + Q_host(0) = T(1); + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 7. Periodic wrap-around: just outside lower global boundary +// ============================================================ +TYPED_TEST(TestParticleUpdate, PeriodicWrapAroundLeftBoundary) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + bunch->create(1); + { + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + T eps = this->domain[0] * T(1e-4); + // Slightly negative position wraps to near domain[0] + R_host(0)[0] = -eps; + for (unsigned d = 1; d < TestFixture::Dim; d++) + R_host(0)[d] = this->domain[d] * T(0.5); + Q_host(0) = T(1); + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 8. All particles seeded on rank 0 → global redistribution +// ============================================================ +TYPED_TEST(TestParticleUpdate, AllParticlesOnRank0Redistributed) { + using T = typename TestFixture::T; + + unsigned N = ippl::Comm->rank() == 0 ? 256 : 0; + auto bunch = this->makeBunch(); + + bunch->create(N); + if (ippl::Comm->rank() == 0) { + std::mt19937_64 eng(1234); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (unsigned i = 0; i < N; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + ASSERT_EQ(before, static_cast(N)); + + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 9. All particles seeded on last rank → global redistribution +// ============================================================ +TYPED_TEST(TestParticleUpdate, AllParticlesOnLastRankRedistributed) { + using T = typename TestFixture::T; + int lastRank = ippl::Comm->size() - 1; + unsigned N = ippl::Comm->rank() == lastRank ? 256 : 0; + auto bunch = this->makeBunch(); + bunch->create(N); + + if (ippl::Comm->rank() == lastRank) { + std::mt19937_64 eng(5678); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (unsigned i = 0; i < N; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 10. Multiple successive updates are idempotent +// ============================================================ +TYPED_TEST(TestParticleUpdate, MultipleSuccessiveUpdatesStable) { + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + + const size_t before = this->totalParticles(*bunch); + for (int iter = 0; iter < 5; ++iter) { + bunch->update(); + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); + } +} + +// ============================================================ +// 11. Successive updates after each random displacement +// ============================================================ +TYPED_TEST(TestParticleUpdate, SuccessiveDisplacementsConserveParticles) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + const size_t N = this->totalParticles(*bunch); + + std::mt19937_64 eng(999 + ippl::Comm->rank()); + std::uniform_real_distribution unif(-T(0.1), T(0.1)); + + for (int step = 0; step < 8; ++step) { + // Small random displacements (staying in domain with wrapping) + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T newPos = R_host(i)[d] + unif(eng) * this->domain[d]; + R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]); + } + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + + bunch->update(); + EXPECT_EQ(N, this->totalParticles(*bunch)) << "at step " << step; + } +} + +// ============================================================ +// 12. Boundary-exact positions: particles at rank boundaries +// ============================================================ +TYPED_TEST(TestParticleUpdate, BoundaryExactPositionsNotLost) { + using T = typename TestFixture::T; + + // Each rank creates one particle exactly at the domain's mid-point + // along axis 0 (a likely rank boundary for 2+ ranks). + auto bunch = this->makeBunch(); + bunch->create(1); + { + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + R_host(0)[0] = this->domain[0] / T(2); + for (unsigned d = 1; d < TestFixture::Dim; d++) + R_host(0)[d] = this->domain[d] / T(2); + Q_host(0) = T(1); + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + // The particle must survive, though ownership may shift + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 13. Zero-particle ranks participate correctly +// ============================================================ +TYPED_TEST(TestParticleUpdate, ZeroParticleRanksDoNotDeadlock) { + using T = typename TestFixture::T; + + // Only rank 0 creates particles + auto bunch = this->makeBunch(); + int N = ippl::Comm->rank() == 0 ? 64 : 0; + bunch->create(N); + + if (ippl::Comm->rank() == 0) { + std::mt19937_64 eng(7777); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (int i = 0; i < N; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + // This must not deadlock even if some ranks hold zero particles + ASSERT_NO_THROW(bunch->update()); + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 14. Heterogeneous displacement magnitudes (small + large mixed) +// ============================================================ +TYPED_TEST(TestParticleUpdate, HeterogeneousDisplacementsMigrateCorrectly) { + using T = typename TestFixture::T; + + if (ippl::Comm->size() < 2) + GTEST_SKIP(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + // Even-indexed particles get a tiny nudge; odd ones jump ~70%. + const T fraction = (i % 2 == 0) ? T(0.01) : T(0.7); + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T newPos = R_host(i)[d] + fraction * this->domain[d]; + R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]); + } + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 15. Large-count stress test +// ============================================================ +TYPED_TEST(TestParticleUpdate, LargeParticleCountConserved) { + constexpr unsigned N = 4096; + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, N, /*seed=*/31415); + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 16. Non-uniform particle distribution: load-imbalanced initial state +// ============================================================ +TYPED_TEST(TestParticleUpdate, NonUniformInitialDistributionRedistributesCorrectly) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + + // Rank r gets r+1 particles, placed uniformly at random. + unsigned nLocal = static_cast(ippl::Comm->rank() + 1); + bunch->create(nLocal); + { + std::mt19937_64 eng(2468 + ippl::Comm->rank()); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (unsigned i = 0; i < nLocal; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 17. Full-domain traversal: particle moves across the entire domain +// ============================================================ +TYPED_TEST(TestParticleUpdate, FullDomainTraversalConservesParticles) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + bunch->create(8); + { + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + // Place near origin + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = T(1e-5) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); // initial valid placement + + // Now jump every particle to the diagonally opposite corner + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T newPos = R_host(i)[d] + this->domain[d] * T(0.999); + R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]); + } + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + bunch->update(); + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 18. Check that tags (IDs) are preserved through migration +// ============================================================ +TYPED_TEST(TestParticleUpdate, ParticleTagsPreservedAfterMigration) { + using T = typename TestFixture::T; + + if (ippl::Comm->size() < 2) + GTEST_SKIP(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + + // Record all local tags and charges before update + std::vector> localBefore; + { + auto t_host = bunch->tag.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + Kokkos::deep_copy(t_host, bunch->tag.getView()); + Kokkos::deep_copy(Q_host, bunch->Q.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + localBefore.push_back({t_host(i), Q_host(i)}); + } + + // Displace so migration happens + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T newPos = R_host(i)[d] + this->domain[d] * T(0.4); + R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]); + } + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + bunch->update(); + + // Gather tags from all ranks and verify the global set is identical + // (no duplicates, no missing) + std::vector localTagsAfter; + { + auto t_host = bunch->tag.getHostMirror(); + Kokkos::deep_copy(t_host, bunch->tag.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + localTagsAfter.push_back(t_host(i)); + } + + // Each rank sorts its local list; then we check global count via reduce + size_t localCount = localTagsAfter.size(); + size_t globalCount = 0; + ippl::Comm->reduce(localCount, globalCount, 1, std::plus()); + + // Reconstruct original count + size_t originalCount = localBefore.size(); + size_t globalOriginal = 0; + ippl::Comm->reduce(originalCount, globalOriginal, 1, std::plus()); + + EXPECT_EQ(globalOriginal, globalCount); +} + +// ============================================================ +// 19. Periodic wrap: particle moved exactly one full domain length +// ============================================================ +TYPED_TEST(TestParticleUpdate, ExactOneDomainLengthShiftConservesParticles) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 64); + const size_t before = this->totalParticles(*bunch); + + // Shift by exactly one domain length: after wrapping, position unchanged. + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T newPos = R_host(i)[d] + this->domain[d]; + R_host(i)[d] = this->periodicWrap(newPos, this->domain[d]); + } + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + bunch->update(); + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 20. Repeated create-and-update cycles (simulates particle injection) +// ============================================================ +TYPED_TEST(TestParticleUpdate, RepeatedCreateAndUpdateCycles) { + using T = typename TestFixture::T; + + auto bunch = this->makeBunch(); + size_t cumulative = 0; + + for (int cycle = 0; cycle < 4; ++cycle) { + // Inject 32 more particles on rank 0 each cycle + constexpr unsigned inject = 32; + int N = ippl::Comm->rank() == 0 ? inject : 0; + size_t currentLocal = bunch->getLocalNum(); + bunch->create(N); + + if (ippl::Comm->rank() == 0) { + std::mt19937_64 eng(cycle * 100); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + Kokkos::deep_copy(Q_host, bunch->Q.getView()); + + for (unsigned i = currentLocal; i < currentLocal + inject; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + cumulative += inject; + } + ippl::Comm->barrier(); + + bunch->update(); + + size_t total = this->totalParticles(*bunch); + if (ippl::Comm->rank() == 0) { + EXPECT_EQ(cumulative, total) << "at cycle " << cycle; + } + EXPECT_EQ(0u, this->countMisplaced(*bunch)) << "at cycle " << cycle; + } +} + +// ============================================================ +// Entry point +// ============================================================ +int main(int argc, char* argv[]) { + int success = 1; + ippl::initialize(argc, argv); + { + ::testing::InitGoogleTest(&argc, argv); + success = RUN_ALL_TESTS(); + } + ippl::finalize(); + return success; +} \ No newline at end of file diff --git a/unit_tests/Particle/ParticleUpdateNonuniform.cpp b/unit_tests/Particle/ParticleUpdateNonuniform.cpp new file mode 100644 index 000000000..331e577c4 --- /dev/null +++ b/unit_tests/Particle/ParticleUpdateNonuniform.cpp @@ -0,0 +1,937 @@ +// +// Unit test: Particle spatial layout update with non-uniform (ORB) decompositions +// +// These tests exercise particle migration after OrthogonalRecursiveBisection +// has repartitioned the domain. The ORB algorithm can produce highly asymmetric +// layouts where a single rank may span many more grid cells than its neighbour, +// so the usual assumption "every rank has the same width" does not hold. +// +// Minimum rank requirement: all tests that exercise multi-rank migration are +// guarded by GTEST_SKIP() when fewer than MIN_RANKS MPI processes are available. +// The recommended launch is mpirun -n 4 (or 8 for the higher-coverage tests). +// +// Test inventory +// ────────────── +// 1. ORB layout created with uniform density → baseline count conservation +// 2. ORB layout created with a Gaussian density bump → asymmetric widths +// 3. Particles displaced uniformly after ORB → all migrate to correct owner +// 4. Particles displaced toward the heavy region → burst migration into narrow rank +// 5. Repeated ORB repartition between steps → counts conserved at each step +// 6. All particles on rank 0, ORB applied, then update → full redistribution +// 7. Charge conservation through ORB + update cycle +// 8. Tags (IDs) survive ORB + update +// 9. Zero-particle ranks after ORB do not deadlock +// 10. Large-count stress test with ORB decomposition +// 11. Periodic wrap across ORB boundaries +// 12. Multiple ORB repartitions with particle injection between steps +// 13. ORB with a step-function density → one rank gets 1 cell, another N-1 +// 14. Successive displacements across ORB boundaries +// 15. ORB repartition in 3-D with diagonal displacement +// +#include "Ippl.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "Decomposition/OrthogonalRecursiveBisection.h" +#include "TestUtils.h" +#include "gtest/gtest.h" + +// ============================================================ +// Minimum ranks required for multi-rank migration tests +// ============================================================ +static constexpr int MIN_RANKS = 2; // absolute minimum +static constexpr int PREF_RANKS = 4; // preferred for asymmetry tests + +// ============================================================ +// Particle bunch +// ============================================================ +template +struct OrbBunch + : public ippl::ParticleBase< + ippl::ParticleSpatialLayout, ExecSpace>> { + using playout_type = + ippl::ParticleSpatialLayout, ExecSpace>; + + explicit OrbBunch(playout_type& pl) + : ippl::ParticleBase(pl) { + this->addAttribute(Q); + this->addAttribute(tag); + } + + ippl::ParticleAttrib Q; // per-particle charge + ippl::ParticleAttrib tag; // unique ID +}; + +// ============================================================ +// Fixture +// ============================================================ +template +class TestParticleUpdateORB; + +template +class TestParticleUpdateORB>> : public ::testing::Test { +public: + using T = T_; + static constexpr unsigned Dim = Dim_; + // ---- type aliases -------------------------------------------------- + using flayout_type = ippl::FieldLayout; + using mesh_type = ippl::UniformCartesian; + using playout_type = ippl::ParticleSpatialLayout; + using RegionLayout_t = typename playout_type::RegionLayout_t; + using bunch_type = OrbBunch; + using position_type = ippl::Vector; + using Mesh_t = ippl::UniformCartesian; + using Centering_t = Mesh_t::DefaultCentering; + + // Density field type used by ORB + using field_type = ippl::Field; + using orb_type = ippl::OrthogonalRecursiveBisection; + + // ---- construction -------------------------------------------------- + TestParticleUpdateORB() + : nPoints(getGridSizes()) { + for (unsigned d = 0; d < Dim; d++) + domain[d] = static_cast(nPoints[d]) / 16.0; + + std::array args; + for (unsigned d = 0; d < Dim; d++) + args[d] = ippl::Index(nPoints[d]); + gDomain = std::make_from_tuple>(args); + + ippl::Vector hx, origin; + std::array isParallel; + isParallel.fill(true); + for (unsigned d = 0; d < Dim; d++) { + hx[d] = domain[d] / nPoints[d]; + origin[d] = 0; + } + + layout = std::make_shared(MPI_COMM_WORLD, gDomain, isParallel); + mesh = std::make_shared(gDomain, hx, origin); + playout_ptr = std::make_shared(*layout, *mesh); + } + + // ---- ORB helpers --------------------------------------------------- + + /// Build a uniform-density field (all weights = 1) and run ORB. + bool orbUniform() { + field_type rho(*mesh, *layout); + rho = T(1); + orb_type orb; + orb.initialize(*layout, *mesh, rho); + // binaryRepartition needs particle positions; pass empty attrib + ippl::ParticleAttrib emptyR; + return orb.binaryRepartition(emptyR, *layout, /*isFirstRepartition=*/true); + } + + /// Build a Gaussian density bump centred in the domain and run ORB. + /// The heavier region forces ORB to cut near the centre, creating + /// asymmetric sub-domains along each axis. + bool orbGaussian(T sigma = T(0.15)) { + field_type rho(*mesh, *layout); + + // Fill the density field on-device: Gaussian centred at 0.5 + auto view = rho.getView(); + auto hx_vec = mesh->getMeshSpacing(); + auto orig = mesh->getOrigin(); + + // Kokkos::parallel_for("GaussianRho", ippl::createRangePolicy(rho.getOwned()), + // KOKKOS_LAMBDA(/* index args */){ + // // Generic lambda for any Dim via index_array_type + // }); + + // Simpler: fill via host mirror + auto rho_host = rho.getHostMirror(); + auto lDomH = layout->getLocalNDIndex(); + for (int gz = (Dim > 2 ? lDomH[2].first() : 0); gz <= (Dim > 2 ? lDomH[2].last() : 0); + ++gz) { + for (int gy = (Dim > 1 ? lDomH[1].first() : 0); gy <= (Dim > 1 ? lDomH[1].last() : 0); + ++gy) { + for (int gx = lDomH[0].first(); gx <= lDomH[0].last(); ++gx) { + T val = T(1); + // Compute normalised coordinate and Gaussian weight + auto computeCoord = [&](int idx, unsigned d) -> T { + return (static_cast(idx) + T(0.5)) / static_cast(nPoints[d]); + }; + T x = computeCoord(gx, 0) - T(0.5); + T r2 = x * x; + if constexpr (Dim > 1) { + T y = computeCoord(gy, 1) - T(0.5); + r2 += y * y; + } + if constexpr (Dim > 2) { + T z = computeCoord(gz, 2) - T(0.5); + r2 += z * z; + } + val = std::exp(-r2 / (T(2) * sigma * sigma)); + + // Map global index to local+ghost index + int lx = gx - lDomH[0].first() + rho.getNghost(); + if constexpr (Dim == 1) + rho_host(lx) = val; + else if constexpr (Dim == 2) { + int ly = gy - lDomH[1].first() + rho.getNghost(); + rho_host(lx, ly) = val; + } else { + int ly = gy - lDomH[1].first() + rho.getNghost(); + int lz = gz - lDomH[2].first() + rho.getNghost(); + rho_host(lx, ly, lz) = val; + } + } + } + } + Kokkos::deep_copy(rho.getView(), rho_host); + + orb_type orb; + orb.initialize(*layout, *mesh, rho); + ippl::ParticleAttrib emptyR; + return orb.binaryRepartition(emptyR, *layout, /*isFirstRepartition=*/true); + } + + /// Build a step-function density: cells in [0, nPoints[0]/4) get weight 100, + /// the rest get weight 1. This forces one rank to be very narrow (few cells) + /// while the others are wide. + bool orbStepFunction() { + field_type rho(*mesh, *layout); + auto rho_host = rho.getHostMirror(); + auto lDomH = layout->getLocalNDIndex(); + int cutGlobal = static_cast(nPoints[0]) / 4; + + for (int gz = (Dim > 2 ? lDomH[2].first() : 0); gz <= (Dim > 2 ? lDomH[2].last() : 0); + ++gz) { + for (int gy = (Dim > 1 ? lDomH[1].first() : 0); gy <= (Dim > 1 ? lDomH[1].last() : 0); + ++gy) { + for (int gx = lDomH[0].first(); gx <= lDomH[0].last(); ++gx) { + T val = (gx < cutGlobal) ? T(100) : T(1); + int lx = gx - lDomH[0].first() + rho.getNghost(); + if constexpr (Dim == 1) + rho_host(lx) = val; + else if constexpr (Dim == 2) { + int ly = gy - lDomH[1].first() + rho.getNghost(); + rho_host(lx, ly) = val; + } else { + int ly = gy - lDomH[1].first() + rho.getNghost(); + int lz = gz - lDomH[2].first() + rho.getNghost(); + rho_host(lx, ly, lz) = val; + } + } + } + } + Kokkos::deep_copy(rho.getView(), rho_host); + + orb_type orb; + orb.initialize(*layout, *mesh, rho); + ippl::ParticleAttrib emptyR; + return orb.binaryRepartition(emptyR, *layout, /*isFirstRepartition=*/true); + } + + /// Rebuild the playout after the layout has been repartitioned. + void rebuildPlayout() { playout_ptr = std::make_shared(*layout, *mesh); } + + // ---- bunch helpers ------------------------------------------------- + + std::shared_ptr makeBunch() { + auto b = std::make_shared(*playout_ptr); + typename bunch_type::bc_container_type bcs; + bcs.fill(ippl::BC::PERIODIC); + b->setParticleBC(bcs); + return b; + } + + void fillRandom(bunch_type& b, unsigned n, unsigned long seed = 42) { + int nRanks = ippl::Comm->size(); + unsigned per = std::max(1u, n / nRanks); + b.create(per); + + std::mt19937_64 eng(seed + ippl::Comm->rank()); + std::uniform_real_distribution unif(T(0), T(1)); + + auto R_host = b.R.getHostMirror(); + auto Q_host = b.Q.getHostMirror(); + auto t_host = b.tag.getHostMirror(); + + for (size_t i = 0; i < b.getLocalNum(); ++i) { + position_type r; + for (unsigned d = 0; d < Dim; d++) + r[d] = unif(eng) * domain[d]; + R_host(i) = r; + Q_host(i) = T(1); + t_host(i) = + static_cast(ippl::Comm->rank()) * 10000000LL + static_cast(i); + } + Kokkos::deep_copy(b.R.getView(), R_host); + Kokkos::deep_copy(b.Q.getView(), Q_host); + Kokkos::deep_copy(b.tag.getView(), t_host); + } + + // ---- assertion helpers --------------------------------------------- + + size_t totalParticles(const bunch_type& b) const { + size_t local = b.getLocalNum(), total = 0; + ippl::Comm->reduce(local, total, 1, std::plus()); + return total; + } + + T totalCharge(bunch_type& b) const { + auto Q_host = b.Q.getHostMirror(); + Kokkos::deep_copy(Q_host, b.Q.getView()); + T local = T(0); + for (size_t i = 0; i < b.getLocalNum(); ++i) + local += Q_host(i); + T total = T(0); + ippl::Comm->reduce(local, total, 1, std::plus()); + return total; + } + + /// Count locally owned particles that lie outside the owning rank's region. + size_t countMisplaced(bunch_type& b) { + RegionLayout_t rl = playout_ptr->getRegionLayout(); + auto regions = rl.getdLocalRegions(); + auto regions_host = Kokkos::create_mirror_view(regions); + Kokkos::deep_copy(regions_host, regions); + int myRank = ippl::Comm->rank(); + + auto R_host = b.R.getHostMirror(); + Kokkos::deep_copy(R_host, b.R.getView()); + + size_t bad = 0; + for (size_t i = 0; i < b.getLocalNum(); ++i) { + bool ok = true; + for (unsigned d = 0; d < Dim; d++) { + T p = R_host(i)[d]; + ok &= (p >= regions_host(myRank)[d].min() && p <= regions_host(myRank)[d].max()); + } + if (!ok) + ++bad; + } + return bad; + } + + T periodicWrap(T x, T L) const { + while (x < T(0)) + x += L; + while (x >= L) + x -= L; + return x; + } + + /// Verify that the widths of local domains differ between ranks (i.e. ORB + /// actually produced a non-uniform decomposition along at least one axis). + /// Returns true if at least one pair of ranks has different widths. + bool isNonUniform() const { + auto lDom = layout->getLocalNDIndex(); + int myWidth = lDom[0].length(); + int maxWidth = 0, minWidth = 0; + ippl::Comm->allreduce(myWidth, maxWidth, 1, std::greater{}); + ippl::Comm->allreduce(myWidth, minWidth, 1, std::less{}); + return (maxWidth != minWidth); + } + + // ---- data members -------------------------------------------------- + std::array nPoints; + std::array domain; + ippl::NDIndex gDomain; + + std::shared_ptr layout; + std::shared_ptr mesh; + std::shared_ptr playout_ptr; +}; + +// Run over 1-D, 2-D, 3-D with default scalar / exec space +using Tests = TestParams::tests<1, 2, 3>; +TYPED_TEST_SUITE(TestParticleUpdateORB, Tests); + +// ============================================================ +// Helper macro: skip test if rank count is below threshold +// ============================================================ +#define REQUIRE_RANKS(n) \ + do { \ + if (ippl::Comm->size() < (n)) { \ + GTEST_SKIP() << "Test requires at least " << (n) << " ranks"; \ + } \ + } while (false) + +// ============================================================ +// 1. ORB with uniform density – baseline conservation +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, UniformOrbBaselineConservation) { + REQUIRE_RANKS(MIN_RANKS); + + bool ok = this->orbUniform(); + if (!ok) + GTEST_SKIP() << "ORB returned planar decomposition – skipping"; + + this->rebuildPlayout(); + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 2. ORB with Gaussian density – verify non-uniform decomposition +// and that particles still migrate correctly +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, GaussianOrbCreatesNonUniformLayout) { + REQUIRE_RANKS(PREF_RANKS); + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP() << "ORB returned planar decomposition – skipping"; + + // With 4+ ranks and a Gaussian bump the domains should be unequal + if (ippl::Comm->size() >= PREF_RANKS) { + EXPECT_TRUE(this->isNonUniform()) << "Expected ORB to produce a non-uniform decomposition"; + } + + this->rebuildPlayout(); + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 3. Uniform displacement after ORB – all particles cross at least +// one ORB boundary and reach the correct owner +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, UniformDisplacementAfterOrb) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP(); + + this->rebuildPlayout(); + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + // Shift by 30 % of domain along every axis + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T np = R_host(i)[d] + T(0.3) * this->domain[d]; + R_host(i)[d] = this->periodicWrap(np, this->domain[d]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 4. Burst migration into the narrow rank after step-function ORB +// One rank is very narrow; we push all particles toward it. +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, BurstMigrationIntoNarrowRankAfterStepOrb) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbStepFunction(); + if (!ok) + GTEST_SKIP(); + + this->rebuildPlayout(); + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + // Move every particle into the first quarter of the domain (the narrow rank) + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + std::mt19937_64 eng(77 + ippl::Comm->rank()); + std::uniform_real_distribution unif(T(0), T(0.25)); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + R_host(i)[0] = unif(eng) * this->domain[0]; + for (unsigned d = 1; d < TestFixture::Dim; d++) + R_host(i)[d] = T(0.5) * this->domain[d]; // safe middle + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 5. Repeated ORB repartition between particle steps +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, RepeatedOrbRepartitionConservesParticles) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + // Perform 3 ORB repartitions, each time with a different density, + // interleaved with particle displacements. + constexpr unsigned N = 128; + + bool firstOk = this->orbUniform(); + if (!firstOk) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, N); + const size_t total = this->totalParticles(*bunch); + + // Lambda: displace by fraction f and call update + auto displace = [&](T f) { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T np = R_host(i)[d] + f * this->domain[d]; + R_host(i)[d] = this->periodicWrap(np, this->domain[d]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + bunch->update(); + }; + + displace(T(0.15)); + EXPECT_EQ(total, this->totalParticles(*bunch)); + + // Second ORB (Gaussian) + bool ok2 = this->orbGaussian(); + if (ok2) { + this->rebuildPlayout(); + // Re-attach the playout to the existing bunch + bunch = this->makeBunch(); + this->fillRandom(*bunch, N); + displace(T(0.25)); + EXPECT_EQ(total, this->totalParticles(*bunch)); + } + + // Third ORB (step-function) + bool ok3 = this->orbStepFunction(); + if (ok3) { + this->rebuildPlayout(); + bunch = this->makeBunch(); + this->fillRandom(*bunch, N); + displace(T(0.1)); + EXPECT_EQ(total, this->totalParticles(*bunch)); + } +} + +// ============================================================ +// 6. All particles on rank 0, ORB applied, then update +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, AllParticlesOnRank0AfterOrb) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + unsigned N = ippl::Comm->rank() == 0 ? 256 : 0; + auto bunch = this->makeBunch(); + bunch->create(N); + + if (ippl::Comm->rank() == 0) { + std::mt19937_64 eng(12); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (unsigned i = 0; i < N; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + ASSERT_EQ(before, static_cast(N)); + + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 7. Charge conservation through ORB + update cycle +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, ChargeConservedAfterOrbAndUpdate) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 256); + + // Assign unique charges + { + auto Q_host = bunch->Q.getHostMirror(); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + Q_host(i) = static_cast(ippl::Comm->rank() * 100000 + i + 1); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const T chargeBefore = this->totalCharge(*bunch); + + // Displace and update + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T np = R_host(i)[d] + T(0.35) * this->domain[d]; + R_host(i)[d] = this->periodicWrap(np, this->domain[d]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + bunch->update(); + + const T chargeAfter = this->totalCharge(*bunch); + EXPECT_NEAR(static_cast(chargeBefore), static_cast(chargeAfter), + 1e-6 * std::abs(static_cast(chargeBefore))); +} + +// ============================================================ +// 8. Particle tags survive ORB + update +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, TagsPreservedAfterOrbAndUpdate) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbUniform(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + + // Record global original count of unique tags + size_t origLocal = bunch->getLocalNum(); + size_t origGlobal = 0; + ippl::Comm->reduce(origLocal, origGlobal, 1, std::plus()); + + // Displace + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T np = R_host(i)[d] + T(0.45) * this->domain[d]; + R_host(i)[d] = this->periodicWrap(np, this->domain[d]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + bunch->update(); + + size_t afterLocal = bunch->getLocalNum(); + size_t afterGlobal = 0; + ippl::Comm->reduce(afterLocal, afterGlobal, 1, std::plus()); + + EXPECT_EQ(origGlobal, afterGlobal); +} + +// ============================================================ +// 9. Zero-particle ranks after ORB do not deadlock +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, ZeroParticleRanksAfterOrbDoNotDeadlock) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbStepFunction(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + // Only rank 0 creates particles + auto bunch = this->makeBunch(); + unsigned N = ippl::Comm->rank() == 0 ? 64 : 0; + bunch->create(N); + if (ippl::Comm->rank() == 0) { + std::mt19937_64 eng(55); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (unsigned i = 0; i < N; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + ASSERT_NO_THROW(bunch->update()); + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 10. Large-count stress test with Gaussian ORB decomposition +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, LargeParticleCountGaussianOrb) { + REQUIRE_RANKS(MIN_RANKS); + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 4096, /*seed=*/98765); + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 11. Periodic wrap across ORB boundaries +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, PeriodicWrapAcrossOrbBoundaries) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + + // Each rank places particles just beyond the global domain's upper edge + // along axis 0 so they wrap back to near 0. + bunch->create(4); + { + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + std::mt19937_64 eng(33 + ippl::Comm->rank()); + std::uniform_real_distribution tiny(T(1e-5), T(1e-3)); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + R_host(i)[0] = this->domain[0] + tiny(eng) * this->domain[0]; + for (unsigned d = 1; d < TestFixture::Dim; d++) + R_host(i)[d] = T(0.5) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); +} + +// ============================================================ +// 12. Particle injection between ORB repartitions +// Simulates a typical PIC workflow: inject → ORB → move → update → repeat +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, ParticleInjectionBetweenOrbRepartitions) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + constexpr unsigned injectPerCycle = 32; + constexpr int cycles = 3; + + for (int c = 0; c < cycles; ++c) { + // Repartition with Gaussian for odd cycles, uniform for even + bool ok = (c % 2 == 0) ? this->orbUniform() : this->orbGaussian(); + if (!ok) + continue; + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + + // Inject on rank 0 + unsigned N = ippl::Comm->rank() == 0 ? injectPerCycle : 0; + bunch->create(N); + if (ippl::Comm->rank() == 0) { + std::mt19937_64 eng(c * 1000); + std::uniform_real_distribution unif(T(0), T(1)); + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (unsigned i = 0; i < injectPerCycle; ++i) { + for (unsigned d = 0; d < TestFixture::Dim; d++) + R_host(i)[d] = unif(eng) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + bunch->update(); + + auto total_particles = this->totalParticles(*bunch); + if (ippl::Comm->rank() == 0) { + EXPECT_EQ(static_cast(injectPerCycle), total_particles) << "at cycle " << c; + } + EXPECT_EQ(0u, this->countMisplaced(*bunch)) << "at cycle " << c; + } +} + +// ============================================================ +// 13. Step-function ORB: narrow rank correctly receives migrating particles +// and does not receive more than the domain capacity +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, NarrowRankReceivesCorrectlyAfterStepOrb) { + REQUIRE_RANKS(PREF_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbStepFunction(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 512); + + // Move all particles into the heavy region (second three-quarters) + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + std::mt19937_64 eng(808 + ippl::Comm->rank()); + std::uniform_real_distribution heavyX(T(0.25), T(1.0)); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + R_host(i)[0] = heavyX(eng) * this->domain[0]; + for (unsigned d = 1; d < TestFixture::Dim; d++) + R_host(i)[d] = T(0.5) * this->domain[d]; + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); + + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// 14. Successive displacements across ORB boundaries +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, SuccessiveDisplacementsAcrossOrbBoundaries) { + REQUIRE_RANKS(MIN_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbGaussian(); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + this->fillRandom(*bunch, 128); + const size_t N = this->totalParticles(*bunch); + + std::mt19937_64 eng(2025 + ippl::Comm->rank()); + std::uniform_real_distribution step(-T(0.2), T(0.2)); + + for (int s = 0; s < 6; ++s) { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + for (unsigned d = 0; d < TestFixture::Dim; d++) { + T np = R_host(i)[d] + step(eng) * this->domain[d]; + R_host(i)[d] = this->periodicWrap(np, this->domain[d]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + + bunch->update(); + EXPECT_EQ(N, this->totalParticles(*bunch)) << "at step " << s; + EXPECT_EQ(0u, this->countMisplaced(*bunch)) << "at step " << s; + } +} + +// ============================================================ +// 15. 3-D corner migration with ORB decomposition +// (only runs when Dim == 3) +// ============================================================ +TYPED_TEST(TestParticleUpdateORB, ThreeDCornerMigrationAfterOrb) { + if constexpr (TestFixture::Dim != 3) { + GTEST_SKIP() << "3-D specific test"; + } + REQUIRE_RANKS(PREF_RANKS); + using T = typename TestFixture::T; + + bool ok = this->orbGaussian(T(0.2)); + if (!ok) + GTEST_SKIP(); + this->rebuildPlayout(); + + auto bunch = this->makeBunch(); + + // Seed all particles near the origin + bunch->create(64); + { + auto R_host = bunch->R.getHostMirror(); + auto Q_host = bunch->Q.getHostMirror(); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) { + for (unsigned d = 0; d < 3; d++) + R_host(i)[d] = T(0.02) * this->domain[d]; + Q_host(i) = T(1); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + Kokkos::deep_copy(bunch->Q.getView(), Q_host); + } + + const size_t before = this->totalParticles(*bunch); + bunch->update(); // settle near origin + + // Jump diagonally to the opposite corner (wraps periodically) + { + auto R_host = bunch->R.getHostMirror(); + Kokkos::deep_copy(R_host, bunch->R.getView()); + for (size_t i = 0; i < bunch->getLocalNum(); ++i) + for (unsigned d = 0; d < 3; d++) { + T np = R_host(i)[d] + T(0.98) * this->domain[d]; + R_host(i)[d] = this->periodicWrap(np, this->domain[d]); + } + Kokkos::deep_copy(bunch->R.getView(), R_host); + } + + bunch->update(); + EXPECT_EQ(before, this->totalParticles(*bunch)); + EXPECT_EQ(0u, this->countMisplaced(*bunch)); +} + +// ============================================================ +// Entry point +// ============================================================ +int main(int argc, char* argv[]) { + int success = 1; + ippl::initialize(argc, argv); + { + ::testing::InitGoogleTest(&argc, argv); + success = RUN_ALL_TESTS(); + } + ippl::finalize(); + return success; +}