diff --git a/CMakeLists.txt b/CMakeLists.txt index 894a3d99..32bab039 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ option( BUILD_UNITY "enables unity build for faster compile times" ON ) option( BUILD_CODE_COV "enables compiler option required for code coverage analysis" OFF ) option( BUILD_ML "enables build with tensorflow backend access" OFF ) option( BUILD_MPI "enables build with MPI access" OFF ) -option( BUILD_CUDA_HPC "enables CUDA backend for SN HPC solver (single GPU)" OFF ) +option( BUILD_CUDA_HPC "enables CUDA backend for SN HPC solver (MPI rank to GPU mapping)" OFF ) ################################################# diff --git a/README.md b/README.md index 987ab671..d7b1df1b 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ singularity exec tools/singularity/kit_rt_MPI.sif \ mpirun -np 4 ./build_singularity_mpi/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order2.cfg ``` -### 3. CPU + single GPU (OpenMP + CUDA) +### 3. CPU + CUDA (single or multi-GPU via MPI) #### 3a) Singularity installation ```bash @@ -152,11 +152,11 @@ cd ../.. mkdir -p build_singularity_cuda cd build_singularity_cuda singularity exec --nv ../tools/singularity/kit_rt_MPI_cuda.sif \ - cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF .. + cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF .. singularity exec --nv ../tools/singularity/kit_rt_MPI_cuda.sif make -j cd .. singularity exec --nv tools/singularity/kit_rt_MPI_cuda.sif \ - ./build_singularity_cuda/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg + mpirun -np 2 ./build_singularity_cuda/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg ``` When compiled with `-DBUILD_CUDA_HPC=ON`, HPC runs use the CUDA backend if a GPU is visible, and fall back to CPU if no GPU is detected. @@ -204,14 +204,12 @@ gcovr -r .. 
--html-details coverage.html ## Python API The Python interface is provided via [charm_kit](https://github.com/KiT-RT/charm_kit), allowing seamless integration into AI and outer-loop (UQ, Optimization) workflows. -Check the corresponding readme for further info +Check the corresponding readme for further info. -## Scaling Studies -Performance benchmarks and scaling plots can be found \[[here](https://doi.org/10.1145/3630001)]. diff --git a/include/solvers/snsolver_hpc_cuda.hpp b/include/solvers/snsolver_hpc_cuda.hpp index 5a6f73f0..651d729e 100644 --- a/include/solvers/snsolver_hpc_cuda.hpp +++ b/include/solvers/snsolver_hpc_cuda.hpp @@ -151,7 +151,7 @@ class SNSolverHPCCUDA std::vector<double> _historyOutputFields; /*!< @brief Solver Output: dimensions (FieldID). */ std::vector<std::string> _historyOutputFieldNames; /*!< @brief Names of the outputFields: dimensions (FieldID) */ - // CUDA backend (single GPU for first version) + // CUDA backend state bool _cudaInitialized; int _cudaDeviceId; DeviceBuffers* _device; diff --git a/src/solvers/snsolver_hpc.cu b/src/solvers/snsolver_hpc.cu index e016c797..c6facfca 100644 --- a/src/solvers/snsolver_hpc.cu +++ b/src/solvers/snsolver_hpc.cu @@ -369,21 +369,14 @@ SNSolverHPCCUDA::SNSolverHPCCUDA( Config* settings ) { ErrorMessages::Error( "The number of processors must be less than or equal to the number of quadrature points.", CURRENT_FUNCTION ); } - if( _numProcs == 1 ) { - _localNSys = _nSys; - _startSysIdx = 0; - _endSysIdx = _nSys; - } - else { - _localNSys = _nSys / ( _numProcs - 1 ); - _startSysIdx = _rank * _localNSys; - _endSysIdx = _rank * _localNSys + _localNSys; - - if( _rank == _numProcs - 1 ) { - _localNSys = _nSys - _startSysIdx; - _endSysIdx = _nSys; - } - } + const unsigned long numRanks = static_cast<unsigned long>( _numProcs ); + const unsigned long rankIndex = static_cast<unsigned long>( _rank ); + const unsigned long baseChunk = _nSys / numRanks; + const unsigned long remainder = _nSys % numRanks; + + _localNSys = baseChunk + ( rankIndex < remainder ? 
1UL : 0UL ); + _startSysIdx = rankIndex * baseChunk + std::min( rankIndex, remainder ); + _endSysIdx = _startSysIdx + _localNSys; // std::cout << "Rank: " << _rank << " startSysIdx: " << _startSysIdx << " endSysIdx: " << _endSysIdx << " localNSys: " << _localNSys << // std::endl; @@ -613,9 +606,29 @@ void SNSolverHPCCUDA::InitCUDA() { ErrorMessages::Error( "No CUDA-capable GPU detected, but SNSolverHPCCUDA was requested.", CURRENT_FUNCTION ); } - _cudaDeviceId = 0; // first version: pin to one GPU + int localRank = 0; + int localSize = 1; +#ifdef IMPORT_MPI + MPI_Comm localComm = MPI_COMM_NULL; + MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, _rank, MPI_INFO_NULL, &localComm ); + MPI_Comm_rank( localComm, &localRank ); + MPI_Comm_size( localComm, &localSize ); + MPI_Comm_free( &localComm ); +#endif + + _cudaDeviceId = localRank % nDevices; CheckCuda( cudaSetDevice( _cudaDeviceId ), "cudaSetDevice" ); + if( _rank == 0 ) { + auto log = spdlog::get( "event" ); + if( log ) { + log->info( "| CUDA backend: {} local MPI rank(s), {} visible CUDA device(s).", localSize, nDevices ); + if( localSize > nDevices ) { + log->warn( "| CUDA backend: {} local MPI rank(s) exceed {} visible device(s); GPUs will be shared.", localSize, nDevices ); + } + } + } + _device = new DeviceBuffers(); const std::size_t nCells = static_cast<std::size_t>( _nCells ); @@ -805,6 +818,20 @@ void SNSolverHPCCUDA::Solve() { RK2AverageAndScalarFluxKernel<<<blocks, threads>>>( _nCells, _localNSys, _device->quadWeights, _device->solRK0, _device->sol, _device->scalarFlux ); CheckCuda( cudaGetLastError(), "RK2AverageAndScalarFluxKernel launch" ); +#ifdef IMPORT_MPI + CheckCuda( cudaMemcpy( _scalarFlux.data(), + _device->scalarFlux, + static_cast<std::size_t>( _nCells ) * sizeof( double ), + cudaMemcpyDeviceToHost ), + "download scalar flux after RK2 average" ); + std::vector<double> tempScalarFlux( _scalarFlux ); + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Allreduce( tempScalarFlux.data(), _scalarFlux.data(), _nCells, MPI_DOUBLE, MPI_SUM, 
MPI_COMM_WORLD ); + MPI_Barrier( MPI_COMM_WORLD ); + CheckCuda( + cudaMemcpy( _device->scalarFlux, _scalarFlux.data(), static_cast<std::size_t>( _nCells ) * sizeof( double ), cudaMemcpyHostToDevice ), + "sync allreduced scalar flux after RK2 average" ); +#endif } else { ( _spatialOrder == 2 ) ? FluxOrder2() : FluxOrder1(); diff --git a/tools/singularity/install_kitrt_singularity_cuda.sh b/tools/singularity/install_kitrt_singularity_cuda.sh index b88fe4a1..a59653d2 100755 --- a/tools/singularity/install_kitrt_singularity_cuda.sh +++ b/tools/singularity/install_kitrt_singularity_cuda.sh @@ -4,5 +4,5 @@ set -euo pipefail cd ../../ mkdir -p build_singularity_cuda cd build_singularity_cuda -cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF .. +cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF .. make -j