
Commit 091f8f1

Mpi cuda (#64)
* add mpi cuda
* added cuda mpi
1 parent a65ac29 commit 091f8f1

5 files changed

Lines changed: 50 additions & 25 deletions


CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ option( BUILD_UNITY "enables unity build for faster compile times" ON )
 option( BUILD_CODE_COV "enables compiler option required for code coverage analysis" OFF )
 option( BUILD_ML "enables build with tensorflow backend access" OFF )
 option( BUILD_MPI "enables build with MPI access" OFF )
-option( BUILD_CUDA_HPC "enables CUDA backend for SN HPC solver (single GPU)" OFF )
+option( BUILD_CUDA_HPC "enables CUDA backend for SN HPC solver (MPI rank to GPU mapping)" OFF )
 #################################################

README.md

Lines changed: 4 additions & 6 deletions

@@ -142,7 +142,7 @@ singularity exec tools/singularity/kit_rt_MPI.sif \
 mpirun -np 4 ./build_singularity_mpi/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order2.cfg
 ```
 
-### 3. CPU + single GPU (OpenMP + CUDA)
+### 3. CPU + CUDA (single or multi-GPU via MPI)
 
 #### 3a) Singularity installation
 ```bash
@@ -152,11 +152,11 @@ cd ../..
 mkdir -p build_singularity_cuda
 cd build_singularity_cuda
 singularity exec --nv ../tools/singularity/kit_rt_MPI_cuda.sif \
-cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF ..
+cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF ..
 singularity exec --nv ../tools/singularity/kit_rt_MPI_cuda.sif make -j
 cd ..
 singularity exec --nv tools/singularity/kit_rt_MPI_cuda.sif \
-./build_singularity_cuda/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg
+mpirun -np 2 ./build_singularity_cuda/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg
 ```
 
 When compiled with `-DBUILD_CUDA_HPC=ON`, HPC runs use the CUDA backend if a GPU is visible, and fall back to CPU if no GPU is detected.
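
The visibility check behind this fallback boils down to a CUDA device-count query. A minimal sketch of such a check, illustrative only and not the repository's actual solver-selection code:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// True when at least one CUDA device is usable; a caller could select the
// CPU (OpenMP) path otherwise. Illustrative only -- not KiT-RT's selection logic.
bool CudaBackendAvailable() {
    int nDevices    = 0;
    cudaError_t err = cudaGetDeviceCount( &nDevices );
    return err == cudaSuccess && nDevices > 0;
}

int main() {
    std::printf( CudaBackendAvailable() ? "CUDA backend\n" : "CPU fallback\n" );
    return 0;
}
```
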
@@ -204,14 +204,12 @@ gcovr -r .. --html-details coverage.html
 ## Python API
 
 The Python interface is provided via [charm_kit](https://github.com/KiT-RT/charm_kit), allowing seamless integration into AI and outer-loop (UQ, Optimization) workflows.
-Check the corresponding readme for further info
+Check the corresponding readme for further info.
 
-## Scaling Studies
 
-Performance benchmarks and scaling plots can be found \[[here](https://doi.org/10.1145/3630001)].

include/solvers/snsolver_hpc_cuda.hpp

Lines changed: 1 addition & 1 deletion

@@ -151,7 +151,7 @@ class SNSolverHPCCUDA
     std::vector<double> _historyOutputFields; /*!< @brief Solver Output: dimensions (FieldID). */
     std::vector<std::string> _historyOutputFieldNames; /*!< @brief Names of the outputFields: dimensions (FieldID) */
 
-    // CUDA backend (single GPU for first version)
+    // CUDA backend state
     bool _cudaInitialized;
    int _cudaDeviceId;
     DeviceBuffers* _device;

src/solvers/snsolver_hpc.cu

Lines changed: 43 additions & 16 deletions

@@ -369,21 +369,14 @@ SNSolverHPCCUDA::SNSolverHPCCUDA( Config* settings ) {
         ErrorMessages::Error( "The number of processors must be less than or equal to the number of quadrature points.", CURRENT_FUNCTION );
     }
 
-    if( _numProcs == 1 ) {
-        _localNSys = _nSys;
-        _startSysIdx = 0;
-        _endSysIdx = _nSys;
-    }
-    else {
-        _localNSys = _nSys / ( _numProcs - 1 );
-        _startSysIdx = _rank * _localNSys;
-        _endSysIdx = _rank * _localNSys + _localNSys;
-
-        if( _rank == _numProcs - 1 ) {
-            _localNSys = _nSys - _startSysIdx;
-            _endSysIdx = _nSys;
-        }
-    }
+    const unsigned long numRanks = static_cast<unsigned long>( _numProcs );
+    const unsigned long rankIndex = static_cast<unsigned long>( _rank );
+    const unsigned long baseChunk = _nSys / numRanks;
+    const unsigned long remainder = _nSys % numRanks;
+
+    _localNSys = baseChunk + ( rankIndex < remainder ? 1UL : 0UL );
+    _startSysIdx = rankIndex * baseChunk + std::min( rankIndex, remainder );
+    _endSysIdx = _startSysIdx + _localNSys;
 
     // std::cout << "Rank: " << _rank << " startSysIdx: " << _startSysIdx << " endSysIdx: " << _endSysIdx << " localNSys: " << _localNSys <<
     // std::endl;
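
The new scheme splits the ordinates into contiguous, nearly equal blocks: every rank gets `_nSys / _numProcs` ordinates and the first `_nSys % _numProcs` ranks get one extra, whereas the old code divided by `_numProcs - 1` and pushed the whole remainder onto the last rank. A standalone sketch of the arithmetic, with made-up sizes that are not taken from any KiT-RT configuration:

```cpp
#include <algorithm>
#include <iostream>

int main() {
    // Illustrative sizes only: 10 ordinates distributed over 3 ranks.
    const unsigned long nSys      = 10;
    const unsigned long numRanks  = 3;
    const unsigned long baseChunk = nSys / numRanks;    // 3
    const unsigned long remainder = nSys % numRanks;    // 1

    for( unsigned long rank = 0; rank < numRanks; ++rank ) {
        const unsigned long localNSys   = baseChunk + ( rank < remainder ? 1UL : 0UL );
        const unsigned long startSysIdx = rank * baseChunk + std::min( rank, remainder );
        const unsigned long endSysIdx   = startSysIdx + localNSys;
        std::cout << "rank " << rank << ": [" << startSysIdx << ", " << endSysIdx << ") holds " << localNSys << " ordinates\n";
    }
    // Prints rank 0: [0, 4), rank 1: [4, 7), rank 2: [7, 10) -- block sizes differ by at most one.
    return 0;
}
```
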
@@ -613,9 +606,29 @@ void SNSolverHPCCUDA::InitCUDA() {
         ErrorMessages::Error( "No CUDA-capable GPU detected, but SNSolverHPCCUDA was requested.", CURRENT_FUNCTION );
     }
 
-    _cudaDeviceId = 0;    // first version: pin to one GPU
+    int localRank = 0;
+    int localSize = 1;
+#ifdef IMPORT_MPI
+    MPI_Comm localComm = MPI_COMM_NULL;
+    MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, _rank, MPI_INFO_NULL, &localComm );
+    MPI_Comm_rank( localComm, &localRank );
+    MPI_Comm_size( localComm, &localSize );
+    MPI_Comm_free( &localComm );
+#endif
+
+    _cudaDeviceId = localRank % nDevices;
     CheckCuda( cudaSetDevice( _cudaDeviceId ), "cudaSetDevice" );
 
+    if( _rank == 0 ) {
+        auto log = spdlog::get( "event" );
+        if( log ) {
+            log->info( "| CUDA backend: {} local MPI rank(s), {} visible CUDA device(s).", localSize, nDevices );
+            if( localSize > nDevices ) {
+                log->warn( "| CUDA backend: {} local MPI rank(s) exceed {} visible device(s); GPUs will be shared.", localSize, nDevices );
+            }
+        }
+    }
+
     _device = new DeviceBuffers();
 
     const std::size_t nCells = static_cast<std::size_t>( _nCells );
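
The mapping above derives each rank's node-local index from an `MPI_COMM_TYPE_SHARED` split and assigns GPUs round-robin, so ranks on a multi-GPU node spread over the devices and only share when oversubscribed. A self-contained sketch of the same idea outside the solver class; the file name and build command are illustrative only:

```cpp
// device_map.cu -- illustrative; build e.g. with `nvcc -ccbin mpicxx device_map.cu -o device_map`
#include <cstdio>
#include <cuda_runtime.h>
#include <mpi.h>

int main( int argc, char** argv ) {
    MPI_Init( &argc, &argv );

    int worldRank = 0;
    MPI_Comm_rank( MPI_COMM_WORLD, &worldRank );

    // Ranks that share a node form one communicator; the rank within it is the node-local index.
    MPI_Comm localComm = MPI_COMM_NULL;
    MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, worldRank, MPI_INFO_NULL, &localComm );
    int localRank = 0;
    MPI_Comm_rank( localComm, &localRank );
    MPI_Comm_free( &localComm );

    int nDevices = 0;
    cudaGetDeviceCount( &nDevices );

    // Round-robin assignment: GPUs are shared only when local ranks outnumber devices.
    const int deviceId = ( nDevices > 0 ) ? localRank % nDevices : -1;
    if( deviceId >= 0 ) cudaSetDevice( deviceId );

    printf( "world rank %d -> local rank %d -> CUDA device %d of %d\n", worldRank, localRank, deviceId, nDevices );

    MPI_Finalize();
    return 0;
}
```
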
@@ -805,6 +818,20 @@ void SNSolverHPCCUDA::Solve() {
             RK2AverageAndScalarFluxKernel<<<gridCells, threads>>>(
                 _nCells, _localNSys, _device->quadWeights, _device->solRK0, _device->sol, _device->scalarFlux );
             CheckCuda( cudaGetLastError(), "RK2AverageAndScalarFluxKernel launch" );
+#ifdef IMPORT_MPI
+            CheckCuda( cudaMemcpy( _scalarFlux.data(),
+                                   _device->scalarFlux,
+                                   static_cast<std::size_t>( _nCells ) * sizeof( double ),
+                                   cudaMemcpyDeviceToHost ),
+                       "download scalar flux after RK2 average" );
+            std::vector<double> tempScalarFlux( _scalarFlux );
+            MPI_Barrier( MPI_COMM_WORLD );
+            MPI_Allreduce( tempScalarFlux.data(), _scalarFlux.data(), _nCells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+            MPI_Barrier( MPI_COMM_WORLD );
+            CheckCuda(
+                cudaMemcpy( _device->scalarFlux, _scalarFlux.data(), static_cast<std::size_t>( _nCells ) * sizeof( double ), cudaMemcpyHostToDevice ),
+                "sync allreduced scalar flux after RK2 average" );
+#endif
         }
         else {
             ( _spatialOrder == 2 ) ? FluxOrder2() : FluxOrder1();
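
Because each rank integrates only its block of ordinates, the per-rank scalar fluxes are partial sums; the added code combines them by staging through the host: download, `MPI_Allreduce`, upload. A trimmed sketch of that pattern with placeholder names (not KiT-RT types), using the in-place variant instead of the commit's temporary buffer:

```cpp
#include <vector>
#include <cuda_runtime.h>
#include <mpi.h>

// Sums the per-rank partial fluxes stored in the device array d_flux (length nCells)
// across all ranks and leaves the global result on every rank's device buffer.
void AllreduceDeviceFlux( double* d_flux, int nCells ) {
    std::vector<double> h_flux( nCells );
    cudaMemcpy( h_flux.data(), d_flux, nCells * sizeof( double ), cudaMemcpyDeviceToHost );

    // MPI_IN_PLACE avoids the separate send buffer used in the solver code above.
    MPI_Allreduce( MPI_IN_PLACE, h_flux.data(), nCells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );

    cudaMemcpy( d_flux, h_flux.data(), nCells * sizeof( double ), cudaMemcpyHostToDevice );
}
```

A CUDA-aware MPI build could hand the device pointer to `MPI_Allreduce` directly and skip both copies; the host-staged path keeps the commit portable across MPI installations.
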

tools/singularity/install_kitrt_singularity_cuda.sh

Lines changed: 1 addition & 1 deletion

@@ -4,5 +4,5 @@ set -euo pipefail
 cd ../../
 mkdir -p build_singularity_cuda
 cd build_singularity_cuda
-cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF ..
+cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF ..
 make -j
