Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ option( BUILD_UNITY "enables unity build for faster compile times" ON )
option( BUILD_CODE_COV "enables compiler option required for code coverage analysis" OFF )
option( BUILD_ML "enables build with tensorflow backend access" OFF )
option( BUILD_MPI "enables build with MPI access" OFF )
option( BUILD_CUDA_HPC "enables CUDA backend for SN HPC solver (single GPU)" OFF )
option( BUILD_CUDA_HPC "enables CUDA backend for SN HPC solver (MPI rank to GPU mapping)" OFF )
#################################################


Expand Down
10 changes: 4 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ singularity exec tools/singularity/kit_rt_MPI.sif \
mpirun -np 4 ./build_singularity_mpi/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order2.cfg
```

### 3. CPU + single GPU (OpenMP + CUDA)
### 3. CPU + CUDA (single or multi-GPU via MPI)

#### 3a) Singularity installation
```bash
Expand All @@ -152,11 +152,11 @@ cd ../..
mkdir -p build_singularity_cuda
cd build_singularity_cuda
singularity exec --nv ../tools/singularity/kit_rt_MPI_cuda.sif \
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF ..
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF ..
singularity exec --nv ../tools/singularity/kit_rt_MPI_cuda.sif make -j
cd ..
singularity exec --nv tools/singularity/kit_rt_MPI_cuda.sif \
./build_singularity_cuda/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg
mpirun -np 2 ./build_singularity_cuda/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg
```

When compiled with `-DBUILD_CUDA_HPC=ON`, HPC runs use the CUDA backend when a CUDA-capable GPU is visible; note that if the CUDA solver itself is instantiated with no GPU detected, it aborts with an error, so any CPU fallback must be selected before solver construction.
Expand Down Expand Up @@ -204,14 +204,12 @@ gcovr -r .. --html-details coverage.html
## Python API

The Python interface is provided via [charm_kit](https://github.com/KiT-RT/charm_kit), allowing seamless integration into AI and outer-loop (UQ, Optimization) workflows.
Check the corresponding readme for further info
Check the corresponding readme for further info.




## Scaling Studies

Performance benchmarks and scaling plots can be found [here](https://doi.org/10.1145/3630001).



Expand Down
2 changes: 1 addition & 1 deletion include/solvers/snsolver_hpc_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ class SNSolverHPCCUDA
std::vector<double> _historyOutputFields; /*!< @brief Solver Output: dimensions (FieldID). */
std::vector<std::string> _historyOutputFieldNames; /*!< @brief Names of the outputFields: dimensions (FieldID) */

// CUDA backend (single GPU for first version)
// CUDA backend state
bool _cudaInitialized;
int _cudaDeviceId;
DeviceBuffers* _device;
Expand Down
59 changes: 43 additions & 16 deletions src/solvers/snsolver_hpc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -369,21 +369,14 @@ SNSolverHPCCUDA::SNSolverHPCCUDA( Config* settings ) {
ErrorMessages::Error( "The number of processors must be less than or equal to the number of quadrature points.", CURRENT_FUNCTION );
}

if( _numProcs == 1 ) {
_localNSys = _nSys;
_startSysIdx = 0;
_endSysIdx = _nSys;
}
else {
_localNSys = _nSys / ( _numProcs - 1 );
_startSysIdx = _rank * _localNSys;
_endSysIdx = _rank * _localNSys + _localNSys;

if( _rank == _numProcs - 1 ) {
_localNSys = _nSys - _startSysIdx;
_endSysIdx = _nSys;
}
}
const unsigned long numRanks = static_cast<unsigned long>( _numProcs );
const unsigned long rankIndex = static_cast<unsigned long>( _rank );
const unsigned long baseChunk = _nSys / numRanks;
const unsigned long remainder = _nSys % numRanks;

_localNSys = baseChunk + ( rankIndex < remainder ? 1UL : 0UL );
_startSysIdx = rankIndex * baseChunk + std::min( rankIndex, remainder );
_endSysIdx = _startSysIdx + _localNSys;

// std::cout << "Rank: " << _rank << " startSysIdx: " << _startSysIdx << " endSysIdx: " << _endSysIdx << " localNSys: " << _localNSys <<
// std::endl;
Expand Down Expand Up @@ -613,9 +606,29 @@ void SNSolverHPCCUDA::InitCUDA() {
ErrorMessages::Error( "No CUDA-capable GPU detected, but SNSolverHPCCUDA was requested.", CURRENT_FUNCTION );
}

_cudaDeviceId = 0; // first version: pin to one GPU
int localRank = 0;
int localSize = 1;
#ifdef IMPORT_MPI
MPI_Comm localComm = MPI_COMM_NULL;
MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, _rank, MPI_INFO_NULL, &localComm );
MPI_Comm_rank( localComm, &localRank );
MPI_Comm_size( localComm, &localSize );
MPI_Comm_free( &localComm );
#endif

_cudaDeviceId = localRank % nDevices;
CheckCuda( cudaSetDevice( _cudaDeviceId ), "cudaSetDevice" );

if( _rank == 0 ) {
auto log = spdlog::get( "event" );
if( log ) {
log->info( "| CUDA backend: {} local MPI rank(s), {} visible CUDA device(s).", localSize, nDevices );
if( localSize > nDevices ) {
log->warn( "| CUDA backend: {} local MPI rank(s) exceed {} visible device(s); GPUs will be shared.", localSize, nDevices );
}
}
}

_device = new DeviceBuffers();

const std::size_t nCells = static_cast<std::size_t>( _nCells );
Expand Down Expand Up @@ -805,6 +818,20 @@ void SNSolverHPCCUDA::Solve() {
RK2AverageAndScalarFluxKernel<<<gridCells, threads>>>(
_nCells, _localNSys, _device->quadWeights, _device->solRK0, _device->sol, _device->scalarFlux );
CheckCuda( cudaGetLastError(), "RK2AverageAndScalarFluxKernel launch" );
#ifdef IMPORT_MPI
CheckCuda( cudaMemcpy( _scalarFlux.data(),
_device->scalarFlux,
static_cast<std::size_t>( _nCells ) * sizeof( double ),
cudaMemcpyDeviceToHost ),
"download scalar flux after RK2 average" );
std::vector<double> tempScalarFlux( _scalarFlux );
MPI_Barrier( MPI_COMM_WORLD );
MPI_Allreduce( tempScalarFlux.data(), _scalarFlux.data(), _nCells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
MPI_Barrier( MPI_COMM_WORLD );
CheckCuda(
cudaMemcpy( _device->scalarFlux, _scalarFlux.data(), static_cast<std::size_t>( _nCells ) * sizeof( double ), cudaMemcpyHostToDevice ),
"sync allreduced scalar flux after RK2 average" );
#endif
}
else {
( _spatialOrder == 2 ) ? FluxOrder2() : FluxOrder1();
Expand Down
2 changes: 1 addition & 1 deletion tools/singularity/install_kitrt_singularity_cuda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ set -euo pipefail
cd ../../
mkdir -p build_singularity_cuda
cd build_singularity_cuda
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF ..
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF ..
make -j