diff --git a/CMakeLists.txt b/CMakeLists.txt index 894a3d99..32bab039 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ option( BUILD_UNITY "enables unity build for faster compile times" ON ) option( BUILD_CODE_COV "enables compiler option required for code coverage analysis" OFF ) option( BUILD_ML "enables build with tensorflow backend access" OFF ) option( BUILD_MPI "enables build with MPI access" OFF ) -option( BUILD_CUDA_HPC "enables CUDA backend for SN HPC solver (single GPU)" OFF ) +option( BUILD_CUDA_HPC "enables CUDA backend for SN HPC solver (MPI rank to GPU mapping)" OFF ) ################################################# diff --git a/README.md b/README.md index 987ab671..d7b1df1b 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ singularity exec tools/singularity/kit_rt_MPI.sif \ mpirun -np 4 ./build_singularity_mpi/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order2.cfg ``` -### 3. CPU + single GPU (OpenMP + CUDA) +### 3. CPU + CUDA (single or multi-GPU via MPI) #### 3a) Singularity installation ```bash @@ -152,11 +152,11 @@ cd ../.. mkdir -p build_singularity_cuda cd build_singularity_cuda singularity exec --nv ../tools/singularity/kit_rt_MPI_cuda.sif \ - cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF .. + cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF .. singularity exec --nv ../tools/singularity/kit_rt_MPI_cuda.sif make -j cd .. singularity exec --nv tools/singularity/kit_rt_MPI_cuda.sif \ - ./build_singularity_cuda/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg + mpirun -np 2 ./build_singularity_cuda/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg ``` When compiled with `-DBUILD_CUDA_HPC=ON`, HPC runs use the CUDA backend if a GPU is visible, and fall back to CPU if no GPU is detected. @@ -204,14 +204,12 @@ gcovr -r .. 
--html-details coverage.html ## Python API The Python interface is provided via [charm_kit](https://github.com/KiT-RT/charm_kit), allowing seamless integration into AI and outer-loop (UQ, Optimization) workflows. -Check the corresponding readme for further info +Check the corresponding readme for further info. -## Scaling Studies -Performance benchmarks and scaling plots can be found \[[here](https://doi.org/10.1145/3630001)]. diff --git a/include/solvers/snsolver_hpc_cuda.hpp b/include/solvers/snsolver_hpc_cuda.hpp index 5a6f73f0..651d729e 100644 --- a/include/solvers/snsolver_hpc_cuda.hpp +++ b/include/solvers/snsolver_hpc_cuda.hpp @@ -151,7 +151,7 @@ class SNSolverHPCCUDA std::vector<double> _historyOutputFields; /*!< @brief Solver Output: dimensions (FieldID). */ std::vector<std::string> _historyOutputFieldNames; /*!< @brief Names of the outputFields: dimensions (FieldID) */ - // CUDA backend (single GPU for first version) + // CUDA backend state bool _cudaInitialized; int _cudaDeviceId; DeviceBuffers* _device; diff --git a/src/solvers/snsolver_hpc.cu b/src/solvers/snsolver_hpc.cu index e016c797..c6facfca 100644 --- a/src/solvers/snsolver_hpc.cu +++ b/src/solvers/snsolver_hpc.cu @@ -369,21 +369,14 @@ SNSolverHPCCUDA::SNSolverHPCCUDA( Config* settings ) { ErrorMessages::Error( "The number of processors must be less than or equal to the number of quadrature points.", CURRENT_FUNCTION ); } - if( _numProcs == 1 ) { - _localNSys = _nSys; - _startSysIdx = 0; - _endSysIdx = _nSys; - } - else { - _localNSys = _nSys / ( _numProcs - 1 ); - _startSysIdx = _rank * _localNSys; - _endSysIdx = _rank * _localNSys + _localNSys; - - if( _rank == _numProcs - 1 ) { - _localNSys = _nSys - _startSysIdx; - _endSysIdx = _nSys; - } - } + const unsigned long numRanks = static_cast<unsigned long>( _numProcs ); + const unsigned long rankIndex = static_cast<unsigned long>( _rank ); + const unsigned long baseChunk = _nSys / numRanks; + const unsigned long remainder = _nSys % numRanks; + + _localNSys = baseChunk + ( rankIndex < remainder ? 
1UL : 0UL ); + _startSysIdx = rankIndex * baseChunk + std::min( rankIndex, remainder ); + _endSysIdx = _startSysIdx + _localNSys; // std::cout << "Rank: " << _rank << " startSysIdx: " << _startSysIdx << " endSysIdx: " << _endSysIdx << " localNSys: " << _localNSys << // std::endl; @@ -613,9 +606,29 @@ void SNSolverHPCCUDA::InitCUDA() { ErrorMessages::Error( "No CUDA-capable GPU detected, but SNSolverHPCCUDA was requested.", CURRENT_FUNCTION ); } - _cudaDeviceId = 0; // first version: pin to one GPU + int localRank = 0; + int localSize = 1; +#ifdef IMPORT_MPI + MPI_Comm localComm = MPI_COMM_NULL; + MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, _rank, MPI_INFO_NULL, &localComm ); + MPI_Comm_rank( localComm, &localRank ); + MPI_Comm_size( localComm, &localSize ); + MPI_Comm_free( &localComm ); +#endif + + _cudaDeviceId = localRank % nDevices; CheckCuda( cudaSetDevice( _cudaDeviceId ), "cudaSetDevice" ); + if( _rank == 0 ) { + auto log = spdlog::get( "event" ); + if( log ) { + log->info( "| CUDA backend: {} local MPI rank(s), {} visible CUDA device(s).", localSize, nDevices ); + if( localSize > nDevices ) { + log->warn( "| CUDA backend: {} local MPI rank(s) exceed {} visible device(s); GPUs will be shared.", localSize, nDevices ); + } + } + } + _device = new DeviceBuffers(); const std::size_t nCells = static_cast<std::size_t>( _nCells ); @@ -805,6 +818,20 @@ void SNSolverHPCCUDA::Solve() { RK2AverageAndScalarFluxKernel<<<blocks, threads>>>( _nCells, _localNSys, _device->quadWeights, _device->solRK0, _device->sol, _device->scalarFlux ); CheckCuda( cudaGetLastError(), "RK2AverageAndScalarFluxKernel launch" ); +#ifdef IMPORT_MPI + CheckCuda( cudaMemcpy( _scalarFlux.data(), + _device->scalarFlux, + static_cast<std::size_t>( _nCells ) * sizeof( double ), + cudaMemcpyDeviceToHost ), + "download scalar flux after RK2 average" ); + std::vector<double> tempScalarFlux( _scalarFlux ); + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Allreduce( tempScalarFlux.data(), _scalarFlux.data(), _nCells, MPI_DOUBLE, MPI_SUM, 
MPI_COMM_WORLD ); + MPI_Barrier( MPI_COMM_WORLD ); + CheckCuda( + cudaMemcpy( _device->scalarFlux, _scalarFlux.data(), static_cast<std::size_t>( _nCells ) * sizeof( double ), cudaMemcpyHostToDevice ), + "sync allreduced scalar flux after RK2 average" ); +#endif } else { ( _spatialOrder == 2 ) ? FluxOrder2() : FluxOrder1(); diff --git a/tools/singularity/install_kitrt_singularity_cuda.sh b/tools/singularity/install_kitrt_singularity_cuda.sh index b88fe4a1..a59653d2 100755 --- a/tools/singularity/install_kitrt_singularity_cuda.sh +++ b/tools/singularity/install_kitrt_singularity_cuda.sh @@ -4,5 +4,5 @@ set -euo pipefail cd ../../ mkdir -p build_singularity_cuda cd build_singularity_cuda -cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF .. +cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF .. make -j