diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
index be28f9aa..089cb19f 100644
--- a/.github/workflows/c-cpp.yml
+++ b/.github/workflows/c-cpp.yml
@@ -8,32 +8,145 @@ on:
 
 jobs:
   unit_tests:
-      runs-on: ubuntu-latest
-      container: kitrt/test:latest
-      environment: coverage
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}
-        OMP_NUM_THREADS: 1
-        OMP_DYNAMIC: FALSE
-        OPENBLAS_NUM_THREADS: 1
-        MKL_NUM_THREADS: 1
-
-      steps:
-      - uses: actions/checkout@v2
-
-      - name: Build code
-        run: |
-          git submodule update --init --recursive
-          mkdir build && cd build
-          cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DBUILD_ML=OFF -DBUILD_CODE_COV=ON ..
-          ninja
-
-      - name: Run unit tests
-        run: |
-          cd build
-          ./unit_tests 
-
-      - name: Code coverage
-        run: |
-          cpp-coveralls -r . -b "build/" -i "src/" -i "include/" --exclude "ext/" --gcov-options "\-lp" --verbose
+    runs-on: ubuntu-latest
+    container: kitrt/test:latest
+    defaults:
+      run:
+        working-directory: ${{ github.workspace }}
+    environment: coverage
+    env:
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }}
+      OMP_NUM_THREADS: 1
+      OMP_DYNAMIC: FALSE
+      OPENBLAS_NUM_THREADS: 1
+      MKL_NUM_THREADS: 1
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        submodules: recursive
+
+    - name: Build code
+      run: |
+        mkdir build && cd build
+        cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DBUILD_ML=OFF -DBUILD_CODE_COV=ON ..
+        ninja
+
+    - name: Run unit tests
+      run: |
+        cd build
+        ./unit_tests
+
+    - name: Code coverage
+      run: |
+        cpp-coveralls -r . -b "build/" -i "src/" -i "include/" --exclude "ext/" --gcov-options "\-lp" --verbose
+
+  rocm_build:
+    runs-on: ubuntu-latest
+    container: rocm/dev-ubuntu-22.04:7.2
+    env:
+      OMP_NUM_THREADS: 1
+      OMP_DYNAMIC: FALSE
+      OPENBLAS_NUM_THREADS: 1
+      MKL_NUM_THREADS: 1
+
+    steps:
+    - name: Install ROCm build dependencies
+      run: |
+        apt-get update
+        DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+          git \
+          libopenmpi-dev \
+          openmpi-bin \
+          libblas-dev \
+          liblapack-dev \
+          cmake \
+          ninja-build \
+          libvtk9-dev
+
+    - uses: actions/checkout@v4
+      with:
+        submodules: recursive
+
+    - name: Build ROCm target
+      working-directory: ${{ github.workspace }}
+      run: |
+        ROCM_CLANGXX="/opt/rocm-7.2.0/lib/llvm/bin/clang++"
+        if [ ! -x "${ROCM_CLANGXX}" ] && [ -x "/opt/rocm/lib/llvm/bin/clang++" ]; then
+          ROCM_CLANGXX="/opt/rocm/lib/llvm/bin/clang++"
+        fi
+        if [ ! -x "${ROCM_CLANGXX}" ] && [ -x "/opt/rocm/llvm/bin/clang++" ]; then
+          ROCM_CLANGXX="/opt/rocm/llvm/bin/clang++"
+        fi
+        if [ ! -x "${ROCM_CLANGXX}" ]; then
+          echo "Could not find ROCm clang++ compiler." >&2
+          exit 1
+        fi
+        HIP_DIR=""
+        for CANDIDATE in \
+          "/opt/rocm-7.2.0/lib/cmake/hip" \
+          "/opt/rocm/lib/cmake/hip" \
+          "/opt/rocm/cmake/hip"
+        do
+          if [ -f "${CANDIDATE}/hip-config.cmake" ] || [ -f "${CANDIDATE}/hipConfig.cmake" ]; then
+            HIP_DIR="${CANDIDATE}"
+            break
+          fi
+        done
+        if [ -z "${HIP_DIR}" ]; then
+          echo "Could not find hip package config directory." >&2
+          exit 1
+        fi
+        cmake -G Ninja -S . -B build_rocm \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DCMAKE_CXX_COMPILER="${ROCM_CLANGXX}" \
+          -DCMAKE_HIP_COMPILER="${ROCM_CLANGXX}" \
+          -Dhip_DIR="${HIP_DIR}" \
+          -DBUILD_MPI=ON \
+          -DBUILD_CUDA_HPC=OFF \
+          -DBUILD_HIP_HPC=ON \
+          -DBUILD_ML=OFF
+        cmake --build build_rocm -j2
+
+  cuda_build:
+    runs-on: ubuntu-latest
+    container: nvidia/cuda:12.4.1-devel-ubuntu22.04
+    env:
+      OMP_NUM_THREADS: 1
+      OMP_DYNAMIC: FALSE
+      OPENBLAS_NUM_THREADS: 1
+      MKL_NUM_THREADS: 1
+
+    steps:
+    - name: Install CUDA build dependencies
+      run: |
+        apt-get update
+        DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+          git \
+          libopenmpi-dev \
+          openmpi-bin \
+          libblas-dev \
+          liblapack-dev \
+          cmake \
+          ninja-build \
+          libvtk9-dev
+
+    - uses: actions/checkout@v4
+      with:
+        submodules: recursive
+
+    - name: Build CUDA target and tests
+      working-directory: ${{ github.workspace }}
+      run: |
+        cmake -G Ninja -S . -B build_cuda \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DBUILD_TESTING=ON \
+          -DBUILD_MPI=ON \
+          -DBUILD_CUDA_HPC=ON \
+          -DBUILD_HIP_HPC=OFF \
+          -DBUILD_ML=OFF \
+          -DCMAKE_CUDA_ARCHITECTURES=70
+        cmake --build build_cuda -j2
+
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 32bab039..26b0c32d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required( VERSION 3.16 )
-project( KiT-RT VERSION 0.1.0 LANGUAGES CXX )
+project( KiT-RT VERSION 1.2.0 LANGUAGES CXX )
 
 ### OPTIONS #####################################
 option( BUILD_TESTING "builds all available unit tests" OFF )
@@ -10,12 +10,16 @@ option( BUILD_CODE_COV "enables compiler option required for code coverage analy
 option( BUILD_ML "enables build with tensorflow backend access" OFF )
 option( BUILD_MPI "enables build with MPI access" OFF )
 option( BUILD_CUDA_HPC "enables CUDA backend for SN HPC solver (MPI rank to GPU mapping)" OFF )
+option( BUILD_HIP_HPC "enables HIP/ROCm backend for SN HPC solver (MPI rank to GPU mapping)" OFF )
 #################################################
 
 
 ### COMPILER ####################################
 set( CMAKE_CXX_STANDARD 17 )
 set( CMAKE_CXX_STANDARD_REQUIRED ON )
+if( BUILD_CUDA_HPC AND BUILD_HIP_HPC )
+    message( FATAL_ERROR "BUILD_CUDA_HPC and BUILD_HIP_HPC cannot both be enabled." )
+endif()
 if( BUILD_CUDA_HPC )
     enable_language( CUDA )
     if( CMAKE_VERSION VERSION_LESS 3.18 )
@@ -27,9 +31,25 @@ if( BUILD_CUDA_HPC )
     set( CMAKE_CUDA_STANDARD_REQUIRED ON )
     set( CMAKE_CUDA_EXTENSIONS OFF )
 endif()
+if( BUILD_HIP_HPC )
+    if( CMAKE_VERSION VERSION_LESS 3.21 )
+        message( FATAL_ERROR "BUILD_HIP_HPC requires CMake >= 3.21 for HIP language support." )
+    endif()
+    enable_language( HIP )
+    set( CMAKE_HIP_STANDARD 17 )
+    set( CMAKE_HIP_STANDARD_REQUIRED ON )
+    set( CMAKE_HIP_EXTENSIONS OFF )
+endif()
 set( KITRT_RELEASE_OPTIONS -march=native -w )
 set( KITRT_RELWITHDEBINFO_OPTIONS -march=native -pg -no-pie )
 set( KITRT_DEBUG_OPTIONS -Wall -Wextra -Wpedantic )
+if( BUILD_HIP_HPC AND CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|X86_64|amd64|AMD64" )
+    # ROCm clang can expose AVX-512 feature macros that trigger an incompatible
+    # Blaze SIMD floor implementation path on some hosts.
+    list( APPEND KITRT_RELEASE_OPTIONS -mno-avx512f -mno-avx512bw -mno-avx512dq )
+    list( APPEND KITRT_RELWITHDEBINFO_OPTIONS -mno-avx512f -mno-avx512bw -mno-avx512dq )
+    message( STATUS "HIP host C++ compile options: disabling AVX-512 for Blaze compatibility" )
+endif()
 if( BUILD_UNITY AND NOT BUILD_CODE_COV )
     message( STATUS "Unity build enabled" )
     set( CMAKE_UNITY_BUILD ON )
@@ -57,6 +77,14 @@ else()
     message( STATUS "CUDA HPC solver: disabled" )
 endif()
 
+if( BUILD_HIP_HPC )
+    add_compile_definitions( KITRT_ENABLE_HIP_HPC )
+    find_package( hip REQUIRED CONFIG )
+    message( STATUS "HIP HPC solver: enabled" )
+else()
+    message( STATUS "HIP HPC solver: disabled" )
+endif()
+
 message(STATUS "MPI build flag: ${BUILD_MPI}")
 if( BUILD_MPI )
     add_definitions(-DIMPORT_MPI)
@@ -113,6 +141,16 @@ if( BUILD_CUDA_HPC )
     endif()
 endif()
 
+if( BUILD_HIP_HPC )
+    if( TARGET hip::host )
+        # Link only the host runtime to avoid propagating HIP device compile flags
+        # (e.g. --offload-arch=...) into non-HIP C++ translation units.
+        list( APPEND CORE_LIBRARIES hip::host )
+    else()
+        message( FATAL_ERROR "Could not find HIP runtime target (hip::host)." )
+    endif()
+endif()
+
 
 #################################################
 
@@ -136,6 +174,10 @@ if( BUILD_CUDA_HPC )
     # imported target features inject an older fallback standard.
     set_source_files_properties( src/solvers/snsolver_hpc.cu PROPERTIES COMPILE_FLAGS "--std=c++17" )
 endif()
+if( BUILD_HIP_HPC )
+    list( APPEND SRCS "src/solvers/snsolver_hpc.hip" "include/solvers/snsolver_hpc_hip.hpp" )
+    set_source_files_properties( src/solvers/snsolver_hpc.hip PROPERTIES LANGUAGE HIP )
+endif()
 set( EXCLUDE_DIR "/gui/" )
 foreach( TMP_PATH ${SRCS} )
     string( FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND )
@@ -163,6 +205,12 @@ if( BUILD_CUDA_HPC )
         PROPERTIES CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON CUDA_EXTENSIONS OFF
     )
 endif()
+if( BUILD_HIP_HPC )
+    set_target_properties(
+        ${CMAKE_PROJECT_NAME}
+        PROPERTIES HIP_STANDARD 17 HIP_STANDARD_REQUIRED ON HIP_EXTENSIONS OFF
+    )
+endif()
 target_compile_options( ${CMAKE_PROJECT_NAME} PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:DEBUG>>:${KITRT_DEBUG_OPTIONS}>" )
 target_compile_options( ${CMAKE_PROJECT_NAME} PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:RELWITHDEBINFO>>:${KITRT_RELWITHDEBINFO_OPTIONS}>" )
 target_compile_options( ${CMAKE_PROJECT_NAME} PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:RELEASE>>:${KITRT_RELEASE_OPTIONS}>" )
@@ -204,6 +252,12 @@ if( BUILD_TESTING )
             PROPERTIES CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON CUDA_EXTENSIONS OFF
         )
     endif()
+    if( BUILD_HIP_HPC )
+        set_target_properties(
+            unit_tests
+            PROPERTIES HIP_STANDARD 17 HIP_STANDARD_REQUIRED ON HIP_EXTENSIONS OFF
+        )
+    endif()
     target_compile_options( unit_tests PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:DEBUG>>:${KITRT_DEBUG_OPTIONS}>" )
     if( BUILD_CODE_COV )
         if( CMAKE_COMPILER_IS_GNUCXX )
diff --git a/README.md b/README.md
index 9197e1e8..f38bfac3 100644
--- a/README.md
+++ b/README.md
@@ -143,9 +143,9 @@ singularity exec tools/singularity/kit_rt_MPI.sif \
   mpirun -np 4 ./build_singularity_mpi/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order2.cfg
 ```
 
-### 3. CPU + CUDA (single or multi-GPU via MPI)
+### 3. CPU + GPU backend (single or multi-GPU via MPI)
 
-#### 3a) Singularity installation
+#### 3a) CUDA (Singularity installation)
 ```bash
 cd tools/singularity
 sudo singularity build kit_rt_MPI_cuda.sif kit_rt_MPI_cuda.def
@@ -162,6 +162,23 @@ singularity exec --nv tools/singularity/kit_rt_MPI_cuda.sif \
 
 When compiled with `-DBUILD_CUDA_HPC=ON`, HPC runs use the CUDA backend if a GPU is visible, and fall back to CPU if no GPU is detected.
 
+#### 3b) ROCm/HIP (Singularity installation)
+```bash
+cd tools/singularity
+sudo singularity build kit_rt_MPI_rocm72.sif kit_rt_MPI_rocm72.def
+cd ../..
+mkdir -p build_singularity_rocm72
+cd build_singularity_rocm72
+singularity exec --rocm ../tools/singularity/kit_rt_MPI_rocm72.sif \
+  cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=OFF -DBUILD_HIP_HPC=ON -DBUILD_ML=OFF ..
+singularity exec --rocm ../tools/singularity/kit_rt_MPI_rocm72.sif make -j
+cd ..
+singularity exec --rocm tools/singularity/kit_rt_MPI_rocm72.sif \
+  mpirun -np 2 ./build_singularity_rocm72/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg
+```
+
+When compiled with `-DBUILD_HIP_HPC=ON`, HPC runs use the HIP backend if a ROCm-capable GPU is visible, and fall back to CPU if no GPU is detected.
+
 ### 4. Build with TensorFlow backend (CPU + OpenMP only)
 
 ```bash
diff --git a/examples/configs/readme.md b/examples/configs/readme.md
deleted file mode 100644
index f1fa1529..00000000
--- a/examples/configs/readme.md
+++ /dev/null
@@ -1,3 +0,0 @@
-## Attention
- Please copy the configuration, that you want to simulate, from this folder to its parent directory. 
- 
diff --git a/include/solvers/snsolver_hpc_hip.hpp b/include/solvers/snsolver_hpc_hip.hpp
new file mode 100644
index 00000000..6789b423
--- /dev/null
+++ b/include/solvers/snsolver_hpc_hip.hpp
@@ -0,0 +1,246 @@
+#ifndef SNSOLVERHPCHIP_H
+#define SNSOLVERHPCHIP_H
+
+// include Matrix, Vector definitions
+#include "common/globalconstants.hpp"
+#include "common/typedef.hpp"
+
+// externals
+#include "spdlog/spdlog.h"
+
+// Forward Declarations
+class Config;
+class Mesh;
+class ProblemBase;
+
+/*! @brief Base class for all solvers. */
+class SNSolverHPCHIP
+{
+
+  private:
+    struct DeviceBuffers;
+
+    int _rank;
+    int _numProcs;
+    unsigned long _localNSys;
+    unsigned long _startSysIdx;
+    unsigned long _endSysIdx;
+
+    double _curSimTime;/*!< @brief current in-simulation time after current iteration     */
+    double _currTime;  /*!< @brief wall-time after current iteration */
+    Config* _settings; /*!< @brief config class for global information */
+    Mesh* _mesh;
+    ProblemBase* _problem;
+
+    // Time
+    unsigned long _nIter; /*!< @brief number of time steps, for non CSD, this equals _nEnergies, for _csd, _maxIter =_nEnergies-1*/
+    double _dT;           /*!< @brief energy/time step size */
+    int _idx_start_iter;  /*!< @brief index of first iteration */
+    // Mesh related members, memory optimized
+    unsigned long _nCells; /*!< @brief number of spatial cells */
+    unsigned long _nSys;   /*!< @brief number of equations in the transport system, i.e. num quad pts */
+    unsigned long _nq;     /*!< @brief number of quadrature points */
+    unsigned long _nDim;
+    unsigned long _nNbr;
+    unsigned long _nNodes;
+
+    std::vector<double> _areas;         /*!< @brief surface area of all spatial cells,
+                                           dim(_areas) = _NCells */
+    std::vector<double> _normals;       /*!< @brief edge normals multiplied by edge length,
+                                                        dim(_normals) = (_NCells,nEdgesPerCell,spatialDim) */
+    std::vector<unsigned> _neighbors;   /*!< @brief edge neighbor cell ids, dim(_neighbors) = (_NCells,nEdgesPerCell) */
+    std::vector<double> _cellMidPoints; /*!< @brief dim _nCells x _dim */
+
+    std::vector<double> _interfaceMidPoints;             /*!< @brief dim: _nCells x _nEdgesPerCell x _dim */
+    std::vector<BOUNDARY_TYPE> _cellBoundaryTypes;       /*!< @brief dim: _nCells x _nEdgesPerCell x _dim */
+    std::map<unsigned, std::vector<double>> _ghostCells; /*!< @brief Vector of ghost cells for boundary conditions. CAN BE MORE EFFICIENT */
+    std::vector<double> _relativeInterfaceMidPt;         /*!< @brief dim _nCells * _nNbr * _nDim */
+    std::vector<double> _relativeCellVertices;           /*!< @brief dim _nCells * _nNbr * _nDim */
+
+    std::map<unsigned, bool> _ghostCellsReflectingY; /*!< map that indicates if a ghostcell has a fixed value or is a mirroring boundary */
+    std::map<unsigned, bool> _ghostCellsReflectingX; /*!< map that indicates if a ghostcell has a fixed value or is a mirroring boundary */
+    std::vector<unsigned> _quadratureYReflection;    /*!< map that gives a Reflection against the y axis for the velocity ordinates */
+    std::vector<unsigned> _quadratureXReflection;    /*!< map that gives a Reflection against the y axis for the velocity ordinates */
+
+    unsigned _temporalOrder;      /*!< @brief temporal order (current: 1 & 2) */
+    unsigned _spatialOrder;       /*!< @brief spatial order (current: 1 & 2) */
+    std::vector<double> _solDx;   /*!< @brief dim = _nCells x _nSys x _dim*/
+    std::vector<double> _limiter; /*!< @brief dim = _nCells x _nSys */
+
+    // Scattering, absorption and source
+    std::vector<double> _sigmaS;           /*!< @brief dim: _nCells */
+    std::vector<double> _sigmaT;           /*!< @brief dim: _nCells */
+    std::vector<double> _source;           /*!< @brief dim: _nCells x _nSys */
+    std::vector<double> _scatteringKernel; /*!< @brief dim: _nSys x _nSys  */
+
+    // quadrature related numbers
+    std::vector<double> _quadPts;     /*!< @brief dim: _nSys x _dim*/
+    std::vector<double> _quadWeights; /*!< @brief dim: _nSys*/
+
+    // Solution related members
+    std::vector<double> _sol;  /*!< @brief dim = _nCells x _nSys */
+    std::vector<double> _flux; /*!< @brief dim = _nCells x _nSys */
+
+    // Output related members
+    std::vector<double> _scalarFlux;         /*!< @brief dim = _nCells  */
+    std::vector<double> _scalarFluxPrevIter; /*!< @brief scalar flux snapshot before current solver iteration */
+
+    // Lattice QOIS
+    unsigned _nOutputMoments;
+
+    double _mass;
+    double _rmsFlux;
+    double _curAbsorptionLattice;    /*!< @brief Absorption of particles at Lattice checkerboard regions at current time step */
+    double _totalAbsorptionLattice;  /*!< @brief Absorption of particles at Lattice checkerboard regions integrated until current time step */
+    double _curMaxAbsorptionLattice; /*!< @brief Maximum pointwise absorption of particles at Lattice checkerboard regions  until current time step */
+    double _curScalarOutflow;        /*!< @brief Outflow over whole boundary at current time step */
+    double _totalScalarOutflow;      /*!< @brief Outflow over whole boundary integrated until current time step */
+    double _curMaxOrdinateOutflow;   /*!< @brief Maximum ordinate-wise ouftlow  over boundary over all time steps */
+    std::vector<double> _localMaxOrdinateOutflow; /*!< @brief Maximum ordinate-wise ouftlow  over boundary over all time steps */
+    double _curScalarOutflowPeri1;                /*!< @brief Outflow over whole boundary at current time step */
+    double _totalScalarOutflowPeri1;              /*!< @brief Outflow over whole boundary integrated until current time step */
+    double _curScalarOutflowPeri2;                /*!< @brief Outflow over whole boundary at current time step */
+    double _totalScalarOutflowPeri2;              /*!< @brief Outflow over whole boundary integrated until current time step */
+    // helper
+    std::map<unsigned, std::vector<unsigned>> _cellsLatticePerimeter1;
+    std::map<unsigned, std::vector<unsigned>> _cellsLatticePerimeter2;
+    std::vector<bool> _isPerimeterLatticeCell1;
+    std::vector<bool> _isPerimeterLatticeCell2;
+
+    // Hohlraum QOIS
+    double _totalAbsorptionHohlraumCenter;
+    double _totalAbsorptionHohlraumVertical;
+    double _totalAbsorptionHohlraumHorizontal;
+    double _curAbsorptionHohlraumCenter;
+    double _curAbsorptionHohlraumVertical;
+    double _curAbsorptionHohlraumHorizontal;
+    double _varAbsorptionHohlraumGreen;
+
+    std::vector<std::vector<unsigned>> _probingCellsHohlraum; /*!< @brief Indices of cells that contain a probing sensor */
+    std::vector<double> _probingMoments;                      /*!< @brief Solution Momnets at the probing cells that contain a probing sensor */
+    unsigned _probingMomentsTimeIntervals;                    /*!< @brief Solution Momnets at the probing cells that contain a probing sensor */
+
+    unsigned _nProbingCellsLineGreen;               /*!< @brief Number of sampling cells that contain a probing sensor for the sliding window */
+    std::vector<unsigned> _probingCellsLineGreen;   /*!< @brief Indices of cells that contain a probing sensor for the sliding window */
+    std::vector<double> _absorptionValsLineSegment; /*!< @brief Avg Absorption value at the sampleing points of lineGreen */
+
+    unsigned _nProbingCellsBlocksGreen;
+    std::vector<std::vector<unsigned>> _probingCellsBlocksGreen; /*!< @brief Indices of cells that contain a probing sensor blocks */
+    std::vector<double> _absorptionValsBlocksGreen;              /*!< @brief Avg Absorption value at the sampleing blocks of lineGreen */
+
+    // Design parameters
+    std::vector<double> _cornerUpperLeftGreen; /*!< @brief Coord of corner of the green area (minus thickness/2 of it) relative to the green center */
+    std::vector<double> _cornerLowerLeftGreen; /*!< @brief Coord of corner of the green area (minus thickness/2 of it) relative to the green center */
+    std::vector<double>
+        _cornerUpperRightGreen; /*!< @brief Coord of corner of the green area (minus thickness/2 of it) relative to the green center */
+    std::vector<double>
+        _cornerLowerRightGreen; /*!< @brief Coord of corner of the green area (minus thickness/2 of it) relative to the green center */
+
+    double _thicknessGreen;           /*!< @brief thickness of the green area */
+    std::vector<double> _centerGreen; /*!< @brief Center of the Hohlraum   */
+
+    // Output
+    std::vector<std::vector<std::vector<double>>> _outputFields; /*!< @brief Solver Output: dimensions
+                   (GroupID,FieldID,CellID).*/
+    std::vector<std::vector<std::string>> _outputFieldNames;     /*!< @brief Names of the outputFields: dimensions
+                           (GroupID,FieldID) */
+
+    std::vector<double> _screenOutputFields;          /*!< @brief Solver Output: dimensions (FieldID). */
+    std::vector<std::string> _screenOutputFieldNames; /*!< @brief Names of the outputFields: dimensions (FieldID) */
+
+    std::vector<double> _historyOutputFields;          /*!< @brief Solver Output: dimensions (FieldID). */
+    std::vector<std::string> _historyOutputFieldNames; /*!< @brief Names of the outputFields: dimensions (FieldID) */
+
+    // HIP backend state
+    bool _hipInitialized;
+    int _hipDeviceId;
+    DeviceBuffers* _device;
+    std::vector<int> _cellBoundaryTypesInt;
+    std::vector<double> _ghostCellValues;
+
+    // ---- Member functions ----
+
+    // Solver
+    void FluxOrder1();
+    void FluxOrder2();
+
+    void FVMUpdate();
+
+    void IterPostprocessing();
+
+    /*! @brief Sets vector of ghost cell values according to boundary conditions */
+    void SetGhostCells();
+
+    // Helper
+    /*! @brief ComputeTimeStep calculates the maximal stable time step using the
+    cfl number
+    @param cfl Courant-Friedrichs-Levy condition number */
+    double ComputeTimeStep( double cfl ) const;
+
+    // IO
+    /*! @brief Initializes the output groups and fields of this solver and names
+     * the fields */
+    void PrepareVolumeOutput();
+    /*! @brief Function that prepares VTK export and csv export of the current
+       solver iteration
+        @param idx_iter  current (pseudo) time iteration */
+    void WriteVolumeOutput( unsigned idx_iter );
+    /*! @brief Save Output solution at given energy (pseudo time) to VTK file.
+       Write frequency is given by option VOLUME_OUTPUT_FREQUENCY. Always prints
+       last iteration without iteration affix.
+        @param idx_iter  current (pseudo) time iteration */
+    void PrintVolumeOutput( int idx_iter );
+    /*! @brief Initialized the output fields and their Names for the screenoutput
+     */
+    void PrepareScreenOutput();
+
+    /*! @brief Function that writes screen and history output fields
+        @param idx_iter  current (pseudo) time iteration */
+    void WriteScalarOutput( unsigned idx_iter );
+    /*! @brief Prints ScreenOutputFields to Screen and to logger. Write frequency
+       is given by option SCREEN_OUTPUT_FREQUENCY. Always prints last iteration.
+        @param idx_iter  current (pseudo) time iteration */
+    void PrintScreenOutput( unsigned idx_iter );
+    /*! @brief Initialized the historyOutputFields and their Names for history
+       output. Write frequency is given by option HISTORY_OUTPUT_FREQUENCY. Always
+       prints last iteration. */
+    void PrepareHistoryOutput();
+    /*! @brief Prints HistoryOutputFields to logger
+        @param idx_iter  current (pseudo) time iteration */
+    void PrintHistoryOutput( unsigned idx_iter );
+    /*! @brief Pre Solver Screen and Logger Output */
+    void DrawPreSolverOutput();
+    /*! @brief Post Solver Screen and Logger Output */
+    void DrawPostSolverOutput();
+
+    // Helper
+    unsigned long Idx2D( unsigned long idx1, unsigned long idx2, unsigned long len2 );
+    unsigned long Idx3D( unsigned long idx1, unsigned long idx2, unsigned long idx3, unsigned long len2, unsigned long len3 );
+    bool IsAbsorptionLattice( double x, double y ) const;
+    void ComputeCellsPerimeterLattice();
+
+    void SetProbingCellsLineGreen();
+    void ComputeQOIsGreenProbingLine();
+    std::vector<unsigned> linspace2D( const std::vector<double>& start, const std::vector<double>& end, unsigned num_points );
+
+    void InitHIP();
+    void UploadStaticDataToDevice();
+    void UploadStateToDevice();
+    void DownloadStateFromDevice();
+    void FreeHIP();
+
+  public:
+    /*! @brief Solver constructor
+     *  @param settings config class that stores all needed config information */
+    SNSolverHPCHIP( Config* settings );
+
+    ~SNSolverHPCHIP();
+
+    /*! @brief Solve functions runs main iteration loop. Components of the solve
+     * loop are pure  and subclassed by the child solvers.  */
+    void Solve();
+
+    /*! @brief Save Output solution to VTK file */
+    void PrintVolumeOutput() const {};    // Only for debugging purposes.
+};
+#endif    // SNSOLVERHPCHIP_H
diff --git a/src/main.cpp b/src/main.cpp
index 76d8f63f..02188db7 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -18,6 +18,10 @@
 #include <cuda_runtime.h>
 #include "solvers/snsolver_hpc_cuda.hpp"
 #endif
+#ifdef KITRT_ENABLE_HIP_HPC
+#include <hip/hip_runtime.h>
+#include "solvers/snsolver_hpc_hip.hpp"
+#endif
 #include "solvers/solverbase.hpp"
 
 #ifdef BUILD_GUI
@@ -63,7 +67,7 @@ int main( int argc, char** argv ) {
     else {
         // Build solver
         if( config->GetHPC() ) {
-#ifdef KITRT_ENABLE_CUDA_HPC
+#if defined( KITRT_ENABLE_CUDA_HPC )
             int deviceCount = 0;
             if( cudaGetDeviceCount( &deviceCount ) == cudaSuccess && deviceCount > 0 ) {
                 SNSolverHPCCUDA* solver = new SNSolverHPCCUDA( config );
@@ -75,6 +79,18 @@ int main( int argc, char** argv ) {
                 solver->Solve();
                 delete solver;
             }
+#elif defined( KITRT_ENABLE_HIP_HPC )
+            int deviceCount = 0;
+            if( hipGetDeviceCount( &deviceCount ) == hipSuccess && deviceCount > 0 ) {
+                SNSolverHPCHIP* solver = new SNSolverHPCHIP( config );
+                solver->Solve();
+                delete solver;
+            }
+            else {
+                SNSolverHPC* solver = new SNSolverHPC( config );
+                solver->Solve();
+                delete solver;
+            }
 #else
             SNSolverHPC* solver = new SNSolverHPC( config );
             solver->Solve();
diff --git a/src/solvers/snsolver_hpc.hip b/src/solvers/snsolver_hpc.hip
new file mode 100644
index 00000000..3418da0b
--- /dev/null
+++ b/src/solvers/snsolver_hpc.hip
@@ -0,0 +1,2083 @@
+#ifdef IMPORT_MPI
+#include <mpi.h>
+#endif
+#include "common/config.hpp"
+#include "common/io.hpp"
+#include "common/mesh.hpp"
+#include "kernels/scatteringkernelbase.hpp"
+#include "problems/problembase.hpp"
+#include "quadratures/quadraturebase.hpp"
+#include "solvers/snsolver_hpc_hip.hpp"
+#include "toolboxes/textprocessingtoolbox.hpp"
+#include <algorithm>
+#include <cassert>
+#include <hip/hip_runtime.h>
+#include <limits>
+#include <string>
+
+struct SNSolverHPCHIP::DeviceBuffers {
+    double* areas                = nullptr;
+    double* normals              = nullptr;
+    unsigned* neighbors          = nullptr;
+    int* boundaryTypes           = nullptr;
+    double* ghostCellValues      = nullptr;
+    double* cellMidPoints        = nullptr;
+    double* interfaceMidPoints   = nullptr;
+    double* relativeInterfaceMid = nullptr;
+    double* relativeCellVertices = nullptr;
+
+    double* sigmaS      = nullptr;
+    double* sigmaT      = nullptr;
+    double* source      = nullptr;
+    double* quadPts     = nullptr;
+    double* quadWeights = nullptr;
+
+    double* sol        = nullptr;
+    double* solRK0     = nullptr;
+    double* flux       = nullptr;
+    double* scalarFlux = nullptr;
+    double* solDx      = nullptr;
+    double* limiter    = nullptr;
+};
+
+namespace {
+
+constexpr int HIP_BLOCK_SIZE = 256;
+
+inline void CheckHip( hipError_t status, const std::string& where ) {
+    if( status != hipSuccess ) {
+        ErrorMessages::Error( "HIP error in " + where + ": " + std::string( hipGetErrorString( status ) ), where );
+    }
+}
+
+template <typename T> inline void AllocateDeviceArray( T** ptr, std::size_t count, const std::string& name ) {
+    if( count == 0 ) {
+        *ptr = nullptr;
+        return;
+    }
+    CheckHip( hipMalloc( reinterpret_cast<void**>( ptr ), count * sizeof( T ) ), "hipMalloc(" + name + ")" );
+}
+
+template <typename T> inline void FreeDeviceArray( T*& ptr, const std::string& name ) {
+    if( ptr == nullptr ) return;
+    CheckHip( hipFree( ptr ), "hipFree(" + name + ")" );
+    ptr = nullptr;
+}
+
+__device__ __forceinline__ unsigned long Idx2DDevice( unsigned long idx1, unsigned long idx2, unsigned long len2 ) { return idx1 * len2 + idx2; }
+
+__device__ __forceinline__ unsigned long
+Idx3DDevice( unsigned long idx1, unsigned long idx2, unsigned long idx3, unsigned long len2, unsigned long len3 ) {
+    return idx1 * len2 * len3 + idx2 * len3 + idx3;
+}
+
+__global__ void FluxOrder1Kernel( unsigned long nCells,
+                                  unsigned long localNSys,
+                                  unsigned long nNbr,
+                                  unsigned long nDim,
+                                  const double* __restrict__ quadPts,
+                                  const double* __restrict__ normals,
+                                  const unsigned* __restrict__ neighbors,
+                                  const int* __restrict__ boundaryTypes,
+                                  const double* __restrict__ ghostCellValues,
+                                  const double* __restrict__ sol,
+                                  double* flux,
+                                  int neumannBoundary ) {
+    const unsigned long linearIdx = static_cast<unsigned long>( blockIdx.x ) * blockDim.x + threadIdx.x;
+    const unsigned long nCellSys  = nCells * localNSys;
+    if( linearIdx >= nCellSys ) return;
+
+    const unsigned long idxCell = linearIdx / localNSys;
+    const unsigned long idxSys  = linearIdx % localNSys;
+
+    double localFlux = 0.0;
+    for( unsigned long idxNbr = 0; idxNbr < nNbr; ++idxNbr ) {
+        const unsigned nbrCell  = neighbors[Idx2DDevice( idxCell, idxNbr, nNbr )];
+        const double localInner = quadPts[Idx2DDevice( idxSys, 0, nDim )] * normals[Idx3DDevice( idxCell, idxNbr, 0, nNbr, nDim )] +
+                                  quadPts[Idx2DDevice( idxSys, 1, nDim )] * normals[Idx3DDevice( idxCell, idxNbr, 1, nNbr, nDim )];
+
+        if( boundaryTypes[idxCell] == neumannBoundary && nbrCell == nCells ) {
+            localFlux += ( localInner > 0.0 ) ? localInner * sol[Idx2DDevice( idxCell, idxSys, localNSys )]
+                                              : localInner * ghostCellValues[Idx2DDevice( idxCell, idxSys, localNSys )];
+        }
+        else {
+            localFlux += ( localInner > 0.0 ) ? localInner * sol[Idx2DDevice( idxCell, idxSys, localNSys )]
+                                              : localInner * sol[Idx2DDevice( static_cast<unsigned long>( nbrCell ), idxSys, localNSys )];
+        }
+    }
+    flux[Idx2DDevice( idxCell, idxSys, localNSys )] = localFlux;
+}
+
+__global__ void FluxOrder2SlopeKernel( unsigned long nCells,
+                                       unsigned long localNSys,
+                                       unsigned long nNbr,
+                                       unsigned long nDim,
+                                       const double* __restrict__ areas,
+                                       const double* __restrict__ normals,
+                                       const unsigned* __restrict__ neighbors,
+                                       const int* __restrict__ boundaryTypes,
+                                       const double* __restrict__ sol,
+                                       double* solDx,
+                                       double* limiter,
+                                       int noneBoundary ) {
+    const unsigned long linearIdx = static_cast<unsigned long>( blockIdx.x ) * blockDim.x + threadIdx.x;
+    const unsigned long nCellSys  = nCells * localNSys;
+    if( linearIdx >= nCellSys ) return;
+
+    const unsigned long idxCell = linearIdx / localNSys;
+    const unsigned long idxSys  = linearIdx % localNSys;
+
+    const unsigned long limiterIdx = Idx2DDevice( idxCell, idxSys, localNSys );
+    const unsigned long solDxXIdx  = Idx3DDevice( idxCell, idxSys, 0, localNSys, nDim );
+    const unsigned long solDxYIdx  = Idx3DDevice( idxCell, idxSys, 1, localNSys, nDim );
+
+    if( boundaryTypes[idxCell] != noneBoundary ) {
+        limiter[limiterIdx] = 0.0;
+        solDx[solDxXIdx]    = 0.0;
+        solDx[solDxYIdx]    = 0.0;
+        return;
+    }
+
+    double slopeX        = 0.0;
+    double slopeY        = 0.0;
+    const double thisSol = sol[Idx2DDevice( idxCell, idxSys, localNSys )];
+
+    for( unsigned long idxNbr = 0; idxNbr < nNbr; ++idxNbr ) {
+        const unsigned nbrCell = neighbors[Idx2DDevice( idxCell, idxNbr, nNbr )];
+        const double avgVal    = 0.5 * ( thisSol + sol[Idx2DDevice( static_cast<unsigned long>( nbrCell ), idxSys, localNSys )] );
+        slopeX += avgVal * normals[Idx3DDevice( idxCell, idxNbr, 0, nNbr, nDim )];
+        slopeY += avgVal * normals[Idx3DDevice( idxCell, idxNbr, 1, nNbr, nDim )];
+    }
+
+    slopeX /= areas[idxCell];
+    slopeY /= areas[idxCell];
+
+    limiter[limiterIdx] = 1.0;
+    solDx[solDxXIdx]    = slopeX;
+    solDx[solDxYIdx]    = slopeY;
+}
+
+__global__ void FluxOrder2LimiterKernel( unsigned long nCells,
+                                         unsigned long localNSys,
+                                         unsigned long nNbr,
+                                         unsigned long nDim,
+                                         const double* __restrict__ areas,
+                                         const unsigned* __restrict__ neighbors,
+                                         const int* __restrict__ boundaryTypes,
+                                         const double* __restrict__ relativeCellVertices,
+                                         const double* __restrict__ sol,
+                                         const double* __restrict__ solDx,
+                                         double* limiter,
+                                         int noneBoundary,
+                                         double eps ) {
+    const unsigned long linearIdx = static_cast<unsigned long>( blockIdx.x ) * blockDim.x + threadIdx.x;
+    const unsigned long nCellSys  = nCells * localNSys;
+    if( linearIdx >= nCellSys ) return;
+
+    const unsigned long idxCell = linearIdx / localNSys;
+    const unsigned long idxSys  = linearIdx % localNSys;
+    if( boundaryTypes[idxCell] != noneBoundary ) return;
+
+    const unsigned long solIdx = Idx2DDevice( idxCell, idxSys, localNSys );
+    const double thisSol       = sol[solIdx];
+
+    double minSol = thisSol;
+    double maxSol = thisSol;
+    for( unsigned long idxNbr = 0; idxNbr < nNbr; ++idxNbr ) {
+        const unsigned nbrCell = neighbors[Idx2DDevice( idxCell, idxNbr, nNbr )];
+        const double nbrSol    = sol[Idx2DDevice( static_cast<unsigned long>( nbrCell ), idxSys, localNSys )];
+        maxSol                 = fmax( maxSol, nbrSol );
+        minSol                 = fmin( minSol, nbrSol );
+    }
+
+    double limVal   = 1.0;
+    const double d  = areas[idxCell];
+    const double sx = solDx[Idx3DDevice( idxCell, idxSys, 0, localNSys, nDim )];
+    const double sy = solDx[Idx3DDevice( idxCell, idxSys, 1, localNSys, nDim )];
+
+    for( unsigned long idxNbr = 0; idxNbr < nNbr; ++idxNbr ) {
+        const double gaussPoint = sx * relativeCellVertices[Idx3DDevice( idxCell, idxNbr, 0, nNbr, nDim )] +
+                                  sy * relativeCellVertices[Idx3DDevice( idxCell, idxNbr, 1, nNbr, nDim )];
+
+        const double delta1Max = maxSol - thisSol;
+        const double delta1Min = minSol - thisSol;
+        double r               = 1.0;
+        if( gaussPoint > 0.0 ) {
+            r = ( ( delta1Max * delta1Max + d ) * gaussPoint + 2.0 * gaussPoint * gaussPoint * delta1Max ) /
+                ( delta1Max * delta1Max + 2.0 * gaussPoint * gaussPoint + delta1Max * gaussPoint + d ) / ( gaussPoint + eps );
+        }
+        else {
+            r = ( ( delta1Min * delta1Min + d ) * gaussPoint + 2.0 * gaussPoint * gaussPoint * delta1Min ) /
+                ( delta1Min * delta1Min + 2.0 * gaussPoint * gaussPoint + delta1Min * gaussPoint + d ) / ( gaussPoint - eps );
+        }
+        if( fabs( gaussPoint ) < eps ) {
+            r = 1.0;
+        }
+        limVal = fmin( limVal, r );
+    }
+    limiter[solIdx] = limVal;
+}
+
+__global__ void FluxOrder2Kernel( unsigned long nCells,
+                                  unsigned long localNSys,
+                                  unsigned long nNbr,
+                                  unsigned long nDim,
+                                  const double* __restrict__ quadPts,
+                                  const double* __restrict__ normals,
+                                  const unsigned* __restrict__ neighbors,
+                                  const int* __restrict__ boundaryTypes,
+                                  const double* __restrict__ ghostCellValues,
+                                  const double* __restrict__ interfaceMidPoints,
+                                  const double* __restrict__ cellMidPoints,
+                                  const double* __restrict__ relativeInterfaceMid,
+                                  const double* __restrict__ sol,
+                                  const double* __restrict__ solDx,
+                                  const double* __restrict__ limiter,
+                                  double* flux,
+                                  int neumannBoundary ) {
+    const unsigned long linearIdx = static_cast<unsigned long>( blockIdx.x ) * blockDim.x + threadIdx.x;
+    const unsigned long nCellSys  = nCells * localNSys;
+    if( linearIdx >= nCellSys ) return;
+
+    const unsigned long idxCell = linearIdx / localNSys;
+    const unsigned long idxSys  = linearIdx % localNSys;
+
+    double localFlux = 0.0;
+    for( unsigned long idxNbr = 0; idxNbr < nNbr; ++idxNbr ) {
+        const unsigned nbrCell  = neighbors[Idx2DDevice( idxCell, idxNbr, nNbr )];
+        const double localInner = quadPts[Idx2DDevice( idxSys, 0, nDim )] * normals[Idx3DDevice( idxCell, idxNbr, 0, nNbr, nDim )] +
+                                  quadPts[Idx2DDevice( idxSys, 1, nDim )] * normals[Idx3DDevice( idxCell, idxNbr, 1, nNbr, nDim )];
+
+        if( boundaryTypes[idxCell] == neumannBoundary && nbrCell == nCells ) {
+            localFlux += ( localInner > 0.0 ) ? localInner * sol[Idx2DDevice( idxCell, idxSys, localNSys )]
+                                              : localInner * ghostCellValues[Idx2DDevice( idxCell, idxSys, localNSys )];
+            continue;
+        }
+
+        if( localInner > 0.0 ) {
+            const double recVal =
+                sol[Idx2DDevice( idxCell, idxSys, localNSys )] +
+                limiter[Idx2DDevice( idxCell, idxSys, localNSys )] * ( solDx[Idx3DDevice( idxCell, idxSys, 0, localNSys, nDim )] *
+                                                                           relativeInterfaceMid[Idx3DDevice( idxCell, idxNbr, 0, nNbr, nDim )] +
+                                                                       solDx[Idx3DDevice( idxCell, idxSys, 1, localNSys, nDim )] *
+                                                                           relativeInterfaceMid[Idx3DDevice( idxCell, idxNbr, 1, nNbr, nDim )] );
+            localFlux += localInner * recVal;
+        }
+        else {
+            const unsigned long nbrCellUL = static_cast<unsigned long>( nbrCell );
+            const double recVal =
+                sol[Idx2DDevice( nbrCellUL, idxSys, localNSys )] +
+                limiter[Idx2DDevice( nbrCellUL, idxSys, localNSys )] *
+                    ( solDx[Idx3DDevice( nbrCellUL, idxSys, 0, localNSys, nDim )] *
+                          ( interfaceMidPoints[Idx3DDevice( idxCell, idxNbr, 0, nNbr, nDim )] - cellMidPoints[Idx2DDevice( nbrCellUL, 0, nDim )] ) +
+                      solDx[Idx3DDevice( nbrCellUL, idxSys, 1, localNSys, nDim )] *
+                          ( interfaceMidPoints[Idx3DDevice( idxCell, idxNbr, 1, nNbr, nDim )] - cellMidPoints[Idx2DDevice( nbrCellUL, 1, nDim )] ) );
+            localFlux += localInner * recVal;
+        }
+    }
+
+    flux[Idx2DDevice( idxCell, idxSys, localNSys )] = localFlux;
+}
+
+__global__ void FVMUpdateKernel( unsigned long nCells,
+                                 unsigned long localNSys,
+                                 double dT,
+                                 const double* __restrict__ sigmaS,
+                                 const double* __restrict__ sigmaT,
+                                 const double* __restrict__ source,
+                                 const double* __restrict__ areas,
+                                 const double* __restrict__ quadWeights,
+                                 const double* __restrict__ flux,
+                                 double* sol,
+                                 double* scalarFlux,
+                                 double invTwoPi ) {
+    const unsigned long idxCell = static_cast<unsigned long>( blockIdx.x ) * blockDim.x + threadIdx.x;
+    if( idxCell >= nCells ) return;
+
+    const double scattering = sigmaS[idxCell] * scalarFlux[idxCell] * invTwoPi;
+    double localScalarFlux  = 0.0;
+
+    for( unsigned long idxSys = 0; idxSys < localNSys; ++idxSys ) {
+        const unsigned long idx = Idx2DDevice( idxCell, idxSys, localNSys );
+        double newVal           = ( 1.0 - dT * sigmaT[idxCell] ) * sol[idx] - dT / areas[idxCell] * flux[idx] + dT * ( scattering + source[idx] );
+        newVal                  = fmax( newVal, 0.0 );
+        sol[idx]                = newVal;
+        localScalarFlux += newVal * quadWeights[idxSys];
+    }
+    scalarFlux[idxCell] = localScalarFlux;
+}
+
+__global__ void RK2AverageAndScalarFluxKernel( unsigned long nCells,
+                                               unsigned long localNSys,
+                                               const double* __restrict__ quadWeights,
+                                               const double* __restrict__ solRK0,
+                                               double* sol,
+                                               double* scalarFlux ) {
+    const unsigned long idxCell = static_cast<unsigned long>( blockIdx.x ) * blockDim.x + threadIdx.x;
+    if( idxCell >= nCells ) return;
+
+    double localScalarFlux = 0.0;
+    for( unsigned long idxSys = 0; idxSys < localNSys; ++idxSys ) {
+        const unsigned long idx = Idx2DDevice( idxCell, idxSys, localNSys );
+        const double avgVal     = 0.5 * ( solRK0[idx] + sol[idx] );
+        sol[idx]                = avgVal;
+        localScalarFlux += avgVal * quadWeights[idxSys];
+    }
+    scalarFlux[idxCell] = localScalarFlux;
+}
+
+}    // namespace
+
+SNSolverHPCHIP::SNSolverHPCHIP( Config* settings ) {
+#ifdef IMPORT_MPI
+    // Initialize MPI
+    MPI_Comm_size( MPI_COMM_WORLD, &_numProcs );
+    MPI_Comm_rank( MPI_COMM_WORLD, &_rank );
+#endif
+#ifndef IMPORT_MPI
+    _numProcs = 1;    // default values
+    _rank     = 0;
+#endif
+    _settings       = settings;
+    _currTime       = 0.0;
+    _idx_start_iter = 0;
+    _nOutputMoments = 2;    //  Currently only u_1 (x direction) and u_1 (y direction) are supported
+    _hipInitialized = false;
+    _hipDeviceId    = 0;
+    _device         = nullptr;
+    // Create Mesh
+    _mesh = LoadSU2MeshFromFile( settings );
+    _settings->SetNCells( _mesh->GetNumCells() );
+    auto quad = QuadratureBase::Create( settings );
+    _settings->SetNQuadPoints( quad->GetNq() );
+
+    _nCells = static_cast<unsigned long>( _mesh->GetNumCells() );
+    _nNbr   = static_cast<unsigned long>( _mesh->GetNumNodesPerCell() );
+    _nDim   = static_cast<unsigned long>( _mesh->GetDim() );
+    _nNodes = static_cast<unsigned long>( _mesh->GetNumNodes() );
+
+    _nq   = static_cast<unsigned long>( quad->GetNq() );
+    _nSys = _nq;
+
+    if( static_cast<unsigned long>( _numProcs ) > _nq ) {
+        ErrorMessages::Error( "The number of processors must be less than or equal to the number of quadrature points.", CURRENT_FUNCTION );
+    }
+
+    const unsigned long numRanks  = static_cast<unsigned long>( _numProcs );
+    const unsigned long rankIndex = static_cast<unsigned long>( _rank );
+    const unsigned long baseChunk = _nSys / numRanks;
+    const unsigned long remainder = _nSys % numRanks;
+
+    _localNSys   = baseChunk + ( rankIndex < remainder ? 1UL : 0UL );
+    _startSysIdx = rankIndex * baseChunk + std::min( rankIndex, remainder );
+    _endSysIdx   = _startSysIdx + _localNSys;
+
+    // std::cout << "Rank: " << _rank << " startSysIdx: " << _startSysIdx << " endSysIdx: " << _endSysIdx <<  " localNSys: " << _localNSys <<
+    // std::endl;
+
+    _spatialOrder  = _settings->GetSpatialOrder();
+    _temporalOrder = _settings->GetTemporalOrder();
+
+    _areas                  = std::vector<double>( _nCells );
+    _normals                = std::vector<double>( _nCells * _nNbr * _nDim );
+    _neighbors              = std::vector<unsigned>( _nCells * _nNbr );
+    _cellMidPoints          = std::vector<double>( _nCells * _nDim );
+    _interfaceMidPoints     = std::vector<double>( _nCells * _nNbr * _nDim );
+    _cellBoundaryTypes      = std::vector<BOUNDARY_TYPE>( _nCells );
+    _relativeInterfaceMidPt = std::vector<double>( _nCells * _nNbr * _nDim );
+    _relativeCellVertices   = std::vector<double>( _nCells * _nNbr * _nDim );
+
+    // Slope
+    _solDx   = std::vector<double>( _nCells * _localNSys * _nDim, 0.0 );
+    _limiter = std::vector<double>( _nCells * _localNSys, 0.0 );
+
+    // Physics
+    _sigmaS = std::vector<double>( _nCells );
+    _sigmaT = std::vector<double>( _nCells );
+    _source = std::vector<double>( _nCells * _localNSys );
+
+    // Quadrature
+    _quadPts     = std::vector<double>( _localNSys * _nDim );
+    _quadWeights = std::vector<double>( _localNSys );
+
+    // Solution
+    _sol  = std::vector<double>( _nCells * _localNSys );
+    _flux = std::vector<double>( _nCells * _localNSys );
+
+    _scalarFlux              = std::vector<double>( _nCells );
+    _scalarFluxPrevIter      = std::vector<double>( _nCells );
+    _localMaxOrdinateOutflow = std::vector<double>( _nCells );
+
+    auto areas           = _mesh->GetCellAreas();
+    auto neighbors       = _mesh->GetNeighbours();
+    auto normals         = _mesh->GetNormals();
+    auto cellMidPts      = _mesh->GetCellMidPoints();
+    auto interfaceMidPts = _mesh->GetInterfaceMidPoints();
+    auto boundaryTypes   = _mesh->GetBoundaryTypes();
+    auto nodes           = _mesh->GetNodes();
+    auto cells           = _mesh->GetCells();
+
+#pragma omp parallel for
+    for( unsigned long idx_cell = 0; idx_cell < _nCells; idx_cell++ ) {
+        _areas[idx_cell] = areas[idx_cell];
+    }
+
+    _dT    = ComputeTimeStep( _settings->GetCFL() );
+    _nIter = unsigned( _settings->GetTEnd() / _dT ) + 1;
+
+    auto quadPoints  = quad->GetPoints();
+    auto quadWeights = quad->GetWeights();
+
+    _problem    = ProblemBase::Create( _settings, _mesh, quad );
+    auto sigmaT = _problem->GetTotalXS( Vector( _nIter, 0.0 ) );
+    auto sigmaS = _problem->GetScatteringXS( Vector( _nIter, 0.0 ) );
+    auto source = _problem->GetExternalSource( Vector( _nIter, 0.0 ) );
+
+    // Copy to everything to solver
+    _mass = 0;
+#pragma omp parallel for
+    for( unsigned long idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) {
+        for( unsigned long idx_dim = 0; idx_dim < _nDim; idx_dim++ ) {
+            _quadPts[Idx2D( idx_sys, idx_dim, _nDim )] = quadPoints[idx_sys + _startSysIdx][idx_dim];
+        }
+        if( _settings->GetQuadName() == QUAD_GaussLegendreTensorized2D ) {
+            _quadWeights[idx_sys] =
+                2.0 * quadWeights[idx_sys + _startSysIdx];    // Rescaling of quadweights TODO: Check if this needs general refactoring
+        }
+        else {
+            _quadWeights[idx_sys] = quadWeights[idx_sys + _startSysIdx];    // Rescaling of quadweights TODO: Check if this needs general refactoring}
+        }
+    }
+
+#pragma omp parallel for
+    for( unsigned long idx_cell = 0; idx_cell < _nCells; idx_cell++ ) {
+        _cellBoundaryTypes[idx_cell] = boundaryTypes[idx_cell];
+
+        for( unsigned long idx_dim = 0; idx_dim < _nDim; idx_dim++ ) {
+            _cellMidPoints[Idx2D( idx_cell, idx_dim, _nDim )] = cellMidPts[idx_cell][idx_dim];
+        }
+
+        for( unsigned long idx_nbr = 0; idx_nbr < _nNbr; idx_nbr++ ) {
+
+            _neighbors[Idx2D( idx_cell, idx_nbr, _nNbr )] = neighbors[idx_cell][idx_nbr];
+
+            for( unsigned long idx_dim = 0; idx_dim < _nDim; idx_dim++ ) {
+                _normals[Idx3D( idx_cell, idx_nbr, idx_dim, _nNbr, _nDim )]            = normals[idx_cell][idx_nbr][idx_dim];
+                _interfaceMidPoints[Idx3D( idx_cell, idx_nbr, idx_dim, _nNbr, _nDim )] = interfaceMidPts[idx_cell][idx_nbr][idx_dim];
+                _relativeInterfaceMidPt[Idx3D( idx_cell, idx_nbr, idx_dim, _nNbr, _nDim )] =
+                    _interfaceMidPoints[Idx3D( idx_cell, idx_nbr, idx_dim, _nNbr, _nDim )] - _cellMidPoints[Idx2D( idx_cell, idx_dim, _nDim )];
+                _relativeCellVertices[Idx3D( idx_cell, idx_nbr, idx_dim, _nNbr, _nDim )] =
+                    nodes[cells[idx_cell][idx_nbr]][idx_dim] - cellMidPts[idx_cell][idx_dim];
+            }
+        }
+
+        _sigmaS[idx_cell]     = sigmaS[0][idx_cell];
+        _sigmaT[idx_cell]     = sigmaT[0][idx_cell];
+        _scalarFlux[idx_cell] = 0;
+
+        for( unsigned long idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) {
+            _source[Idx2D( idx_cell, idx_sys, _localNSys )] = source[0][idx_cell][0];    // CAREFUL HERE hardcoded to isotropic source
+            _sol[Idx2D( idx_cell, idx_sys, _localNSys )]    = 0.0;                       // initial condition is zero
+
+            _scalarFlux[idx_cell] += _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys];
+        }
+        // _mass += _scalarFlux[idx_cell] * _areas[idx_cell];
+    }
+
+    // Lattice
+    {
+        _curAbsorptionLattice    = 0;
+        _totalAbsorptionLattice  = 0;
+        _curMaxAbsorptionLattice = 0;
+        _curScalarOutflow        = 0;
+        _totalScalarOutflow      = 0;
+        _curMaxOrdinateOutflow   = 0;
+        _curScalarOutflowPeri1   = 0;
+        _totalScalarOutflowPeri1 = 0;
+        _curScalarOutflowPeri2   = 0;
+        _totalScalarOutflowPeri2 = 0;
+        ComputeCellsPerimeterLattice();
+    }
+    // Hohlraum
+    {
+        _totalAbsorptionHohlraumCenter     = 0;
+        _totalAbsorptionHohlraumVertical   = 0;
+        _totalAbsorptionHohlraumHorizontal = 0;
+        _curAbsorptionHohlraumCenter       = 0;
+        _curAbsorptionHohlraumVertical     = 0;
+        _curAbsorptionHohlraumHorizontal   = 0;
+        _varAbsorptionHohlraumGreen        = 0;
+    }
+
+    if( _settings->GetLoadRestartSolution() )
+        _idx_start_iter = LoadRestartSolution( _settings->GetOutputFile(),
+                                               _sol,
+                                               _scalarFlux,
+                                               _rank,
+                                               _nCells,
+                                               _totalAbsorptionHohlraumCenter,
+                                               _totalAbsorptionHohlraumVertical,
+                                               _totalAbsorptionHohlraumHorizontal,
+                                               _totalAbsorptionLattice ) +
+                          1;
+
+#ifdef IMPORT_MPI
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+    SetGhostCells();
+    InitHIP();
+    UploadStaticDataToDevice();
+    UploadStateToDevice();
+
+    if( _rank == 0 ) {
+        PrepareScreenOutput();     // Screen Output
+        PrepareHistoryOutput();    // History Output
+    }
+#ifdef IMPORT_MPI
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+    delete quad;
+
+    // Initialiye QOIS
+    _mass    = 0;
+    _rmsFlux = 0;
+
+    {    // Hohlraum
+
+        unsigned n_probes = 4;
+        if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) n_probes = 4;
+        // if( _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) n_probes = 2;
+
+        _probingMoments = std::vector<double>( n_probes * 3, 0. );    // 10 time horizons
+
+        if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) {
+            _probingCellsHohlraum = {
+                _mesh->GetCellsofBall( -0.4, 0., 0.01 ),
+                _mesh->GetCellsofBall( 0.4, 0., 0.01 ),
+                _mesh->GetCellsofBall( 0., -0.5, 0.01 ),
+                _mesh->GetCellsofBall( 0., 0.5, 0.01 ),
+            };
+        }
+        // else if( _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) {
+        //     _probingCellsHohlraum = {
+        //         _mesh->GetCellsofBall( 0.4, 0., 0.01 ),
+        //         _mesh->GetCellsofBall( 0., 0.5, 0.01 ),
+        //     };
+        // }
+
+        // Green
+        _thicknessGreen = 0.05;
+
+        if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) {
+            _centerGreen           = { _settings->GetPosXCenterGreenHohlraum(), _settings->GetPosYCenterGreenHohlraum() };
+            _cornerUpperLeftGreen  = { -0.2 + _centerGreen[0], 0.4 + _centerGreen[1] };
+            _cornerLowerLeftGreen  = { -0.2 + _centerGreen[0], -0.4 + _centerGreen[1] };
+            _cornerUpperRightGreen = { 0.2 + _centerGreen[0], 0.4 + _centerGreen[1] };
+            _cornerLowerRightGreen = { 0.2 + _centerGreen[0], -0.4 + _centerGreen[1] };
+        }
+        // else {
+        //     _centerGreen           = { 0.0, 0.0 };
+        //     _cornerUpperLeftGreen  = { 0., 0.4 - _thicknessGreen / 2.0 };
+        //     _cornerLowerLeftGreen  = { 0., +_thicknessGreen / 2.0 };
+        //     _cornerUpperRightGreen = { 0.2 - _thicknessGreen / 2.0, 0.4 - _thicknessGreen / 2.0 };
+        //     _cornerLowerRightGreen = { 0.2 - _thicknessGreen / 2.0, 0. + _thicknessGreen / 2.0 };
+        // }
+
+        _nProbingCellsLineGreen = _settings->GetNumProbingCellsLineHohlraum();
+
+        _nProbingCellsBlocksGreen  = 44;
+        _absorptionValsBlocksGreen = std::vector<double>( _nProbingCellsBlocksGreen, 0. );
+        _absorptionValsLineSegment = std::vector<double>( _nProbingCellsLineGreen, 0.0 );
+
+        SetProbingCellsLineGreen();    // ONLY FOR HOHLRAUM
+    }
+}
+
+void SNSolverHPCHIP::InitHIP() {
+    int nDevices = 0;
+    CheckHip( hipGetDeviceCount( &nDevices ), "hipGetDeviceCount" );
+    if( nDevices < 1 ) {
+        ErrorMessages::Error( "No HIP-capable GPU detected, but SNSolverHPCHIP was requested.", CURRENT_FUNCTION );
+    }
+
+    int localRank = 0;
+    int localSize = 1;
+#ifdef IMPORT_MPI
+    MPI_Comm localComm = MPI_COMM_NULL;
+    MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, _rank, MPI_INFO_NULL, &localComm );
+    MPI_Comm_rank( localComm, &localRank );
+    MPI_Comm_size( localComm, &localSize );
+    MPI_Comm_free( &localComm );
+#endif
+
+    _hipDeviceId = localRank % nDevices;
+    CheckHip( hipSetDevice( _hipDeviceId ), "hipSetDevice" );
+
+    if( _rank == 0 ) {
+        auto log = spdlog::get( "event" );
+        if( log ) {
+            log->info( "| HIP backend: {} local MPI rank(s), {} visible HIP device(s).", localSize, nDevices );
+            if( localSize > nDevices ) {
+                log->warn( "| HIP backend: {} local MPI rank(s) exceed {} visible device(s); GPUs will be shared.", localSize, nDevices );
+            }
+        }
+    }
+
+    _device = new DeviceBuffers();
+
+    const std::size_t nCells      = static_cast<std::size_t>( _nCells );
+    const std::size_t nCellNbr    = nCells * static_cast<std::size_t>( _nNbr );
+    const std::size_t nCellNbrDim = nCellNbr * static_cast<std::size_t>( _nDim );
+    const std::size_t nCellSys    = nCells * static_cast<std::size_t>( _localNSys );
+    const std::size_t nSysDim     = static_cast<std::size_t>( _localNSys ) * static_cast<std::size_t>( _nDim );
+
+    AllocateDeviceArray( &_device->areas, nCells, "areas" );
+    AllocateDeviceArray( &_device->normals, nCellNbrDim, "normals" );
+    AllocateDeviceArray( &_device->neighbors, nCellNbr, "neighbors" );
+    AllocateDeviceArray( &_device->boundaryTypes, nCells, "boundaryTypes" );
+    AllocateDeviceArray( &_device->ghostCellValues, nCellSys, "ghostCellValues" );
+    AllocateDeviceArray( &_device->cellMidPoints, nCells * static_cast<std::size_t>( _nDim ), "cellMidPoints" );
+    AllocateDeviceArray( &_device->interfaceMidPoints, nCellNbrDim, "interfaceMidPoints" );
+    AllocateDeviceArray( &_device->relativeInterfaceMid, nCellNbrDim, "relativeInterfaceMid" );
+    AllocateDeviceArray( &_device->relativeCellVertices, nCellNbrDim, "relativeCellVertices" );
+
+    AllocateDeviceArray( &_device->sigmaS, nCells, "sigmaS" );
+    AllocateDeviceArray( &_device->sigmaT, nCells, "sigmaT" );
+    AllocateDeviceArray( &_device->source, nCellSys, "source" );
+    AllocateDeviceArray( &_device->quadPts, nSysDim, "quadPts" );
+    AllocateDeviceArray( &_device->quadWeights, static_cast<std::size_t>( _localNSys ), "quadWeights" );
+
+    AllocateDeviceArray( &_device->sol, nCellSys, "sol" );
+    AllocateDeviceArray( &_device->solRK0, nCellSys, "solRK0" );
+    AllocateDeviceArray( &_device->flux, nCellSys, "flux" );
+    AllocateDeviceArray( &_device->scalarFlux, nCells, "scalarFlux" );
+    AllocateDeviceArray( &_device->solDx, nCellSys * static_cast<std::size_t>( _nDim ), "solDx" );
+    AllocateDeviceArray( &_device->limiter, nCellSys, "limiter" );
+
+    _hipInitialized = true;
+}
+
+void SNSolverHPCHIP::UploadStaticDataToDevice() {
+    if( !_hipInitialized || _device == nullptr ) {
+        ErrorMessages::Error( "HIP backend was not initialized before UploadStaticDataToDevice.", CURRENT_FUNCTION );
+    }
+
+    _cellBoundaryTypesInt.resize( _nCells, static_cast<int>( BOUNDARY_TYPE::INVALID ) );
+#pragma omp parallel for
+    for( unsigned long idxCell = 0; idxCell < _nCells; ++idxCell ) {
+        _cellBoundaryTypesInt[idxCell] = static_cast<int>( _cellBoundaryTypes[idxCell] );
+    }
+
+    _ghostCellValues.assign( static_cast<std::size_t>( _nCells ) * static_cast<std::size_t>( _localNSys ), 0.0 );
+    for( const auto& cellEntry : _ghostCells ) {
+        const unsigned idxCell             = cellEntry.first;
+        const std::vector<double>& values  = cellEntry.second;
+        const unsigned long nGhostOrdinate = std::min<unsigned long>( _localNSys, static_cast<unsigned long>( values.size() ) );
+        for( unsigned long idxSys = 0; idxSys < nGhostOrdinate; ++idxSys ) {
+            _ghostCellValues[Idx2D( idxCell, idxSys, _localNSys )] = values[idxSys];
+        }
+    }
+
+    const std::size_t nCells      = static_cast<std::size_t>( _nCells );
+    const std::size_t nCellNbr    = nCells * static_cast<std::size_t>( _nNbr );
+    const std::size_t nCellNbrDim = nCellNbr * static_cast<std::size_t>( _nDim );
+    const std::size_t nCellSys    = nCells * static_cast<std::size_t>( _localNSys );
+    const std::size_t nSysDim     = static_cast<std::size_t>( _localNSys ) * static_cast<std::size_t>( _nDim );
+
+    CheckHip( hipMemcpy( _device->areas, _areas.data(), nCells * sizeof( double ), hipMemcpyHostToDevice ), "copy areas" );
+    CheckHip( hipMemcpy( _device->normals, _normals.data(), nCellNbrDim * sizeof( double ), hipMemcpyHostToDevice ), "copy normals" );
+    CheckHip( hipMemcpy( _device->neighbors, _neighbors.data(), nCellNbr * sizeof( unsigned ), hipMemcpyHostToDevice ), "copy neighbors" );
+    CheckHip( hipMemcpy( _device->boundaryTypes, _cellBoundaryTypesInt.data(), nCells * sizeof( int ), hipMemcpyHostToDevice ),
+              "copy boundary types" );
+    CheckHip( hipMemcpy( _device->ghostCellValues, _ghostCellValues.data(), nCellSys * sizeof( double ), hipMemcpyHostToDevice ),
+              "copy ghost values" );
+    CheckHip(
+        hipMemcpy(
+            _device->cellMidPoints, _cellMidPoints.data(), nCells * static_cast<std::size_t>( _nDim ) * sizeof( double ), hipMemcpyHostToDevice ),
+        "copy cell midpoints" );
+    CheckHip( hipMemcpy( _device->interfaceMidPoints, _interfaceMidPoints.data(), nCellNbrDim * sizeof( double ), hipMemcpyHostToDevice ),
+              "copy interface midpoints" );
+    CheckHip( hipMemcpy( _device->relativeInterfaceMid, _relativeInterfaceMidPt.data(), nCellNbrDim * sizeof( double ), hipMemcpyHostToDevice ),
+              "copy relative interface points" );
+    CheckHip( hipMemcpy( _device->relativeCellVertices, _relativeCellVertices.data(), nCellNbrDim * sizeof( double ), hipMemcpyHostToDevice ),
+              "copy relative vertices" );
+    CheckHip( hipMemcpy( _device->sigmaS, _sigmaS.data(), nCells * sizeof( double ), hipMemcpyHostToDevice ), "copy sigmaS" );
+    CheckHip( hipMemcpy( _device->sigmaT, _sigmaT.data(), nCells * sizeof( double ), hipMemcpyHostToDevice ), "copy sigmaT" );
+    CheckHip( hipMemcpy( _device->source, _source.data(), nCellSys * sizeof( double ), hipMemcpyHostToDevice ), "copy source" );
+    CheckHip( hipMemcpy( _device->quadPts, _quadPts.data(), nSysDim * sizeof( double ), hipMemcpyHostToDevice ), "copy quad points" );
+    CheckHip(
+        hipMemcpy( _device->quadWeights, _quadWeights.data(), static_cast<std::size_t>( _localNSys ) * sizeof( double ), hipMemcpyHostToDevice ),
+        "copy quad weights" );
+
+    CheckHip( hipMemset( _device->flux, 0, nCellSys * sizeof( double ) ), "init flux" );
+    CheckHip( hipMemcpy( _device->solDx, _solDx.data(), nCellSys * static_cast<std::size_t>( _nDim ) * sizeof( double ), hipMemcpyHostToDevice ),
+              "init solDx" );
+    CheckHip( hipMemcpy( _device->limiter, _limiter.data(), nCellSys * sizeof( double ), hipMemcpyHostToDevice ), "init limiter" );
+}
+
+void SNSolverHPCHIP::UploadStateToDevice() {
+    if( !_hipInitialized || _device == nullptr ) {
+        ErrorMessages::Error( "HIP backend was not initialized before UploadStateToDevice.", CURRENT_FUNCTION );
+    }
+    const std::size_t nCells   = static_cast<std::size_t>( _nCells );
+    const std::size_t nCellSys = nCells * static_cast<std::size_t>( _localNSys );
+
+    CheckHip( hipMemcpy( _device->sol, _sol.data(), nCellSys * sizeof( double ), hipMemcpyHostToDevice ), "upload sol" );
+    CheckHip( hipMemcpy( _device->scalarFlux, _scalarFlux.data(), nCells * sizeof( double ), hipMemcpyHostToDevice ), "upload scalar flux" );
+}
+
+void SNSolverHPCHIP::DownloadStateFromDevice() {
+    if( !_hipInitialized || _device == nullptr ) {
+        ErrorMessages::Error( "HIP backend was not initialized before DownloadStateFromDevice.", CURRENT_FUNCTION );
+    }
+    const std::size_t nCells   = static_cast<std::size_t>( _nCells );
+    const std::size_t nCellSys = nCells * static_cast<std::size_t>( _localNSys );
+
+    CheckHip( hipMemcpy( _sol.data(), _device->sol, nCellSys * sizeof( double ), hipMemcpyDeviceToHost ), "download sol" );
+    CheckHip( hipMemcpy( _scalarFlux.data(), _device->scalarFlux, nCells * sizeof( double ), hipMemcpyDeviceToHost ), "download scalar flux" );
+}
+
+void SNSolverHPCHIP::FreeHIP() {
+    if( !_hipInitialized || _device == nullptr ) {
+        return;
+    }
+
+    CheckHip( hipSetDevice( _hipDeviceId ), "hipSetDevice" );
+
+    FreeDeviceArray( _device->areas, "areas" );
+    FreeDeviceArray( _device->normals, "normals" );
+    FreeDeviceArray( _device->neighbors, "neighbors" );
+    FreeDeviceArray( _device->boundaryTypes, "boundaryTypes" );
+    FreeDeviceArray( _device->ghostCellValues, "ghostCellValues" );
+    FreeDeviceArray( _device->cellMidPoints, "cellMidPoints" );
+    FreeDeviceArray( _device->interfaceMidPoints, "interfaceMidPoints" );
+    FreeDeviceArray( _device->relativeInterfaceMid, "relativeInterfaceMid" );
+    FreeDeviceArray( _device->relativeCellVertices, "relativeCellVertices" );
+    FreeDeviceArray( _device->sigmaS, "sigmaS" );
+    FreeDeviceArray( _device->sigmaT, "sigmaT" );
+    FreeDeviceArray( _device->source, "source" );
+    FreeDeviceArray( _device->quadPts, "quadPts" );
+    FreeDeviceArray( _device->quadWeights, "quadWeights" );
+    FreeDeviceArray( _device->sol, "sol" );
+    FreeDeviceArray( _device->solRK0, "solRK0" );
+    FreeDeviceArray( _device->flux, "flux" );
+    FreeDeviceArray( _device->scalarFlux, "scalarFlux" );
+    FreeDeviceArray( _device->solDx, "solDx" );
+    FreeDeviceArray( _device->limiter, "limiter" );
+
+    delete _device;
+    _device         = nullptr;
+    _hipInitialized = false;
+}
+
+SNSolverHPCHIP::~SNSolverHPCHIP() {
+    FreeHIP();
+    delete _mesh;
+    delete _problem;
+}
+
+void SNSolverHPCHIP::Solve() {
+
+    // --- Preprocessing ---
+    if( _rank == 0 ) {
+        PrepareVolumeOutput();
+        DrawPreSolverOutput();
+    }
+    // On restart, continue simulation time from the loaded iteration index.
+    _curSimTime = static_cast<double>( _idx_start_iter ) * _dT;
+    auto start  = std::chrono::high_resolution_clock::now();    // Start timing
+
+    std::chrono::duration<double> duration;
+    // Loop over energies (pseudo-time of continuous slowing down approach)
+    for( unsigned iter = (unsigned)_idx_start_iter; iter < _nIter; iter++ ) {
+
+        if( iter == _nIter - 1 ) {    // last iteration
+            _dT = _settings->GetTEnd() - iter * _dT;
+        }
+        _scalarFluxPrevIter = _scalarFlux;
+
+        if( _temporalOrder == 2 ) {
+            CheckHip( hipSetDevice( _hipDeviceId ), "hipSetDevice" );
+            CheckHip( hipMemcpy( _device->solRK0,
+                                 _device->sol,
+                                 static_cast<std::size_t>( _nCells ) * static_cast<std::size_t>( _localNSys ) * sizeof( double ),
+                                 hipMemcpyDeviceToDevice ),
+                      "backup solRK0" );
+
+            ( _spatialOrder == 2 ) ? FluxOrder2() : FluxOrder1();
+            FVMUpdate();
+            ( _spatialOrder == 2 ) ? FluxOrder2() : FluxOrder1();
+            FVMUpdate();
+
+            const dim3 threads( HIP_BLOCK_SIZE );
+            const dim3 gridCells( static_cast<unsigned>( ( _nCells + HIP_BLOCK_SIZE - 1 ) / HIP_BLOCK_SIZE ) );
+            RK2AverageAndScalarFluxKernel<<<gridCells, threads>>>(
+                _nCells, _localNSys, _device->quadWeights, _device->solRK0, _device->sol, _device->scalarFlux );
+            CheckHip( hipGetLastError(), "RK2AverageAndScalarFluxKernel launch" );
+#ifdef IMPORT_MPI
+            CheckHip(
+                hipMemcpy( _scalarFlux.data(), _device->scalarFlux, static_cast<std::size_t>( _nCells ) * sizeof( double ), hipMemcpyDeviceToHost ),
+                "download scalar flux after RK2 average" );
+            std::vector<double> tempScalarFlux( _scalarFlux );
+            MPI_Barrier( MPI_COMM_WORLD );
+            MPI_Allreduce( tempScalarFlux.data(), _scalarFlux.data(), _nCells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+            MPI_Barrier( MPI_COMM_WORLD );
+            CheckHip(
+                hipMemcpy( _device->scalarFlux, _scalarFlux.data(), static_cast<std::size_t>( _nCells ) * sizeof( double ), hipMemcpyHostToDevice ),
+                "sync allreduced scalar flux after RK2 average" );
+#endif
+        }
+        else {
+            ( _spatialOrder == 2 ) ? FluxOrder2() : FluxOrder1();
+            FVMUpdate();
+        }
+
+        DownloadStateFromDevice();
+
+        _curSimTime += _dT;
+        IterPostprocessing();
+        // --- Wall time measurement
+        duration  = std::chrono::high_resolution_clock::now() - start;
+        _currTime = std::chrono::duration_cast<std::chrono::duration<double>>( duration ).count();
+
+        // --- Write Output ---
+        if( _rank == 0 ) {
+
+            WriteScalarOutput( iter );
+
+            // --- Update Scalar Fluxes
+
+            // --- Print Output ---
+            PrintScreenOutput( iter );
+            PrintHistoryOutput( iter );
+        }
+#ifdef IMPORT_MPI
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        PrintVolumeOutput( iter );
+#ifdef IMPORT_MPI
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+    }
+    // --- Postprocessing ---
+    if( _rank == 0 ) {
+        DrawPostSolverOutput();
+    }
+}
+
+void SNSolverHPCHIP::FluxOrder2() {
+    CheckHip( hipSetDevice( _hipDeviceId ), "hipSetDevice" );
+    const unsigned long nCellSys = _nCells * _localNSys;
+    const dim3 threads( HIP_BLOCK_SIZE );
+    const dim3 gridCellSys( static_cast<unsigned>( ( nCellSys + HIP_BLOCK_SIZE - 1 ) / HIP_BLOCK_SIZE ) );
+    const double eps = 1e-10;
+
+    FluxOrder2SlopeKernel<<<gridCellSys, threads>>>( _nCells,
+                                                     _localNSys,
+                                                     _nNbr,
+                                                     _nDim,
+                                                     _device->areas,
+                                                     _device->normals,
+                                                     _device->neighbors,
+                                                     _device->boundaryTypes,
+                                                     _device->sol,
+                                                     _device->solDx,
+                                                     _device->limiter,
+                                                     static_cast<int>( BOUNDARY_TYPE::NONE ) );
+    CheckHip( hipGetLastError(), "FluxOrder2SlopeKernel launch" );
+
+    FluxOrder2LimiterKernel<<<gridCellSys, threads>>>( _nCells,
+                                                       _localNSys,
+                                                       _nNbr,
+                                                       _nDim,
+                                                       _device->areas,
+                                                       _device->neighbors,
+                                                       _device->boundaryTypes,
+                                                       _device->relativeCellVertices,
+                                                       _device->sol,
+                                                       _device->solDx,
+                                                       _device->limiter,
+                                                       static_cast<int>( BOUNDARY_TYPE::NONE ),
+                                                       eps );
+    CheckHip( hipGetLastError(), "FluxOrder2LimiterKernel launch" );
+
+    FluxOrder2Kernel<<<gridCellSys, threads>>>( _nCells,
+                                                _localNSys,
+                                                _nNbr,
+                                                _nDim,
+                                                _device->quadPts,
+                                                _device->normals,
+                                                _device->neighbors,
+                                                _device->boundaryTypes,
+                                                _device->ghostCellValues,
+                                                _device->interfaceMidPoints,
+                                                _device->cellMidPoints,
+                                                _device->relativeInterfaceMid,
+                                                _device->sol,
+                                                _device->solDx,
+                                                _device->limiter,
+                                                _device->flux,
+                                                static_cast<int>( BOUNDARY_TYPE::NEUMANN ) );
+    CheckHip( hipGetLastError(), "FluxOrder2Kernel launch" );
+}
+
+void SNSolverHPCHIP::FluxOrder1() {
+    CheckHip( hipSetDevice( _hipDeviceId ), "hipSetDevice" );
+    const unsigned long nCellSys = _nCells * _localNSys;
+    const dim3 threads( HIP_BLOCK_SIZE );
+    const dim3 gridCellSys( static_cast<unsigned>( ( nCellSys + HIP_BLOCK_SIZE - 1 ) / HIP_BLOCK_SIZE ) );
+
+    FluxOrder1Kernel<<<gridCellSys, threads>>>( _nCells,
+                                                _localNSys,
+                                                _nNbr,
+                                                _nDim,
+                                                _device->quadPts,
+                                                _device->normals,
+                                                _device->neighbors,
+                                                _device->boundaryTypes,
+                                                _device->ghostCellValues,
+                                                _device->sol,
+                                                _device->flux,
+                                                static_cast<int>( BOUNDARY_TYPE::NEUMANN ) );
+    CheckHip( hipGetLastError(), "FluxOrder1Kernel launch" );
+}
+
+void SNSolverHPCHIP::FVMUpdate() {
+    CheckHip( hipSetDevice( _hipDeviceId ), "hipSetDevice" );
+
+    const dim3 threads( HIP_BLOCK_SIZE );
+    const dim3 gridCells( static_cast<unsigned>( ( _nCells + HIP_BLOCK_SIZE - 1 ) / HIP_BLOCK_SIZE ) );
+    const double invTwoPi = 1.0 / static_cast<double>( 2.0L * PI );
+
+    FVMUpdateKernel<<<gridCells, threads>>>( _nCells,
+                                             _localNSys,
+                                             _dT,
+                                             _device->sigmaS,
+                                             _device->sigmaT,
+                                             _device->source,
+                                             _device->areas,
+                                             _device->quadWeights,
+                                             _device->flux,
+                                             _device->sol,
+                                             _device->scalarFlux,
+                                             invTwoPi );
+    CheckHip( hipGetLastError(), "FVMUpdateKernel launch" );
+    CheckHip( hipMemcpy( _scalarFlux.data(), _device->scalarFlux, static_cast<std::size_t>( _nCells ) * sizeof( double ), hipMemcpyDeviceToHost ),
+              "download scalar flux in FVMUpdate" );
+
+#ifdef IMPORT_MPI
+    std::vector<double> tempScalarFlux( _scalarFlux );
+    MPI_Barrier( MPI_COMM_WORLD );
+    MPI_Allreduce( tempScalarFlux.data(), _scalarFlux.data(), _nCells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+    MPI_Barrier( MPI_COMM_WORLD );
+    CheckHip( hipMemcpy( _device->scalarFlux, _scalarFlux.data(), static_cast<std::size_t>( _nCells ) * sizeof( double ), hipMemcpyHostToDevice ),
+              "sync allreduced scalar flux to device" );
+#endif
+}
+
+void SNSolverHPCHIP::IterPostprocessing() {
+    // ALREDUCE NEEDED
+    _mass    = 0.0;
+    _rmsFlux = 0.0;
+#pragma omp parallel for reduction( + : _mass, _rmsFlux )
+    for( unsigned long idx_cell = 0; idx_cell < _nCells; ++idx_cell ) {
+        _mass += _scalarFlux[idx_cell] * _areas[idx_cell];
+        double diff = _scalarFlux[idx_cell] - _scalarFluxPrevIter[idx_cell];
+        _rmsFlux += diff * diff;
+    }
+
+    _curAbsorptionLattice            = 0.0;
+    _curScalarOutflow                = 0.0;
+    _curScalarOutflowPeri1           = 0.0;
+    _curScalarOutflowPeri2           = 0.0;
+    _curAbsorptionHohlraumCenter     = 0.0;    // Green and blue areas of symmetric hohlraum
+    _curAbsorptionHohlraumVertical   = 0.0;    // Red areas of symmetric hohlraum
+    _curAbsorptionHohlraumHorizontal = 0.0;    // Black areas of symmetric hohlraum
+    _varAbsorptionHohlraumGreen      = 0.0;
+    double a_g                       = 0.0;
+
+#pragma omp parallel for reduction( + : _curAbsorptionLattice,                                                                                       \
+                                        _curScalarOutflow,                                                                                           \
+                                        _curScalarOutflowPeri1,                                                                                      \
+                                        _curScalarOutflowPeri2,                                                                                      \
+                                        _curAbsorptionHohlraumCenter,                                                                                \
+                                        _curAbsorptionHohlraumVertical,                                                                              \
+                                        _curAbsorptionHohlraumHorizontal,                                                                            \
+                                        a_g ) reduction( max : _curMaxOrdinateOutflow, _curMaxAbsorptionLattice )
+    for( unsigned long idx_cell = 0; idx_cell < _nCells; ++idx_cell ) {
+
+        if( _settings->GetProblemName() == PROBLEM_Lattice || _settings->GetProblemName() == PROBLEM_HalfLattice ) {
+            if( IsAbsorptionLattice( _cellMidPoints[Idx2D( idx_cell, 0, _nDim )], _cellMidPoints[Idx2D( idx_cell, 1, _nDim )] ) ) {
+                double sigmaAPsi = _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) * _areas[idx_cell];
+                _curAbsorptionLattice += sigmaAPsi;
+                _curMaxAbsorptionLattice = ( _curMaxAbsorptionLattice < sigmaAPsi ) ? sigmaAPsi : _curMaxAbsorptionLattice;
+            }
+        }
+
+        if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) {    //} || _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) {
+
+            double x = _cellMidPoints[Idx2D( idx_cell, 0, _nDim )];
+            double y = _cellMidPoints[Idx2D( idx_cell, 1, _nDim )];
+            _curAbsorptionLattice += _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) * _areas[idx_cell];
+            if( x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() &&
+                y > -0.4 + _settings->GetPosYCenterGreenHohlraum() && y < 0.4 + _settings->GetPosYCenterGreenHohlraum() ) {
+                _curAbsorptionHohlraumCenter += _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) * _areas[idx_cell];
+            }
+            if( ( x < _settings->GetPosRedLeftBorderHohlraum() && y > _settings->GetPosRedLeftBottomHohlraum() &&
+                  y < _settings->GetPosRedLeftTopHohlraum() ) ||
+                ( x > _settings->GetPosRedRightBorderHohlraum() && y > _settings->GetPosRedLeftBottomHohlraum() &&
+                  y < _settings->GetPosRedRightTopHohlraum() ) ) {
+                _curAbsorptionHohlraumVertical += _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) * _areas[idx_cell];
+            }
+            if( y > 0.6 || y < -0.6 ) {
+                _curAbsorptionHohlraumHorizontal += _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) * _areas[idx_cell];
+            }
+
+            // Variation in absorption of center (part 1)
+            // green area 1 (lower boundary)
+            bool green1 = x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() &&
+                          y > -0.4 + _settings->GetPosYCenterGreenHohlraum() && y < -0.35 + _settings->GetPosYCenterGreenHohlraum();
+            // green area 2 (upper boundary)
+            bool green2 = x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() &&
+                          y > 0.35 + _settings->GetPosYCenterGreenHohlraum() && y < 0.4 + _settings->GetPosYCenterGreenHohlraum();
+            // green area 3 (left boundary)
+            bool green3 = x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < -0.15 + _settings->GetPosXCenterGreenHohlraum() &&
+                          y > -0.35 + _settings->GetPosYCenterGreenHohlraum() && y < 0.35 + _settings->GetPosYCenterGreenHohlraum();
+            // green area 4 (right boundary)
+            bool green4 = x > 0.15 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() &&
+                          y > -0.35 + _settings->GetPosYCenterGreenHohlraum() && y < 0.35 + _settings->GetPosYCenterGreenHohlraum();
+            if( green1 || green2 || green3 || green4 ) {
+                a_g += ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) * _scalarFlux[idx_cell] * _areas[idx_cell] /
+                       ( 44 * 0.05 * 0.05 );    // divisor is area of the green
+            }
+        }
+
+        if( _settings->GetProblemName() == PROBLEM_Lattice ) {
+            // Outflow out of inner and middle perimeter
+            if( _isPerimeterLatticeCell1[idx_cell] ) {    // inner
+                for( unsigned long idx_nbr_helper = 0; idx_nbr_helper < _cellsLatticePerimeter1[idx_cell].size(); ++idx_nbr_helper ) {
+#pragma omp simd reduction( + : _curScalarOutflowPeri1 )
+                    for( unsigned long idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) {
+                        double localInner = _quadPts[Idx2D( idx_sys, 0, _nDim )] *
+                                                _normals[Idx3D( idx_cell, _cellsLatticePerimeter1[idx_cell][idx_nbr_helper], 0, _nNbr, _nDim )] +
+                                            _quadPts[Idx2D( idx_sys, 1, _nDim )] *
+                                                _normals[Idx3D( idx_cell, _cellsLatticePerimeter1[idx_cell][idx_nbr_helper], 1, _nNbr, _nDim )];
+                        // Find outward facing transport directions
+
+                        if( localInner > 0.0 ) {
+                            _curScalarOutflowPeri1 +=
+                                localInner * _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys];    // Integrate flux
+                        }
+                    }
+                }
+            }
+            if( _isPerimeterLatticeCell2[idx_cell] ) {    // middle
+                for( unsigned long idx_nbr_helper = 0; idx_nbr_helper < _cellsLatticePerimeter2[idx_cell].size(); ++idx_nbr_helper ) {
+#pragma omp simd reduction( + : _curScalarOutflowPeri2 )
+                    for( unsigned long idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) {
+                        double localInner = _quadPts[Idx2D( idx_sys, 0, _nDim )] *
+                                                _normals[Idx3D( idx_cell, _cellsLatticePerimeter2[idx_cell][idx_nbr_helper], 0, _nNbr, _nDim )] +
+                                            _quadPts[Idx2D( idx_sys, 1, _nDim )] *
+                                                _normals[Idx3D( idx_cell, _cellsLatticePerimeter2[idx_cell][idx_nbr_helper], 1, _nNbr, _nDim )];
+                        // Find outward facing transport directions
+
+                        if( localInner > 0.0 ) {
+                            _curScalarOutflowPeri2 +=
+                                localInner * _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys];    // Integrate flux
+                        }
+                    }
+                }
+            }
+        }
+        // Outflow out of domain
+        if( _cellBoundaryTypes[idx_cell] == BOUNDARY_TYPE::NEUMANN ) {
+            // Iterate over face cell faces
+            double currOrdinatewiseOutflow = 0.0;
+
+            for( unsigned long idx_nbr = 0; idx_nbr < _nNbr; ++idx_nbr ) {
+                // Find face that points outward
+                if( _neighbors[Idx2D( idx_cell, idx_nbr, _nNbr )] == _nCells ) {
+#pragma omp simd reduction( + : _curScalarOutflow ) reduction( max : _curMaxOrdinateOutflow )
+                    for( unsigned long idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) {
+
+                        double localInner = _quadPts[Idx2D( idx_sys, 0, _nDim )] * _normals[Idx3D( idx_cell, idx_nbr, 0, _nNbr, _nDim )] +
+                                            _quadPts[Idx2D( idx_sys, 1, _nDim )] * _normals[Idx3D( idx_cell, idx_nbr, 1, _nNbr, _nDim )];
+                        // Find outward facing transport directions
+
+                        if( localInner > 0.0 ) {
+                            _curScalarOutflow +=
+                                localInner * _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys];    // Integrate flux
+
+                            currOrdinatewiseOutflow =
+                                _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * localInner /
+                                sqrt( (
+                                    _normals[Idx3D( idx_cell, idx_nbr, 0, _nNbr, _nDim )] * _normals[Idx3D( idx_cell, idx_nbr, 0, _nNbr, _nDim )] +
+                                    _normals[Idx3D( idx_cell, idx_nbr, 1, _nNbr, _nDim )] * _normals[Idx3D( idx_cell, idx_nbr, 1, _nNbr, _nDim )] ) );
+
+                            _curMaxOrdinateOutflow =
+                                ( currOrdinatewiseOutflow > _curMaxOrdinateOutflow ) ? currOrdinatewiseOutflow : _curMaxOrdinateOutflow;
+                        }
+                    }
+                }
+            }
+        }
+    }
+// MPI Allreduce
+#ifdef IMPORT_MPI
+    double tmp_curScalarOutflow      = 0.0;
+    double tmp_curScalarOutflowPeri1 = 0.0;
+    double tmp_curScalarOutflowPeri2 = 0.0;
+    double tmp_curMaxOrdinateOutflow = 0.0;
+    MPI_Barrier( MPI_COMM_WORLD );
+    MPI_Allreduce( &_curScalarOutflow, &tmp_curScalarOutflow, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+    _curScalarOutflow = tmp_curScalarOutflow;
+    MPI_Barrier( MPI_COMM_WORLD );
+    MPI_Allreduce( &_curScalarOutflowPeri1, &tmp_curScalarOutflowPeri1, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+    _curScalarOutflowPeri1 = tmp_curScalarOutflowPeri1;
+    MPI_Barrier( MPI_COMM_WORLD );
+    MPI_Allreduce( &_curScalarOutflowPeri2, &tmp_curScalarOutflowPeri2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+    _curScalarOutflowPeri2 = tmp_curScalarOutflowPeri2;
+    MPI_Barrier( MPI_COMM_WORLD );
+    MPI_Allreduce( &_curMaxOrdinateOutflow, &tmp_curMaxOrdinateOutflow, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD );
+    _curMaxOrdinateOutflow = tmp_curMaxOrdinateOutflow;
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+    // Variation absorption (part II)
+    if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) {
+        unsigned n_probes = 4;
+        std::vector<double> temp_probingMoments( 3 * n_probes );    // for MPI allreduce
+
+#pragma omp parallel for reduction( + : _varAbsorptionHohlraumGreen )
+        for( unsigned long idx_cell = 0; idx_cell < _nCells; ++idx_cell ) {
+            double x = _cellMidPoints[Idx2D( idx_cell, 0, _nDim )];
+            double y = _cellMidPoints[Idx2D( idx_cell, 1, _nDim )];
+
+            // green area 1 (lower boundary)
+            bool green1 = x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() &&
+                          y > -0.4 + _settings->GetPosYCenterGreenHohlraum() && y < -0.35 + _settings->GetPosYCenterGreenHohlraum();
+            // green area 2 (upper boundary)
+            bool green2 = x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() &&
+                          y > 0.35 + _settings->GetPosYCenterGreenHohlraum() && y < 0.4 + _settings->GetPosYCenterGreenHohlraum();
+            // green area 3 (left boundary)
+            bool green3 = x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < -0.15 + _settings->GetPosXCenterGreenHohlraum() &&
+                          y > -0.35 + _settings->GetPosYCenterGreenHohlraum() && y < 0.35 + _settings->GetPosYCenterGreenHohlraum();
+            // green area 4 (right boundary)
+            bool green4 = x > 0.15 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() &&
+                          y > -0.35 + _settings->GetPosYCenterGreenHohlraum() && y < 0.35 + _settings->GetPosYCenterGreenHohlraum();
+            if( green1 || green2 || green3 || green4 ) {
+                _varAbsorptionHohlraumGreen += ( a_g - _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) ) *
+                                               ( a_g - _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) ) * _areas[idx_cell];
+            }
+        }
+        // Probes value moments
+        // #pragma omp parallel for
+        for( unsigned long idx_probe = 0; idx_probe < n_probes; idx_probe++ ) {    // Loop over probing cells
+            temp_probingMoments[Idx2D( idx_probe, 0, 3 )] = 0.0;
+            temp_probingMoments[Idx2D( idx_probe, 1, 3 )] = 0.0;
+            temp_probingMoments[Idx2D( idx_probe, 2, 3 )] = 0.0;
+            // for( unsigned long idx_ball = 0; idx_ball < _probingCellsHohlraum[idx_probe].size(); idx_ball++ ) {
+            //   std::cout << idx_ball << _areas[_probingCellsHohlraum[idx_probe][idx_ball]] / ( 0.01 * 0.01 * M_PI ) << std::endl;
+            //}
+            for( unsigned long idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) {
+                for( unsigned long idx_ball = 0; idx_ball < _probingCellsHohlraum[idx_probe].size(); idx_ball++ ) {
+                    temp_probingMoments[Idx2D( idx_probe, 0, 3 )] += _sol[Idx2D( _probingCellsHohlraum[idx_probe][idx_ball], idx_sys, _localNSys )] *
+                                                                     _quadWeights[idx_sys] * _areas[_probingCellsHohlraum[idx_probe][idx_ball]] /
+                                                                     ( 0.01 * 0.01 * M_PI );
+                    temp_probingMoments[Idx2D( idx_probe, 1, 3 )] +=
+                        _quadPts[Idx2D( idx_sys, 0, _nDim )] * _sol[Idx2D( _probingCellsHohlraum[idx_probe][idx_ball], idx_sys, _localNSys )] *
+                        _quadWeights[idx_sys] * _areas[_probingCellsHohlraum[idx_probe][idx_ball]] / ( 0.01 * 0.01 * M_PI );
+                    temp_probingMoments[Idx2D( idx_probe, 2, 3 )] +=
+                        _quadPts[Idx2D( idx_sys, 1, _nDim )] * _sol[Idx2D( _probingCellsHohlraum[idx_probe][idx_ball], idx_sys, _localNSys )] *
+                        _quadWeights[idx_sys] * _areas[_probingCellsHohlraum[idx_probe][idx_ball]] / ( 0.01 * 0.01 * M_PI );
+                }
+            }
+        }
+
+#ifdef IMPORT_MPI
+        MPI_Barrier( MPI_COMM_WORLD );
+        MPI_Allreduce( temp_probingMoments.data(), _probingMoments.data(), 3 * n_probes, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+#ifndef IMPORT_MPI
+        for( unsigned long idx_probe = 0; idx_probe < n_probes; idx_probe++ ) {    // Loop over probing cells
+            _probingMoments[Idx2D( idx_probe, 0, 3 )] = temp_probingMoments[Idx2D( idx_probe, 0, 3 )];
+            _probingMoments[Idx2D( idx_probe, 1, 3 )] = temp_probingMoments[Idx2D( idx_probe, 1, 3 )];
+            _probingMoments[Idx2D( idx_probe, 2, 3 )] = temp_probingMoments[Idx2D( idx_probe, 2, 3 )];
+        }
+#endif
+    }
+    // probe values green
+    if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) {
+        ComputeQOIsGreenProbingLine();
+    }
+    // Update time integral values on rank 0
+    if( _rank == 0 ) {
+        _totalScalarOutflow += _curScalarOutflow * _dT;
+        _totalScalarOutflowPeri1 += _curScalarOutflowPeri1 * _dT;
+        _totalScalarOutflowPeri2 += _curScalarOutflowPeri2 * _dT;
+        _totalAbsorptionLattice += _curAbsorptionLattice * _dT;
+
+        _totalAbsorptionHohlraumCenter += _curAbsorptionHohlraumCenter * _dT;
+        _totalAbsorptionHohlraumVertical += _curAbsorptionHohlraumVertical * _dT;
+        _totalAbsorptionHohlraumHorizontal += _curAbsorptionHohlraumHorizontal * _dT;
+
+        _rmsFlux = sqrt( _rmsFlux );
+    }
+}
+
+bool SNSolverHPCHIP::IsAbsorptionLattice( double x, double y ) const {
+    // Check whether pos is inside absorbing squares
+    double xy_corrector = -3.5;
+    std::vector<double> lbounds{ 1 + xy_corrector, 2 + xy_corrector, 3 + xy_corrector, 4 + xy_corrector, 5 + xy_corrector };
+    std::vector<double> ubounds{ 2 + xy_corrector, 3 + xy_corrector, 4 + xy_corrector, 5 + xy_corrector, 6 + xy_corrector };
+    for( unsigned k = 0; k < lbounds.size(); ++k ) {
+        for( unsigned l = 0; l < lbounds.size(); ++l ) {
+            if( ( l + k ) % 2 == 1 || ( k == 2 && l == 2 ) || ( k == 2 && l == 4 ) ) continue;
+            if( x >= lbounds[k] && x <= ubounds[k] && y >= lbounds[l] && y <= ubounds[l] ) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+// --- Helper ---
+double SNSolverHPCHIP::ComputeTimeStep( double cfl ) const {
+    // for pseudo 1D, set timestep to dx
+    double dx, dy;
+    switch( _settings->GetProblemName() ) {
+        case PROBLEM_Checkerboard1D:
+            dx = 7.0 / (double)_nCells;
+            dy = 0.3;
+            return cfl * ( dx * dy ) / ( dx + dy );
+            break;
+        case PROBLEM_Linesource1D:     // Fallthrough
+        case PROBLEM_Meltingcube1D:    // Fallthrough
+        case PROBLEM_Aircavity1D:
+            dx = 3.0 / (double)_nCells;
+            dy = 0.3;
+            return cfl * ( dx * dy ) / ( dx + dy );
+            break;
+        default: break;    // 2d as normal
+    }
+    // 2D case
+    double charSize = __DBL_MAX__;    // minimum char size of all mesh cells in the mesh
+
+#pragma omp parallel for reduction( min : charSize )
+    for( unsigned long j = 0; j < _nCells; j++ ) {
+        double currCharSize = sqrt( _areas[j] );
+        if( currCharSize < charSize ) {
+            charSize = currCharSize;
+        }
+    }
+    if( _rank == 0 ) {
+        auto log         = spdlog::get( "event" );
+        std::string line = "| Smallest characteristic length of a grid cell in this mesh: " + std::to_string( charSize );
+        log->info( line );
+        line = "| Corresponding maximal time-step: " + std::to_string( cfl * charSize );
+        log->info( line );
+    }
+    return cfl * charSize;
+}
+
+// --- IO ----
+void SNSolverHPCHIP::PrepareScreenOutput() {
+    unsigned nFields = (unsigned)_settings->GetNScreenOutput();
+
+    _screenOutputFieldNames.resize( nFields );
+    _screenOutputFields.resize( nFields );
+
+    // Prepare all output Fields ==> Specified in option SCREEN_OUTPUT
+    for( unsigned idx_field = 0; idx_field < nFields; idx_field++ ) {
+        // Prepare all Output Fields per group
+
+        // Different procedure, depending on the Group...
+        switch( _settings->GetScreenOutput()[idx_field] ) {
+            case MASS: _screenOutputFieldNames[idx_field] = "Mass"; break;
+            case ITER: _screenOutputFieldNames[idx_field] = "Iter"; break;
+            case SIM_TIME: _screenOutputFieldNames[idx_field] = "Sim time"; break;
+            case WALL_TIME: _screenOutputFieldNames[idx_field] = "Wall time [s]"; break;
+            case RMS_FLUX: _screenOutputFieldNames[idx_field] = "RMS flux"; break;
+            case VTK_OUTPUT: _screenOutputFieldNames[idx_field] = "VTK out"; break;
+            case CSV_OUTPUT: _screenOutputFieldNames[idx_field] = "CSV out"; break;
+            case CUR_OUTFLOW: _screenOutputFieldNames[idx_field] = "Cur. outflow"; break;
+            case TOTAL_OUTFLOW: _screenOutputFieldNames[idx_field] = "Tot. outflow"; break;
+            case CUR_OUTFLOW_P1: _screenOutputFieldNames[idx_field] = "Cur. outflow P1"; break;
+            case TOTAL_OUTFLOW_P1: _screenOutputFieldNames[idx_field] = "Tot. outflow P1"; break;
+            case CUR_OUTFLOW_P2: _screenOutputFieldNames[idx_field] = "Cur. outflow P2"; break;
+            case TOTAL_OUTFLOW_P2: _screenOutputFieldNames[idx_field] = "Tot. outflow P2"; break;
+            case MAX_OUTFLOW: _screenOutputFieldNames[idx_field] = "Max outflow"; break;
+            case CUR_PARTICLE_ABSORPTION: _screenOutputFieldNames[idx_field] = "Cur. absorption"; break;
+            case TOTAL_PARTICLE_ABSORPTION: _screenOutputFieldNames[idx_field] = "Tot. absorption"; break;
+            case MAX_PARTICLE_ABSORPTION: _screenOutputFieldNames[idx_field] = "Max absorption"; break;
+            case TOTAL_PARTICLE_ABSORPTION_CENTER: _screenOutputFieldNames[idx_field] = "Tot. abs. center"; break;
+            case TOTAL_PARTICLE_ABSORPTION_VERTICAL: _screenOutputFieldNames[idx_field] = "Tot. abs. vertical wall"; break;
+            case TOTAL_PARTICLE_ABSORPTION_HORIZONTAL: _screenOutputFieldNames[idx_field] = "Tot. abs. horizontal wall"; break;
+            case PROBE_MOMENT_TIME_TRACE:
+                _screenOutputFieldNames[idx_field] = "Probe 1 u_0";
+                idx_field++;
+                _screenOutputFieldNames[idx_field] = "Probe 2 u_0";
+                if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) {
+                    idx_field++;
+                    _screenOutputFieldNames[idx_field] = "Probe 3 u_0";
+                    idx_field++;
+                    _screenOutputFieldNames[idx_field] = "Probe 4 u_0";
+                }
+                break;
+            case VAR_ABSORPTION_GREEN: _screenOutputFieldNames[idx_field] = "Var. absorption green"; break;
+
+            default: ErrorMessages::Error( "Screen output field not defined!", CURRENT_FUNCTION ); break;
+        }
+    }
+}
+
+void SNSolverHPCHIP::WriteScalarOutput( unsigned idx_iter ) {
+    unsigned n_probes = 4;
+
+    unsigned nFields                  = (unsigned)_settings->GetNScreenOutput();
+    const VectorVector probingMoments = _problem->GetCurrentProbeMoment();
+    // -- Screen Output
+    for( unsigned idx_field = 0; idx_field < nFields; idx_field++ ) {
+        // Prepare all Output Fields per group
+        // Different procedure, depending on the Group...
+        switch( _settings->GetScreenOutput()[idx_field] ) {
+            case MASS: _screenOutputFields[idx_field] = _mass; break;
+            case ITER: _screenOutputFields[idx_field] = idx_iter; break;
+            case SIM_TIME: _screenOutputFields[idx_field] = _curSimTime; break;
+            case WALL_TIME: _screenOutputFields[idx_field] = _currTime; break;
+            case RMS_FLUX: _screenOutputFields[idx_field] = _rmsFlux; break;
+            case VTK_OUTPUT:
+                _screenOutputFields[idx_field] = 0;
+                if( ( _settings->GetVolumeOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetVolumeOutputFrequency() == 0 ) ||
+                    ( idx_iter == _nIter - 1 ) /* need sol at last iteration */ ) {
+                    _screenOutputFields[idx_field] = 1;
+                }
+                break;
+            case CSV_OUTPUT:
+                _screenOutputFields[idx_field] = 0;
+                if( ( _settings->GetHistoryOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetHistoryOutputFrequency() == 0 ) ||
+                    ( idx_iter == _nIter - 1 ) /* need sol at last iteration */ ) {
+                    _screenOutputFields[idx_field] = 1;
+                }
+                break;
+            case CUR_OUTFLOW: _screenOutputFields[idx_field] = _curScalarOutflow; break;
+            case TOTAL_OUTFLOW: _screenOutputFields[idx_field] = _totalScalarOutflow; break;
+            case CUR_OUTFLOW_P1: _screenOutputFields[idx_field] = _curScalarOutflowPeri1; break;
+            case TOTAL_OUTFLOW_P1: _screenOutputFields[idx_field] = _totalScalarOutflowPeri1; break;
+            case CUR_OUTFLOW_P2: _screenOutputFields[idx_field] = _curScalarOutflowPeri2; break;
+            case TOTAL_OUTFLOW_P2: _screenOutputFields[idx_field] = _totalScalarOutflowPeri2; break;
+            case MAX_OUTFLOW: _screenOutputFields[idx_field] = _curMaxOrdinateOutflow; break;
+            case CUR_PARTICLE_ABSORPTION: _screenOutputFields[idx_field] = _curAbsorptionLattice; break;
+            case TOTAL_PARTICLE_ABSORPTION: _screenOutputFields[idx_field] = _totalAbsorptionLattice; break;
+            case MAX_PARTICLE_ABSORPTION: _screenOutputFields[idx_field] = _curMaxAbsorptionLattice; break;
+            case TOTAL_PARTICLE_ABSORPTION_CENTER: _screenOutputFields[idx_field] = _totalAbsorptionHohlraumCenter; break;
+            case TOTAL_PARTICLE_ABSORPTION_VERTICAL: _screenOutputFields[idx_field] = _totalAbsorptionHohlraumVertical; break;
+            case TOTAL_PARTICLE_ABSORPTION_HORIZONTAL: _screenOutputFields[idx_field] = _totalAbsorptionHohlraumHorizontal; break;
+            case PROBE_MOMENT_TIME_TRACE:
+                if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) n_probes = 4;
+                // if( _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) n_probes = 2;
+                for( unsigned i = 0; i < n_probes; i++ ) {
+                    _screenOutputFields[idx_field] = _probingMoments[Idx2D( i, 0, 3 )];
+                    idx_field++;
+                }
+                idx_field--;
+                break;
+            case VAR_ABSORPTION_GREEN: _screenOutputFields[idx_field] = _varAbsorptionHohlraumGreen; break;
+            default: ErrorMessages::Error( "Screen output group not defined!", CURRENT_FUNCTION ); break;
+        }
+    }
+
+    // --- History output ---
+    nFields = (unsigned)_settings->GetNHistoryOutput();
+
+    std::vector<SCALAR_OUTPUT> screenOutputFields = _settings->GetScreenOutput();
+    for( unsigned idx_field = 0; idx_field < nFields; idx_field++ ) {
+
+        // Prepare all Output Fields per group
+        // Different procedure, depending on the Group...
+        switch( _settings->GetHistoryOutput()[idx_field] ) {
+            case MASS: _historyOutputFields[idx_field] = _mass; break;
+            case ITER: _historyOutputFields[idx_field] = idx_iter; break;
+            case SIM_TIME: _historyOutputFields[idx_field] = _curSimTime; break;
+            case WALL_TIME: _historyOutputFields[idx_field] = _currTime; break;
+            case RMS_FLUX: _historyOutputFields[idx_field] = _rmsFlux; break;
+            case VTK_OUTPUT:
+                _historyOutputFields[idx_field] = 0;
+                if( ( _settings->GetVolumeOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetVolumeOutputFrequency() == 0 ) ||
+                    ( idx_iter == _nIter - 1 ) /* need sol at last iteration */ ) {
+                    _historyOutputFields[idx_field] = 1;
+                }
+                break;
+
+            case CSV_OUTPUT:
+                _historyOutputFields[idx_field] = 0;
+                if( ( _settings->GetHistoryOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetHistoryOutputFrequency() == 0 ) ||
+                    ( idx_iter == _nIter - 1 ) /* need sol at last iteration */ ) {
+                    _historyOutputFields[idx_field] = 1;
+                }
+                break;
+            case CUR_OUTFLOW: _historyOutputFields[idx_field] = _curScalarOutflow; break;
+            case TOTAL_OUTFLOW: _historyOutputFields[idx_field] = _totalScalarOutflow; break;
+            case CUR_OUTFLOW_P1: _historyOutputFields[idx_field] = _curScalarOutflowPeri1; break;
+            case TOTAL_OUTFLOW_P1: _historyOutputFields[idx_field] = _totalScalarOutflowPeri1; break;
+            case CUR_OUTFLOW_P2: _historyOutputFields[idx_field] = _curScalarOutflowPeri2; break;
+            case TOTAL_OUTFLOW_P2: _historyOutputFields[idx_field] = _totalScalarOutflowPeri2; break;
+            case MAX_OUTFLOW: _historyOutputFields[idx_field] = _curMaxOrdinateOutflow; break;
+            case CUR_PARTICLE_ABSORPTION: _historyOutputFields[idx_field] = _curAbsorptionLattice; break;
+            case TOTAL_PARTICLE_ABSORPTION: _historyOutputFields[idx_field] = _totalAbsorptionLattice; break;
+            case MAX_PARTICLE_ABSORPTION: _historyOutputFields[idx_field] = _curMaxAbsorptionLattice; break;
+            case TOTAL_PARTICLE_ABSORPTION_CENTER: _historyOutputFields[idx_field] = _totalAbsorptionHohlraumCenter; break;
+            case TOTAL_PARTICLE_ABSORPTION_VERTICAL: _historyOutputFields[idx_field] = _totalAbsorptionHohlraumVertical; break;
+            case TOTAL_PARTICLE_ABSORPTION_HORIZONTAL: _historyOutputFields[idx_field] = _totalAbsorptionHohlraumHorizontal; break;
+            case PROBE_MOMENT_TIME_TRACE:
+                if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) n_probes = 4;
+                // if( _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) n_probes = 2;
+                for( unsigned i = 0; i < n_probes; i++ ) {
+                    for( unsigned j = 0; j < 3; j++ ) {
+                        _historyOutputFields[idx_field] = _probingMoments[Idx2D( i, j, 3 )];
+                        idx_field++;
+                    }
+                }
+                idx_field--;
+                break;
+            case VAR_ABSORPTION_GREEN: _historyOutputFields[idx_field] = _varAbsorptionHohlraumGreen; break;
+            case ABSORPTION_GREEN_LINE:
+                for( unsigned i = 0; i < _settings->GetNumProbingCellsLineHohlraum(); i++ ) {
+                    _historyOutputFields[idx_field] = _absorptionValsLineSegment[i];
+                    idx_field++;
+                }
+                idx_field--;
+                break;
+            case ABSORPTION_GREEN_BLOCK:
+                for( unsigned i = 0; i < 44; i++ ) {
+                    _historyOutputFields[idx_field] = _absorptionValsBlocksGreen[i];
+                    // std::cout << _absorptionValsBlocksGreen[i] << "/" << _historyOutputFields[idx_field] << std::endl;
+                    idx_field++;
+                }
+                idx_field--;
+                break;
+            default: ErrorMessages::Error( "History output group not defined!", CURRENT_FUNCTION ); break;
+        }
+    }
+}
+
+void SNSolverHPCHIP::PrintScreenOutput( unsigned idx_iter ) {
+    auto log = spdlog::get( "event" );
+
+    unsigned strLen  = 15;    // max width of one column
+    char paddingChar = ' ';
+
+    // assemble the line to print
+    std::string lineToPrint = "| ";
+    std::string tmp;
+    for( unsigned idx_field = 0; idx_field < _settings->GetNScreenOutput(); idx_field++ ) {
+        tmp = std::to_string( _screenOutputFields[idx_field] );
+
+        // Format outputs correctly
+        std::vector<SCALAR_OUTPUT> integerFields    = { ITER };
+        std::vector<SCALAR_OUTPUT> scientificFields = { RMS_FLUX,
+                                                        MASS,
+                                                        WALL_TIME,
+                                                        CUR_OUTFLOW,
+                                                        TOTAL_OUTFLOW,
+                                                        CUR_OUTFLOW_P1,
+                                                        TOTAL_OUTFLOW_P1,
+                                                        CUR_OUTFLOW_P2,
+                                                        TOTAL_OUTFLOW_P2,
+                                                        MAX_OUTFLOW,
+                                                        CUR_PARTICLE_ABSORPTION,
+                                                        TOTAL_PARTICLE_ABSORPTION,
+                                                        MAX_PARTICLE_ABSORPTION,
+                                                        TOTAL_PARTICLE_ABSORPTION_CENTER,
+                                                        TOTAL_PARTICLE_ABSORPTION_VERTICAL,
+                                                        TOTAL_PARTICLE_ABSORPTION_HORIZONTAL,
+                                                        PROBE_MOMENT_TIME_TRACE,
+                                                        VAR_ABSORPTION_GREEN,
+                                                        ABSORPTION_GREEN_BLOCK,
+                                                        ABSORPTION_GREEN_LINE };
+        std::vector<SCALAR_OUTPUT> booleanFields    = { VTK_OUTPUT, CSV_OUTPUT };
+
+        if( !( integerFields.end() == std::find( integerFields.begin(), integerFields.end(), _settings->GetScreenOutput()[idx_field] ) ) ) {
+            tmp = std::to_string( (int)_screenOutputFields[idx_field] );
+        }
+        else if( !( booleanFields.end() == std::find( booleanFields.begin(), booleanFields.end(), _settings->GetScreenOutput()[idx_field] ) ) ) {
+            tmp = "no";
+            if( (bool)_screenOutputFields[idx_field] ) tmp = "yes";
+        }
+        else if( !( scientificFields.end() ==
+                    std::find( scientificFields.begin(), scientificFields.end(), _settings->GetScreenOutput()[idx_field] ) ) ) {
+
+            std::stringstream ss;
+            ss << TextProcessingToolbox::DoubleToScientificNotation( _screenOutputFields[idx_field] );
+            tmp = ss.str();
+            tmp.erase( std::remove( tmp.begin(), tmp.end(), '+' ), tmp.end() );    // removing the '+' sign
+        }
+
+        if( strLen > tmp.size() )    // Padding
+            tmp.insert( 0, strLen - tmp.size(), paddingChar );
+        else if( strLen < tmp.size() )    // Cutting
+            tmp.resize( strLen );
+
+        lineToPrint += tmp + " |";
+    }
+    if( _settings->GetScreenOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetScreenOutputFrequency() == 0 ) {
+        log->info( lineToPrint );
+    }
+    else if( idx_iter == _nIter - 1 ) {    // Always print last iteration
+        log->info( lineToPrint );
+    }
+}
+
+void SNSolverHPCHIP::PrepareHistoryOutput() {
+    unsigned n_probes = 4;
+
+    unsigned nFields = (unsigned)_settings->GetNHistoryOutput();
+
+    _historyOutputFieldNames.resize( nFields );
+    _historyOutputFields.resize( nFields );
+
+    // Prepare all output Fields ==> Specified in option SCREEN_OUTPUT
+    for( unsigned idx_field = 0; idx_field < nFields; idx_field++ ) {
+        // Prepare all Output Fields per group
+
+        // Different procedure, depending on the Group...
+        switch( _settings->GetHistoryOutput()[idx_field] ) {
+            case MASS: _historyOutputFieldNames[idx_field] = "Mass"; break;
+            case ITER: _historyOutputFieldNames[idx_field] = "Iter"; break;
+            case SIM_TIME: _historyOutputFieldNames[idx_field] = "Sim_time"; break;
+            case WALL_TIME: _historyOutputFieldNames[idx_field] = "Wall_time_[s]"; break;
+            case RMS_FLUX: _historyOutputFieldNames[idx_field] = "RMS_flux"; break;
+            case VTK_OUTPUT: _historyOutputFieldNames[idx_field] = "VTK_out"; break;
+            case CSV_OUTPUT: _historyOutputFieldNames[idx_field] = "CSV_out"; break;
+            case CUR_OUTFLOW: _historyOutputFieldNames[idx_field] = "Cur_outflow"; break;
+            case TOTAL_OUTFLOW: _historyOutputFieldNames[idx_field] = "Total_outflow"; break;
+            case CUR_OUTFLOW_P1: _historyOutputFieldNames[idx_field] = "Cur_outflow_P1"; break;
+            case TOTAL_OUTFLOW_P1: _historyOutputFieldNames[idx_field] = "Total_outflow_P1"; break;
+            case CUR_OUTFLOW_P2: _historyOutputFieldNames[idx_field] = "Cur_outflow_P2"; break;
+            case TOTAL_OUTFLOW_P2: _historyOutputFieldNames[idx_field] = "Total_outflow_P2"; break;
+            case MAX_OUTFLOW: _historyOutputFieldNames[idx_field] = "Max_outflow"; break;
+            case CUR_PARTICLE_ABSORPTION: _historyOutputFieldNames[idx_field] = "Cur_absorption"; break;
+            case TOTAL_PARTICLE_ABSORPTION: _historyOutputFieldNames[idx_field] = "Total_absorption"; break;
+            case MAX_PARTICLE_ABSORPTION: _historyOutputFieldNames[idx_field] = "Max_absorption"; break;
+            case TOTAL_PARTICLE_ABSORPTION_CENTER: _historyOutputFieldNames[idx_field] = "Cumulated_absorption_center"; break;
+            case TOTAL_PARTICLE_ABSORPTION_VERTICAL: _historyOutputFieldNames[idx_field] = "Cumulated_absorption_vertical_wall"; break;
+            case TOTAL_PARTICLE_ABSORPTION_HORIZONTAL: _historyOutputFieldNames[idx_field] = "Cumulated_absorption_horizontal_wall"; break;
+            case PROBE_MOMENT_TIME_TRACE:
+                if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) n_probes = 4;
+                // if( _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) n_probes = 2;
+                for( unsigned i = 0; i < n_probes; i++ ) {
+                    for( unsigned j = 0; j < 3; j++ ) {
+                        _historyOutputFieldNames[idx_field] = "Probe " + std::to_string( i ) + " u_" + std::to_string( j );
+                        idx_field++;
+                    }
+                }
+                idx_field--;
+                break;
+            case VAR_ABSORPTION_GREEN: _historyOutputFieldNames[idx_field] = "Var. absorption green"; break;
+            case ABSORPTION_GREEN_BLOCK:
+                for( unsigned i = 0; i < 44; i++ ) {
+                    _historyOutputFieldNames[idx_field] = "Probe Green Block " + std::to_string( i );
+                    idx_field++;
+                }
+                idx_field--;
+                break;
+
+            case ABSORPTION_GREEN_LINE:
+                for( unsigned i = 0; i < _settings->GetNumProbingCellsLineHohlraum(); i++ ) {
+                    _historyOutputFieldNames[idx_field] = "Probe Green Line " + std::to_string( i );
+                    idx_field++;
+                }
+                idx_field--;
+                break;
+            default: ErrorMessages::Error( "History output field not defined!", CURRENT_FUNCTION ); break;
+        }
+    }
+}
+
+void SNSolverHPCHIP::PrintHistoryOutput( unsigned idx_iter ) {
+
+    auto log = spdlog::get( "tabular" );
+
+    // assemble the line to print
+    std::string lineToPrint = "";
+    std::string tmp;
+    for( int idx_field = 0; idx_field < _settings->GetNHistoryOutput() - 1; idx_field++ ) {
+        if( idx_field == 0 ) {
+            tmp = std::to_string( _historyOutputFields[idx_field] );    // Iteration count
+        }
+        else {
+            tmp = TextProcessingToolbox::DoubleToScientificNotation( _historyOutputFields[idx_field] );
+        }
+        lineToPrint += tmp + ",";
+    }
+    tmp = TextProcessingToolbox::DoubleToScientificNotation( _historyOutputFields[_settings->GetNHistoryOutput() - 1] );
+    lineToPrint += tmp;    // Last element without comma
+    // std::cout << lineToPrint << std::endl;
+    if( _settings->GetHistoryOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetHistoryOutputFrequency() == 0 ) {
+        log->info( lineToPrint );
+    }
+    else if( idx_iter == _nIter - 1 ) {    // Always print last iteration
+        log->info( lineToPrint );
+    }
+}
+
+void SNSolverHPCHIP::DrawPreSolverOutput() {
+
+    // Logger
+    auto log    = spdlog::get( "event" );
+    auto logCSV = spdlog::get( "tabular" );
+
+    std::string hLine = "--";
+
+    unsigned strLen  = 15;    // max width of one column
+    char paddingChar = ' ';
+
+    // Assemble Header for Screen Output
+    std::string lineToPrint = "| ";
+    std::string tmpLine     = "-----------------";
+    for( unsigned idxFields = 0; idxFields < _settings->GetNScreenOutput(); idxFields++ ) {
+        std::string tmp = _screenOutputFieldNames[idxFields];
+
+        if( strLen > tmp.size() )    // Padding
+            tmp.insert( 0, strLen - tmp.size(), paddingChar );
+        else if( strLen < tmp.size() )    // Cutting
+            tmp.resize( strLen );
+
+        lineToPrint += tmp + " |";
+        hLine += tmpLine;
+    }
+    log->info( "---------------------------- Solver Starts -----------------------------" );
+    log->info( "| The simulation will run for {} iterations.", _nIter );
+    log->info( "| The spatial grid contains {} cells.", _nCells );
+    if( _settings->GetSolverName() != PN_SOLVER && _settings->GetSolverName() != CSD_PN_SOLVER ) {
+        log->info( "| The velocity grid contains {} points.", _nq );
+    }
+    log->info( hLine );
+    log->info( lineToPrint );
+    log->info( hLine );
+
+    std::string lineToPrintCSV = "";
+    for( int idxFields = 0; idxFields < _settings->GetNHistoryOutput() - 1; idxFields++ ) {
+        std::string tmp = _historyOutputFieldNames[idxFields];
+        lineToPrintCSV += tmp + ",";
+    }
+    lineToPrintCSV += _historyOutputFieldNames[_settings->GetNHistoryOutput() - 1];
+    logCSV->info( lineToPrintCSV );
+}
+
+void SNSolverHPCHIP::DrawPostSolverOutput() {
+
+    // Logger
+    auto log = spdlog::get( "event" );
+
+    std::string hLine = "--";
+
+    unsigned strLen  = 10;    // max width of one column
+    char paddingChar = ' ';
+
+    // Assemble Header for Screen Output
+    std::string lineToPrint = "| ";
+    std::string tmpLine     = "------------";
+    for( unsigned idxFields = 0; idxFields < _settings->GetNScreenOutput(); idxFields++ ) {
+        std::string tmp = _screenOutputFieldNames[idxFields];
+
+        if( strLen > tmp.size() )    // Padding
+            tmp.insert( 0, strLen - tmp.size(), paddingChar );
+        else if( strLen < tmp.size() )    // Cutting
+            tmp.resize( strLen );
+
+        lineToPrint += tmp + " |";
+        hLine += tmpLine;
+    }
+    log->info( hLine );
+#ifndef BUILD_TESTING
+    log->info( "| The volume output files have been stored at " + _settings->GetOutputFile() );
+    log->info( "| The log files have been stored at " + _settings->GetLogDir() + _settings->GetLogFile() );
+#endif
+    log->info( "--------------------------- Solver Finished ----------------------------" );
+}
+
+unsigned long SNSolverHPCHIP::Idx2D( unsigned long idx1, unsigned long idx2, unsigned long len2 ) { return idx1 * len2 + idx2; }
+
+unsigned long SNSolverHPCHIP::Idx3D( unsigned long idx1, unsigned long idx2, unsigned long idx3, unsigned long len2, unsigned long len3 ) {
+    return ( idx1 * len2 + idx2 ) * len3 + idx3;
+}
+
+void SNSolverHPCHIP::WriteVolumeOutput( unsigned idx_iter ) {
+    unsigned nGroups = (unsigned)_settings->GetNVolumeOutput();
+    if( ( _settings->GetVolumeOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetVolumeOutputFrequency() == 0 ) ||
+        ( idx_iter == _nIter - 1 ) /* need sol at last iteration */ ) {
+        for( unsigned idx_group = 0; idx_group < nGroups; idx_group++ ) {
+            switch( _settings->GetVolumeOutput()[idx_group] ) {
+                case MINIMAL:
+                    if( _rank == 0 ) {
+                        _outputFields[idx_group][0] = _scalarFlux;
+                    }
+                    break;
+
+                case MOMENTS:
+#pragma omp parallel for
+                    for( unsigned idx_cell = 0; idx_cell < _nCells; ++idx_cell ) {
+                        _outputFields[idx_group][0][idx_cell] = 0.0;
+                        _outputFields[idx_group][1][idx_cell] = 0.0;
+                        for( unsigned idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) {
+                            _outputFields[idx_group][0][idx_cell] +=
+                                _quadPts[Idx2D( idx_sys, 0, _nDim )] * _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys];
+                            _outputFields[idx_group][1][idx_cell] +=
+                                _quadPts[Idx2D( idx_sys, 1, _nDim )] * _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys];
+                        }
+                    }
+#ifdef IMPORT_MPI
+                    MPI_Barrier( MPI_COMM_WORLD );
+                    MPI_Allreduce(
+                        _outputFields[idx_group][0].data(), _outputFields[idx_group][0].data(), _nCells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+                    MPI_Allreduce(
+                        _outputFields[idx_group][1].data(), _outputFields[idx_group][1].data(), _nCells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+                    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+                    break;
+                default: ErrorMessages::Error( "Volume Output Group not defined for HPC SN Solver!", CURRENT_FUNCTION ); break;
+            }
+        }
+    }
+}
+
+void SNSolverHPCHIP::PrintVolumeOutput( int idx_iter ) {
+    if( _settings->GetSaveRestartSolutionFrequency() != 0 && idx_iter % (int)_settings->GetSaveRestartSolutionFrequency() == 0 ) {
+        // std::cout << "Saving restart solution at iteration " << idx_iter << std::endl;
+        WriteRestartSolution( _settings->GetOutputFile(),
+                              _sol,
+                              _scalarFlux,
+                              _rank,
+                              idx_iter,
+                              _totalAbsorptionHohlraumCenter,
+                              _totalAbsorptionHohlraumVertical,
+                              _totalAbsorptionHohlraumHorizontal,
+                              _totalAbsorptionLattice );
+    }
+
+    if( _settings->GetVolumeOutputFrequency() != 0 && idx_iter % (int)_settings->GetVolumeOutputFrequency() == 0 ) {
+        WriteVolumeOutput( idx_iter );
+        if( _rank == 0 ) {
+            ExportVTK( _settings->GetOutputFile() + "_" + std::to_string( idx_iter ), _outputFields, _outputFieldNames, _mesh );    // slow
+        }
+    }
+    if( idx_iter == (int)_nIter - 1 ) {    // Last iteration write without suffix.
+        WriteVolumeOutput( idx_iter );
+        if( _rank == 0 ) {
+            ExportVTK( _settings->GetOutputFile(), _outputFields, _outputFieldNames, _mesh );
+        }
+    }
+}
+
+void SNSolverHPCHIP::PrepareVolumeOutput() {
+    unsigned nGroups = (unsigned)_settings->GetNVolumeOutput();
+
+    _outputFieldNames.resize( nGroups );
+    _outputFields.resize( nGroups );
+
+    // Prepare all OutputGroups ==> Specified in option VOLUME_OUTPUT
+    for( unsigned idx_group = 0; idx_group < nGroups; idx_group++ ) {
+        // Prepare all Output Fields per group
+
+        // Different procedure, depending on the Group...
+        switch( _settings->GetVolumeOutput()[idx_group] ) {
+            case MINIMAL:
+                // Currently only one entry ==> rad flux
+                _outputFields[idx_group].resize( 1 );
+                _outputFieldNames[idx_group].resize( 1 );
+
+                _outputFields[idx_group][0].resize( _nCells );
+                _outputFieldNames[idx_group][0] = "scalar flux";
+                break;
+            case MOMENTS:
+                // As many entries as there are moments in the system
+                _outputFields[idx_group].resize( _nOutputMoments );
+                _outputFieldNames[idx_group].resize( _nOutputMoments );
+
+                for( unsigned idx_moment = 0; idx_moment < _nOutputMoments; idx_moment++ ) {
+                    _outputFieldNames[idx_group][idx_moment] = std::string( "u_" + std::to_string( idx_moment ) );
+                    _outputFields[idx_group][idx_moment].resize( _nCells );
+                }
+                break;
+
+            default: ErrorMessages::Error( "Volume Output Group not defined for HPC SN Solver!", CURRENT_FUNCTION ); break;
+        }
+    }
+}
+
+void SNSolverHPCHIP::SetGhostCells() {
+    if( _settings->GetProblemName() == PROBLEM_Lattice ) {
+        // #pragma omp parallel for
+        for( unsigned idx_cell = 0; idx_cell < _nCells; idx_cell++ ) {
+            if( _cellBoundaryTypes[idx_cell] == BOUNDARY_TYPE::NEUMANN || _cellBoundaryTypes[idx_cell] == BOUNDARY_TYPE::DIRICHLET ) {
+                _ghostCells[idx_cell] = std::vector<double>( _localNSys, 0.0 );
+            }
+        }
+    }
+    else if( _settings->GetProblemName() == PROBLEM_HalfLattice ) {    // HALF LATTICE NOT WORKING
+        ErrorMessages::Error( "Test case does not work with MPI", CURRENT_FUNCTION );
+    }
+    else if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) {
+
+        auto nodes = _mesh->GetNodes();
+        double tol = 1e-12;    // For distance to boundary
+
+        // #pragma omp parallel for
+        for( unsigned idx_cell = 0; idx_cell < _nCells; idx_cell++ ) {
+
+            if( _cellBoundaryTypes[idx_cell] == BOUNDARY_TYPE::NEUMANN || _cellBoundaryTypes[idx_cell] == BOUNDARY_TYPE::DIRICHLET ) {
+                _ghostCells[idx_cell] = std::vector<double>( _localNSys, 0.0 );
+
+                auto localCellNodes = _mesh->GetCells()[idx_cell];
+
+                for( unsigned idx_node = 0; idx_node < _mesh->GetNumNodesPerCell(); idx_node++ ) {    // Check if corner node is in this cell
+                    if( nodes[localCellNodes[idx_node]][0] < -0.65 + tol ) {                          // close to 0 => left boundary
+                        for( unsigned idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) {
+                            if( _quadPts[Idx2D( idx_sys, 0, _nDim )] > 0.0 ) _ghostCells[idx_cell][idx_sys] = 1.0;
+                        }
+                        break;
+                    }
+                    else if( nodes[localCellNodes[idx_node]][0] > 0.65 - tol ) {    // right boundary
+                        for( unsigned idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) {
+                            if( _quadPts[Idx2D( idx_sys, 0, _nDim )] < 0.0 ) _ghostCells[idx_cell][idx_sys] = 1.0;
+                        }
+                        break;
+                    }
+                    else if( nodes[localCellNodes[idx_node]][1] < -0.65 + tol ) {    // lower boundary
+                        break;
+                    }
+                    else if( nodes[localCellNodes[idx_node]][1] > 0.65 - tol ) {    // upper boundary
+                        break;
+                    }
+                    else if( idx_node == _mesh->GetNumNodesPerCell() - 1 ) {
+                        ErrorMessages::Error( " Problem with ghost cell setup and  boundary of this mesh ", CURRENT_FUNCTION );
+                    }
+                }
+            }
+        }
+    }
+    else if( _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) {
+        ErrorMessages::Error( "Test case does not work with MPI", CURRENT_FUNCTION );
+    }
+}
+
+void SNSolverHPCHIP::SetProbingCellsLineGreen() {
+
+    if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) {
+        assert( _nProbingCellsLineGreen % 2 == 0 );
+
+        std::vector<double> p1 = { _cornerUpperLeftGreen[0] + _thicknessGreen / 2.0, _cornerUpperLeftGreen[1] - _thicknessGreen / 2.0 };
+        std::vector<double> p2 = { _cornerUpperRightGreen[0] - _thicknessGreen / 2.0, _cornerUpperRightGreen[1] - _thicknessGreen / 2.0 };
+        std::vector<double> p3 = { _cornerLowerRightGreen[0] - _thicknessGreen / 2.0, _cornerLowerRightGreen[1] + _thicknessGreen / 2.0 };
+        std::vector<double> p4 = { _cornerLowerLeftGreen[0] + _thicknessGreen / 2.0, _cornerLowerLeftGreen[1] + _thicknessGreen / 2.0 };
+
+        double verticalLineWidth   = std::abs( p1[1] - p4[1] );
+        double horizontalLineWidth = std::abs( p1[0] - p2[0] );
+
+        double pt_ratio_h = horizontalLineWidth / ( horizontalLineWidth + verticalLineWidth );
+
+        unsigned nHorizontalProbingCells = (unsigned)std::ceil( _nProbingCellsLineGreen / 2 * pt_ratio_h );
+        unsigned nVerticalProbingCells   = _nProbingCellsLineGreen / 2 - nHorizontalProbingCells;
+        assert( nHorizontalProbingCells > 1 );
+        assert( nVerticalProbingCells > 1 );
+
+        _probingCellsLineGreen = std::vector<unsigned>( 2 * ( nVerticalProbingCells + nHorizontalProbingCells ) );
+
+        // Sample points on each side of the rectangle
+        std::vector<unsigned> side1 = linspace2D( p1, p2, nHorizontalProbingCells );    // upper horizontal
+        std::vector<unsigned> side2 = linspace2D( p2, p3, nVerticalProbingCells );      // right vertical
+        std::vector<unsigned> side3 = linspace2D( p3, p4, nHorizontalProbingCells );    // lower horizontal
+        std::vector<unsigned> side4 = linspace2D( p4, p1, nVerticalProbingCells );      // left vertical
+
+        for( unsigned i = 0; i < nHorizontalProbingCells; ++i ) {
+            _probingCellsLineGreen[i] = side1[i];
+        }
+        for( unsigned i = 0; i < nVerticalProbingCells; ++i ) {
+            _probingCellsLineGreen[i + nHorizontalProbingCells] = side2[i];
+        }
+        for( unsigned i = 0; i < nHorizontalProbingCells; ++i ) {
+            _probingCellsLineGreen[i + nVerticalProbingCells + nHorizontalProbingCells] = side3[i];
+        }
+        for( unsigned i = 0; i < nVerticalProbingCells; ++i ) {
+            _probingCellsLineGreen[i + nVerticalProbingCells + 2 * nHorizontalProbingCells] = side4[i];
+        }
+
+        // Block-wise approach
+        // Initialize the vector to store the corner points of each block
+        std::vector<std::vector<std::vector<double>>> block_corners;
+
+        double block_size = 0.05;
+        const double cx   = _centerGreen[0];
+        const double cy   = _centerGreen[1];
+
+        // Loop to fill the blocks
+        for( int i = 0; i < 8; ++i ) {    // 8 blocks in the x-direction (horizontal) (upper) (left to right)
+
+            // Top row
+            double x1 = -0.2 + cx + i * block_size;
+            double y1 = 0.4 + cy;
+            double x2 = x1 + block_size;
+            double y2 = y1 - block_size;
+
+            std::vector<std::vector<double>> corners = {
+                { x1, y1 },    // top-left
+                { x2, y1 },    // top-right
+                { x2, y2 },    // bottom-right
+                { x1, y2 }     // bottom-left
+            };
+            block_corners.push_back( corners );
+        }
+
+        for( int j = 0; j < 14; ++j ) {    // 14 blocks in the y-direction (vertical)
+            // right column double x1 = 0.15;
+            double x1 = 0.15 + cx;
+            double y1 = 0.35 + cy - j * block_size;
+            double x2 = x1 + block_size;
+            double y2 = y1 - block_size;
+
+            // Store the four corner points for this block
+            std::vector<std::vector<double>> corners = {
+                { x1, y1 },    // top-left
+                { x2, y1 },    // top-right
+                { x2, y2 },    // bottom-right
+                { x1, y2 }     // bottom-left
+            };
+
+            block_corners.push_back( corners );
+        }
+
+        for( int i = 0; i < 8; ++i ) {    // 8 blocks in the x-direction (horizontal) (lower) (right to left)
+            // bottom row
+            double x1 = 0.15 + cx - i * block_size;
+            double y1 = -0.35 + cy;
+            double x2 = x1 + block_size;
+            double y2 = y1 - block_size;
+
+            std::vector<std::vector<double>> corners = {
+                { x1, y1 },    // top-left
+                { x2, y1 },    // top-right
+                { x2, y2 },    // bottom-right
+                { x1, y2 }     // bottom-left
+            };
+            block_corners.push_back( corners );
+        }
+
+        for( int j = 0; j < 14; ++j ) {    // 14 blocks in the y-direction (vertical) (down to up)
+
+            // left column
+            double x1 = -0.2 + cx;
+            double y1 = -0.3 + cy + j * block_size;
+            double x2 = x1 + block_size;
+            double y2 = y1 - block_size;
+
+            // Store the four corner points for this block
+            std::vector<std::vector<double>> corners = {
+                { x1, y1 },    // top-left
+                { x2, y1 },    // top-right
+                { x2, y2 },    // bottom-right
+                { x1, y2 }     // bottom-left
+            };
+
+            block_corners.push_back( corners );
+        }
+
+        // Compute the probing cells for each block
+        for( unsigned i = 0; i < _nProbingCellsBlocksGreen; i++ ) {
+            _probingCellsBlocksGreen.push_back( _mesh->GetCellsofRectangle( block_corners[i] ) );
+        }
+    }
+}
+
+void SNSolverHPCHIP::ComputeQOIsGreenProbingLine() {
+#pragma omp parallel for
+    for( unsigned i = 0; i < _nProbingCellsLineGreen; i++ ) {    // Loop over probing cells
+        _absorptionValsLineSegment[i] =
+            ( _sigmaT[_probingCellsLineGreen[i]] - _sigmaS[_probingCellsLineGreen[i]] ) * _scalarFlux[_probingCellsLineGreen[i]];
+    }
+
+    // Block-wise approach
+    // #pragma omp parallel for
+    for( unsigned i = 0; i < _nProbingCellsBlocksGreen; i++ ) {
+        _absorptionValsBlocksGreen[i] = 0.0;
+        for( unsigned j = 0; j < _probingCellsBlocksGreen[i].size(); j++ ) {
+            _absorptionValsBlocksGreen[i] += ( _sigmaT[_probingCellsBlocksGreen[i][j]] - _sigmaS[_probingCellsBlocksGreen[i][j]] ) *
+                                             _scalarFlux[_probingCellsBlocksGreen[i][j]] * _areas[_probingCellsBlocksGreen[i][j]];
+        }
+    }
+    // std::cout << _absorptionValsBlocksGreen[1] << std::endl;
+    // std::cout << _absorptionValsLineSegment[1] << std::endl;
+}
+
+std::vector<unsigned> SNSolverHPCHIP::linspace2D( const std::vector<double>& start, const std::vector<double>& end, unsigned num_points ) {
+    /**
+     * Generate a 2D linspace based on the start and end points with a specified number of points.
+     *
+     * @param start vector of starting x and y coordinates
+     * @param end vector of ending x and y coordinates
+     * @param num_points number of points to generate
+     *
+     * @return vector of unsigned integers representing the result
+     */
+
+    std::vector<unsigned> result;
+    assert( num_points > 1 );
+    result.resize( num_points );
+    double stepX = ( end[0] - start[0] ) / ( num_points - 1 );
+    double stepY = ( end[1] - start[1] ) / ( num_points - 1 );
+#pragma omp parallel for
+    for( unsigned i = 0; i < num_points; ++i ) {
+        double x = start[0] + i * stepX;
+        double y = start[1] + i * stepY;
+
+        result[i] = _mesh->GetCellOfKoordinate( x, y );
+    }
+
+    return result;
+}
+
+void SNSolverHPCHIP::ComputeCellsPerimeterLattice() {
+    double l_1    = 1.5;    // perimeter 1
+    double l_2    = 2.5;    // perimeter 2
+    auto nodes    = _mesh->GetNodes();
+    auto cells    = _mesh->GetCells();
+    auto cellMids = _mesh->GetCellMidPoints();
+    auto normals  = _mesh->GetNormals();
+    auto neigbors = _mesh->GetNeighbours();
+
+    _isPerimeterLatticeCell1.resize( _mesh->GetNumCells(), false );
+    _isPerimeterLatticeCell2.resize( _mesh->GetNumCells(), false );
+
+    for( unsigned idx_cell = 0; idx_cell < _mesh->GetNumCells(); ++idx_cell ) {
+        if( abs( cellMids[idx_cell][0] ) < l_1 && abs( cellMids[idx_cell][1] ) < l_1 ) {
+            // Cell is within perimeter
+            for( unsigned idx_nbr = 0; idx_nbr < _mesh->GetNumNodesPerCell(); ++idx_nbr ) {
+                if( neigbors[idx_cell][idx_nbr] == _mesh->GetNumCells() ) {
+                    continue;    // Skip boundary - ghost cells
+                }
+
+                if( ( abs( cellMids[neigbors[idx_cell][idx_nbr]][0] ) > l_1 && abs( cellMids[idx_cell][0] ) < l_1 ) ||
+                    ( abs( cellMids[neigbors[idx_cell][idx_nbr]][1] ) > l_1 && abs( cellMids[idx_cell][1] ) < l_1 ) ) {
+                    // neighbor is outside perimeter
+                    _cellsLatticePerimeter1[idx_cell].push_back( idx_nbr );
+                    _isPerimeterLatticeCell1[idx_cell] = true;
+                }
+            }
+        }
+        else if( abs( cellMids[idx_cell][0] ) < l_2 && abs( cellMids[idx_cell][1] ) < l_2 ) {
+            // Cell is within perimeter
+            for( unsigned idx_nbr = 0; idx_nbr < _mesh->GetNumNodesPerCell(); ++idx_nbr ) {
+                if( neigbors[idx_cell][idx_nbr] == _mesh->GetNumCells() ) {
+                    continue;    // Skip boundary - ghost cells
+                }
+                if( ( abs( cellMids[neigbors[idx_cell][idx_nbr]][0] ) > l_2 && abs( cellMids[idx_cell][0] ) < l_2 ) ||
+                    ( abs( cellMids[neigbors[idx_cell][idx_nbr]][1] ) > l_2 && abs( cellMids[idx_cell][1] ) < l_2 ) ) {
+                    // neighbor is outside perimeter
+                    _cellsLatticePerimeter2[idx_cell].push_back( idx_nbr );
+                    _isPerimeterLatticeCell2[idx_cell] = true;
+                }
+            }
+        }
+    }
+}
diff --git a/tests/catch2_main.cpp b/tests/catch2_main.cpp
index 72bfe4a4..82f7b05b 100644
--- a/tests/catch2_main.cpp
+++ b/tests/catch2_main.cpp
@@ -4,9 +4,18 @@
 // #include <Python.h>
 // #define PY_ARRAY_UNIQUE_SYMBOL KITRT_ARRAY_API
 #include <filesystem>
+#ifdef IMPORT_MPI
+#include <mpi.h>
+#endif
 
 int main( int argc, char** argv ) {
-    // MPI_Init( &argc, &argv );
+#ifdef IMPORT_MPI
+    int mpiInitialized = 0;
+    MPI_Initialized( &mpiInitialized );
+    if( !mpiInitialized ) {
+        MPI_Init( &argc, &argv );
+    }
+#endif
 
     // wchar_t* program = Py_DecodeLocale( argv[0], NULL );
     // Py_SetProgramName( program );
@@ -15,6 +24,12 @@ int main( int argc, char** argv ) {
 
     std::filesystem::remove_all( std::string( TESTS_PATH ) + "result" );
 
-    // MPI_Finalize();
+#ifdef IMPORT_MPI
+    int mpiFinalized = 0;
+    MPI_Finalized( &mpiFinalized );
+    if( !mpiFinalized ) {
+        MPI_Finalize();
+    }
+#endif
     return result;
 }
diff --git a/tests/test_cases.cpp b/tests/test_cases.cpp
index 383ffa8f..9eae1428 100644
--- a/tests/test_cases.cpp
+++ b/tests/test_cases.cpp
@@ -16,6 +16,9 @@
 #ifdef KITRT_ENABLE_CUDA_HPC
 #include "solvers/snsolver_hpc_cuda.hpp"
 #endif
+#ifdef KITRT_ENABLE_HIP_HPC
+#include "solvers/snsolver_hpc_hip.hpp"
+#endif
 #include "solvers/solverbase.hpp"
 
 using vtkUnstructuredGridReaderSP = vtkSmartPointer<vtkUnstructuredGridReader>;
@@ -642,6 +645,69 @@ TEST_CASE( "SN_SOLVER_HPC_CUDA_QOI_VALIDATION", "[validation_tests][hpc][cuda]"
 }
 #endif
 
+#ifdef KITRT_ENABLE_HIP_HPC
+TEST_CASE( "SN_SOLVER_HPC_HIP_QOI_VALIDATION", "[validation_tests][hpc][hip]" ) {
+    std::string hpcFileDir = "input/validation_tests/SN_solver_hpc/";
+    std::string resultDir  = std::string( TESTS_PATH ) + "result";
+    std::string logDir     = std::string( TESTS_PATH ) + "result/logs";
+
+    auto runCpuAndCollectCSV = [&]( const std::string& configFileName, const std::string& prefix ) {
+        Config* config      = new Config( configFileName );
+        SNSolverHPC* solver = new SNSolverHPC( config );
+        solver->Solve();
+        requireAndFlushTestLoggers();
+        delete solver;
+        delete config;
+        return findNewestCSVFile( logDir, prefix );
+    };
+
+    auto runHipAndCollectCSV = [&]( const std::string& configFileName, const std::string& prefix ) {
+        Config* config         = new Config( configFileName );
+        SNSolverHPCHIP* solver = new SNSolverHPCHIP( config );
+        solver->Solve();
+        requireAndFlushTestLoggers();
+        delete solver;
+        delete config;
+        return findNewestCSVFile( logDir, prefix );
+    };
+
+    SECTION( "lattice_order1_cpu_vs_hip_all_qois" ) {
+        std::filesystem::remove_all( resultDir );
+        resetTestLoggers();
+        const std::string cpuConfig = std::string( TESTS_PATH ) + hpcFileDir + "lattice_hpc_200_cpu_order1.cfg";
+        const std::string cpuCSV    = runCpuAndCollectCSV( cpuConfig, "lattice_hpc_200_cpu_order1_output" );
+        REQUIRE( !cpuCSV.empty() );
+
+        resetTestLoggers();
+        const std::string hipConfig = std::string( TESTS_PATH ) + hpcFileDir + "lattice_hpc_200_cuda_order1.cfg";
+        const std::string hipCSV    = runHipAndCollectCSV( hipConfig, "lattice_hpc_200_cuda_order1_output" );
+        REQUIRE( !hipCSV.empty() );
+
+        INFO( "CPU CSV: " << cpuCSV );
+        INFO( "HIP CSV: " << hipCSV );
+        compareHistoryCSV( cpuCSV, hipCSV );
+    }
+
+    SECTION( "lattice_order2_cpu_vs_hip_all_qois" ) {
+        std::filesystem::remove_all( resultDir );
+        resetTestLoggers();
+        const std::string cpuConfig = std::string( TESTS_PATH ) + hpcFileDir + "lattice_hpc_200_cpu_order2.cfg";
+        const std::string cpuCSV    = runCpuAndCollectCSV( cpuConfig, "lattice_hpc_200_cpu_order2_output" );
+        REQUIRE( !cpuCSV.empty() );
+
+        resetTestLoggers();
+        const std::string hipConfig = std::string( TESTS_PATH ) + hpcFileDir + "lattice_hpc_200_cuda_order2.cfg";
+        const std::string hipCSV    = runHipAndCollectCSV( hipConfig, "lattice_hpc_200_cuda_order2_output" );
+        REQUIRE( !hipCSV.empty() );
+
+        INFO( "CPU CSV: " << cpuCSV );
+        INFO( "HIP CSV: " << hipCSV );
+        compareHistoryCSV( cpuCSV, hipCSV );
+    }
+
+}
+#endif
+
 TEST_CASE( "screen_output", "[output]" ) {
     std::string out_fileDir = "input/validation_tests/output/";
     resetTestLoggers();    // Make sure to write in own logging file
diff --git a/tools/singularity/build_container.sh b/tools/singularity/build_container.sh
index 1975c056..a57c0c5b 100755
--- a/tools/singularity/build_container.sh
+++ b/tools/singularity/build_container.sh
@@ -10,12 +10,16 @@ case "${mode}" in
     cuda)
         singularity build kit_rt_MPI_cuda.sif kit_rt_MPI_cuda.def
         ;;
+    rocm)
+        singularity build kit_rt_MPI_rocm72.sif kit_rt_MPI_rocm72.def
+        ;;
     all)
         singularity build kit_rt.sif kit_rt.def
         singularity build kit_rt_MPI_cuda.sif kit_rt_MPI_cuda.def
+        singularity build kit_rt_MPI_rocm72.sif kit_rt_MPI_rocm72.def
         ;;
     *)
-        echo "Usage: $0 [cpu|cuda|all]" >&2
+        echo "Usage: $0 [cpu|cuda|rocm|all]" >&2
         exit 1
         ;;
 esac
diff --git a/tools/singularity/install_kitrt_singularity_cuda.sh b/tools/singularity/install_kitrt_singularity_cuda.sh
index a59653d2..6d613888 100755
--- a/tools/singularity/install_kitrt_singularity_cuda.sh
+++ b/tools/singularity/install_kitrt_singularity_cuda.sh
@@ -4,5 +4,5 @@ set -euo pipefail
 cd ../../
 mkdir -p build_singularity_cuda
 cd build_singularity_cuda
-cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF ..
+cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=ON -DBUILD_HIP_HPC=OFF -DBUILD_ML=OFF ..
 make -j
diff --git a/tools/singularity/install_kitrt_singularity_rocm.sh b/tools/singularity/install_kitrt_singularity_rocm.sh
new file mode 100755
index 00000000..b1f8cf9f
--- /dev/null
+++ b/tools/singularity/install_kitrt_singularity_rocm.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+cd ../../
+ROCM_CLANGXX="${ROCM_CLANGXX:-/opt/rocm-7.2.0/lib/llvm/bin/clang++}"
+if [[ ! -x "${ROCM_CLANGXX}" && -x "/opt/rocm/lib/llvm/bin/clang++" ]]; then
+    ROCM_CLANGXX="/opt/rocm/lib/llvm/bin/clang++"
+fi
+
+if [[ ! -x "${ROCM_CLANGXX}" ]]; then
+    echo "Could not find ROCm clang++ compiler. Set ROCM_CLANGXX to a valid path." >&2
+    exit 1
+fi
+
+mkdir -p build_singularity_rocm72
+cd build_singularity_rocm72
+if [[ -f CMakeCache.txt ]]; then
+    CACHED_CXX="$(grep '^CMAKE_CXX_COMPILER:FILEPATH=' CMakeCache.txt | cut -d= -f2- || true)"
+    if [[ -n "${CACHED_CXX}" && "${CACHED_CXX}" != "${ROCM_CLANGXX}" ]]; then
+        echo "Resetting CMake cache because compiler changed: ${CACHED_CXX} -> ${ROCM_CLANGXX}"
+        rm -rf CMakeCache.txt CMakeFiles
+    fi
+fi
+
+cmake -DCMAKE_BUILD_TYPE=Release \
+      -DCMAKE_CXX_COMPILER="${ROCM_CLANGXX}" \
+      -DCMAKE_HIP_COMPILER="${ROCM_CLANGXX}" \
+      -DBUILD_MPI=ON \
+      -DBUILD_CUDA_HPC=OFF \
+      -DBUILD_HIP_HPC=ON \
+      -DBUILD_ML=OFF ..
+make -j
diff --git a/tools/singularity/kit_rt_MPI_rocm72.def b/tools/singularity/kit_rt_MPI_rocm72.def
new file mode 100644
index 00000000..96e5fa88
--- /dev/null
+++ b/tools/singularity/kit_rt_MPI_rocm72.def
@@ -0,0 +1,75 @@
+Bootstrap: docker
+From: rocm/dev-ubuntu-22.04:7.2-complete
+
+%post
+    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib:/opt/rocm/lib:/opt/rocm/lib64"
+    export PYTHONPATH=/usr/local/gmsh/lib:$PYTHONPATH
+    export PATH=/usr/local/gmsh/bin:/opt/rocm/bin:$PATH
+
+    apt-get update
+    DEBIAN_FRONTEND=noninteractive apt-get install -qq \
+       gcc \
+       g++ \
+       libopenmpi-dev \
+       openmpi-bin \
+       libblas-dev \
+       liblapack-dev \
+       git \
+       make \
+       ninja-build \
+       cmake \
+       wget \
+       ssh \
+       libssl-dev \
+       libxt-dev \
+       libgl1-mesa-dev \
+       libglu1 \
+       libxrender1 \
+       libxcursor-dev \
+       libxft-dev \
+       libxinerama-dev \
+       python3 \
+       python3-pip \
+       doxygen
+
+    apt-get clean
+    apt-get autoremove --purge
+    rm -rf /var/lib/apt/lists/*
+
+    cd /usr/local
+    wget -nc --quiet http://gmsh.info/bin/Linux/gmsh-4.7.0-Linux64-sdk.tgz
+    tar xzf gmsh-4.7.0-Linux64-sdk.tgz
+    mv gmsh-4.7.0-Linux64-sdk gmsh
+    rm gmsh-4.7.0-Linux64-sdk.tgz
+
+    wget -nc --no-check-certificate --quiet https://www.vtk.org/files/release/9.1/VTK-9.1.0.tar.gz
+    tar xzf VTK-9.1.0.tar.gz
+    mkdir VTK-9.1.0/build
+    cd VTK-9.1.0/build
+    cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DBUILD_DOCUMENTATION=OFF -DBUILD_TESTING=OFF ../
+    ninja
+    ninja install > /dev/null
+    cd -
+    rm -rf VTK-*
+
+    pip3 install numpy pygmsh==6.1.1 Pillow pydicom gcovr sphinx_rtd_theme breathe cpp-coveralls coveralls pandas
+
+%environment
+    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib:/opt/rocm/lib:/opt/rocm/lib64"
+    export PYTHONPATH=/usr/local/gmsh/lib:$PYTHONPATH
+    export PATH=/usr/local/gmsh/bin:/opt/rocm/bin:$PATH
+
+%help
+    KiT-RT ROCm 7.2 MPI container.
+
+    Example build:
+      cmake -S . -B build_singularity_rocm72 -DCMAKE_BUILD_TYPE=Release \
+        -DBUILD_MPI=ON -DBUILD_HIP_HPC=ON -DBUILD_ML=OFF
+      cmake --build build_singularity_rocm72 -j
+
+    Example run:
+      mpirun -np 2 ./build_singularity_rocm72/KiT-RT \
+        tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg
+
+%runscript
+    exec /bin/bash
diff --git a/tools/singularity/singularity_run_interactive_rocm.sh b/tools/singularity/singularity_run_interactive_rocm.sh
new file mode 100644
index 00000000..02b611d8
--- /dev/null
+++ b/tools/singularity/singularity_run_interactive_rocm.sh
@@ -0,0 +1 @@
+singularity shell --nv  --bind $(pwd)/../..:/mnt kit_rt_MPI_rocm72.sif