diff --git a/.github/workflows/frontier_amd/bench.sh b/.github/workflows/frontier_amd/bench.sh
deleted file mode 120000
index 2ac24c7604..0000000000
--- a/.github/workflows/frontier_amd/bench.sh
+++ /dev/null
@@ -1 +0,0 @@
-../frontier/bench.sh
\ No newline at end of file
diff --git a/.github/workflows/frontier_amd/bench.sh b/.github/workflows/frontier_amd/bench.sh
new file mode 100644
index 0000000000..2ac24c7604
--- /dev/null
+++ b/.github/workflows/frontier_amd/bench.sh
@@ -0,0 +1 @@
+../frontier/bench.sh
\ No newline at end of file
diff --git a/.github/workflows/frontier_amd/build.sh b/.github/workflows/frontier_amd/build.sh
deleted file mode 120000
index 40fec10411..0000000000
--- a/.github/workflows/frontier_amd/build.sh
+++ /dev/null
@@ -1 +0,0 @@
-../frontier/build.sh
\ No newline at end of file
diff --git a/.github/workflows/frontier_amd/build.sh b/.github/workflows/frontier_amd/build.sh
new file mode 100644
index 0000000000..40fec10411
--- /dev/null
+++ b/.github/workflows/frontier_amd/build.sh
@@ -0,0 +1 @@
+../frontier/build.sh
\ No newline at end of file
diff --git a/.github/workflows/frontier_amd/submit.sh b/.github/workflows/frontier_amd/submit.sh
deleted file mode 120000
index 11890c4fcd..0000000000
--- a/.github/workflows/frontier_amd/submit.sh
+++ /dev/null
@@ -1 +0,0 @@
-../frontier/submit.sh
\ No newline at end of file
diff --git a/.github/workflows/frontier_amd/submit.sh b/.github/workflows/frontier_amd/submit.sh
new file mode 100644
index 0000000000..11890c4fcd
--- /dev/null
+++ b/.github/workflows/frontier_amd/submit.sh
@@ -0,0 +1 @@
+../frontier/submit.sh
\ No newline at end of file
diff --git a/.github/workflows/frontier_amd/test.sh b/.github/workflows/frontier_amd/test.sh
deleted file mode 120000
index 8878e823b2..0000000000
--- a/.github/workflows/frontier_amd/test.sh
+++ /dev/null
@@ -1 +0,0 @@
-../frontier/test.sh
\ No newline at end of file
diff --git a/.github/workflows/frontier_amd/test.sh b/.github/workflows/frontier_amd/test.sh
new file mode 100644
index 0000000000..8878e823b2
--- /dev/null
+++ b/.github/workflows/frontier_amd/test.sh
@@ -0,0 +1 @@
+../frontier/test.sh
\ No newline at end of file
diff --git a/examples/2D_lid_driven_cavity_nn/Lid_driven_cavity_Re_100_n_0.5.png b/examples/2D_lid_driven_cavity_nn/Lid_driven_cavity_Re_100_n_0.5.png
new file mode 100644
index 0000000000..b589c6bdd8
Binary files /dev/null and b/examples/2D_lid_driven_cavity_nn/Lid_driven_cavity_Re_100_n_0.5.png differ
diff --git a/examples/2D_lid_driven_cavity_nn/Lid_driven_cavity_Re_100_n_1.5.png b/examples/2D_lid_driven_cavity_nn/Lid_driven_cavity_Re_100_n_1.5.png
new file mode 100644
index 0000000000..b6855d823c
Binary files /dev/null and b/examples/2D_lid_driven_cavity_nn/Lid_driven_cavity_Re_100_n_1.5.png differ
diff --git a/examples/2D_lid_driven_cavity_nn/case.py b/examples/2D_lid_driven_cavity_nn/case.py
new file mode 100644
index 0000000000..cef8312ef1
--- /dev/null
+++ b/examples/2D_lid_driven_cavity_nn/case.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+"""
+2D Lid-Driven Cavity Flow with Herschel-Bulkley Non-Newtonian Fluid
+
+Re = 5000, n = 1.5 (shear-thickening) with mesh stretching near walls.
+
+HB model: mu = (tau0/gdot)*(1 - exp(-m*gdot)) + K * gdot^(n-1)
+Re_gen = rho * U^(2-n) * L^n / K = 1 * 1^0.5 * 1^1.5 / 0.0002 = 5000
+
+Mesh stretching: cosh-based clustering near all 4 walls (x_a, x_b, y_a, y_b).
+"""
+import json
+
+eps = 1e-6
+
+# HB model parameters
+tau0 = 0.0          # Yield stress (set to 0 for power-law fluid)
+K = 0.0002          # Consistency index (Re=5000: K = 1/5000)
+nn = 1.5            # Flow behavior index (shear-thickening)
+mu_min = 0.00002    # K * gdot_min^(n-1) = 0.0002 * (0.01)^0.5
+mu_max = 0.0632     # K * gdot_max^(n-1) = 0.0002 * (1e5)^0.5
+hb_m = 1000.0       # Papanastasiou regularization parameter
+mu_bulk = 0.0
+
+lid_velocity = 1.0  # Lid velocity (m/s)
+
+# Configuring case dictionary
+print(
+    json.dumps(
+        {
+            # Logistics
+            "run_time_info": "T",
+            # Computational Domain Parameters
+            "x_domain%beg": 0.0,
+            "x_domain%end": 1.0,
+            "y_domain%beg": 0.0,
+            "y_domain%end": 1.0,
+            "m": 255,
+            "n": 255,
+            "p": 0,
+            "cfl_adap_dt": "T",
+            "cfl_target": 0.5,
+            "n_start": 0,
+            "t_stop": 50.0,
+            "t_save": 25.0,
+            # Simulation Algorithm Parameters
+            "num_patches": 1,
+            "model_eqns": 2,
+            "alt_soundspeed": "F",
+            "num_fluids": 2,
+            "mpp_lim": "F",
+            "mixture_err": "T",
+            "time_stepper": 3,
+            "weno_order": 5,
+            "weno_eps": 1e-16,
+            "mapped_weno": "T",
+            "weno_Re_flux": "T",
+            "mp_weno": "T",
+            "weno_avg": "T",
+            "riemann_solver": 2,
+            "wave_speeds": 1,
+            "avg_state": 2,
+            "bc_x%beg": -16,
+            "bc_x%end": -16,
+            "bc_y%beg": -16,
+            "bc_y%end": -16,
+            "bc_y%ve1": lid_velocity,
+            "viscous": "T",
+            # Formatted Database Files Structure Parameters
+            "format": 1,
+            "precision": 2,
+            "prim_vars_wrt": "T",
+            "omega_wrt(3)": "T",
+            "fd_order": 4,
+            "parallel_io": "T",
+            # Patch 1: Base
+            "patch_icpp(1)%geometry": 3,
+            "patch_icpp(1)%x_centroid": 0.5,
+            "patch_icpp(1)%y_centroid": 0.5,
+            "patch_icpp(1)%length_x": 1.0,
+            "patch_icpp(1)%length_y": 1.0,
+            "patch_icpp(1)%vel(1)": 0,
+            "patch_icpp(1)%vel(2)": 0.0,
+            "patch_icpp(1)%pres": 1e3,
+            "patch_icpp(1)%alpha_rho(1)": 0.5,
+            "patch_icpp(1)%alpha(1)": 0.5,
+            "patch_icpp(1)%alpha_rho(2)": 0.5,
+            "patch_icpp(1)%alpha(2)": 0.5,
+            # Fluids Physical Parameters
+            # Fluid 1:
+            "fluid_pp(1)%gamma": 1.0 / (1.4 - 1.0),
+            "fluid_pp(1)%pi_inf": 0.0,
+            "fluid_pp(1)%Re(1)": 1.0 / K,
+            "fluid_pp(1)%non_newtonian": "T",
+            "fluid_pp(1)%tau0": tau0,
+            "fluid_pp(1)%K": K,
+            "fluid_pp(1)%nn": nn,
+            "fluid_pp(1)%mu_max": mu_max,
+            "fluid_pp(1)%mu_min": mu_min,
+            "fluid_pp(1)%mu_bulk": mu_bulk,
+            "fluid_pp(1)%hb_m": hb_m,
+            # Fluid 2:
+            "fluid_pp(2)%gamma": 1.0 / (1.4 - 1.0),
+            "fluid_pp(2)%pi_inf": 0.0,
+            "fluid_pp(2)%Re(1)": 1.0 / K,
+            "fluid_pp(2)%non_newtonian": "T",
+            "fluid_pp(2)%tau0": tau0,
+            "fluid_pp(2)%K": K,
+            "fluid_pp(2)%nn": nn,
+            "fluid_pp(2)%mu_max": mu_max,
+            "fluid_pp(2)%mu_min": mu_min,
+            "fluid_pp(2)%mu_bulk": mu_bulk,
+            "fluid_pp(2)%hb_m": hb_m,
+        }
+    )
+)
diff --git a/examples/2D_poiseuille_nn/case.py b/examples/2D_poiseuille_nn/case.py
new file mode 100644
index 0000000000..b99500a6ad
--- /dev/null
+++ b/examples/2D_poiseuille_nn/case.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+"""
+2D Poiseuille Flow with Herschel-Bulkley Non-Newtonian Fluid
+
+Pressure-driven channel flow between two no-slip walls, driven by a constant
+body force in the streamwise (x) direction. Periodic BCs in x, no-slip in y.
+
+HB model: mu = (tau0/gdot)*(1 - exp(-m*gdot)) + K * gdot^(n-1)
+  - tau0: yield stress
+  - K:    consistency index
+  - n:    flow behavior index (< 1 shear-thinning, > 1 shear-thickening)
+  - m:    Papanastasiou regularization parameter
+
+For Newtonian Poiseuille validation, set tau0=0, nn=1, K=mu.
+The analytical solution is: u(y) = (G/(2*mu)) * y * (H - y)
+where G = rho * g_x is the effective pressure gradient.
+"""
+import json
+import math
+
+# === Channel geometry (square domain) ===
+L = 1.0         # Channel length (streamwise, x)
+H = 1.0         # Channel height (wall-normal, y)
+
+# === Grid resolution ===
+Nx = 24         # Cells in x (streamwise, minimal — periodic)
+Ny = 81         # Cells in y (wall-normal)
+
+# === Fluid properties ===
+rho = 1.0       # Density
+p0 = 1e5        # Reference pressure (high for low Mach)
+gamma = 1.4     # Ratio of specific heats
+
+# Sound speed and CFL
+c = math.sqrt(gamma * p0 / rho)
+dx = L / (Nx + 1)
+cfl = 0.3
+dt = cfl * dx / c
+
+# === Body force (pressure gradient substitute) ===
+# G = rho * g_x acts as dp/dx driving force
+g_x = 0.5
+
+# === HB non-Newtonian model parameters ===
+tau0 = 0.0          # Yield stress (set 0 for power-law)
+K = 0.1            # Consistency index
+nn = 2.0          # Flow behavior index (< 1 = shear-thinning)
+hb_m = 1000.0       # Papanastasiou regularization parameter
+mu_min = 1e-4       # Minimum viscosity bound
+mu_max = 10.0       # Maximum viscosity bound
+mu_bulk = 0.0       # Bulk viscosity
+
+# Reference Re based on consistency index (used as baseline)
+Re_ref = 1.0 / K    # = 100
+
+# === Time control ===
+t_end = 10.0        # End time (allow flow to reach steady state)
+t_save = 5.0        # Save interval
+
+eps = 1e-6
+
+# Configuring case dictionary
+print(
+    json.dumps(
+        {
+            # Logistics
+            "run_time_info": "T",
+            # Computational Domain Parameters
+            "x_domain%beg": 0.0,
+            "x_domain%end": L,
+            "y_domain%beg": 0.0,
+            "y_domain%end": H,
+            "m": Nx,
+            "n": Ny,
+            "p": 0,
+            "cfl_adap_dt": "T",
+            "cfl_target": cfl,
+            "n_start": 0,
+            "t_stop": t_end,
+            "t_save": t_save,
+            # Simulation Algorithm Parameters
+            "num_patches": 1,
+            "model_eqns": 2,
+            "alt_soundspeed": "F",
+            "num_fluids": 2,
+            "mpp_lim": "F",
+            "mixture_err": "T",
+            "time_stepper": 3,
+            "weno_order": 5,
+            "weno_eps": 1e-16,
+            "mapped_weno": "T",
+            "weno_Re_flux": "T",
+            "mp_weno": "T",
+            "weno_avg": "T",
+            "riemann_solver": 2,
+            "wave_speeds": 1,
+            "avg_state": 2,
+            # Boundary Conditions
+            # Periodic in x (streamwise), no-slip walls in y
+            "bc_x%beg": -1,
+            "bc_x%end": -1,
+            "bc_y%beg": -16,
+            "bc_y%end": -16,
+            # Viscous
+            "viscous": "T",
+            # Body Force (drives the flow like a pressure gradient)
+            "bf_x": "T",
+            "g_x": g_x,
+            "k_x": 0.0,
+            "w_x": 0.0,
+            "p_x": 0.0,
+            # Formatted Database Files Structure Parameters
+            "format": 1,
+            "precision": 2,
+            "prim_vars_wrt": "T",
+            "omega_wrt(3)": "T",
+            "fd_order": 4,
+            "parallel_io": "T",
+            # Patch 1: Entire channel domain (initially at rest)
+            "patch_icpp(1)%geometry": 3,
+            "patch_icpp(1)%x_centroid": L / 2.0,
+            "patch_icpp(1)%y_centroid": H / 2.0,
+            "patch_icpp(1)%length_x": L,
+            "patch_icpp(1)%length_y": H,
+            "patch_icpp(1)%vel(1)": 0.0,
+            "patch_icpp(1)%vel(2)": 0.0,
+            "patch_icpp(1)%pres": p0,
+            "patch_icpp(1)%alpha_rho(1)": rho * 0.5,
+            "patch_icpp(1)%alpha(1)": 0.5,
+            "patch_icpp(1)%alpha_rho(2)": rho * 0.5,
+            "patch_icpp(1)%alpha(2)": 0.5,
+            # Fluid 1: HB non-Newtonian fluid
+            "fluid_pp(1)%gamma": 1.0 / (gamma - 1.0),
+            "fluid_pp(1)%pi_inf": 0.0,
+            "fluid_pp(1)%Re(1)": Re_ref,
+            "fluid_pp(1)%non_newtonian": "T",
+            "fluid_pp(1)%tau0": tau0,
+            "fluid_pp(1)%K": K,
+            "fluid_pp(1)%nn": nn,
+            "fluid_pp(1)%hb_m": hb_m,
+            "fluid_pp(1)%mu_min": mu_min,
+            "fluid_pp(1)%mu_max": mu_max,
+            "fluid_pp(1)%mu_bulk": mu_bulk,
+            # Fluid 2: same properties (single-phase effectively)
+            "fluid_pp(2)%gamma": 1.0 / (gamma - 1.0),
+            "fluid_pp(2)%pi_inf": 0.0,
+            "fluid_pp(2)%Re(1)": Re_ref,
+            "fluid_pp(2)%non_newtonian": "T",
+            "fluid_pp(2)%tau0": tau0,
+            "fluid_pp(2)%K": K,
+            "fluid_pp(2)%nn": nn,
+            "fluid_pp(2)%hb_m": hb_m,
+            "fluid_pp(2)%mu_min": mu_min,
+            "fluid_pp(2)%mu_max": mu_max,
+            "fluid_pp(2)%mu_bulk": mu_bulk,
+        }
+    )
+)
diff --git a/examples/2D_poiseuille_nn/velocity_profile.png b/examples/2D_poiseuille_nn/velocity_profile.png
new file mode 100644
index 0000000000..4632d4a9eb
Binary files /dev/null and b/examples/2D_poiseuille_nn/velocity_profile.png differ
diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp
index 8559be36d3..da6145abbb 100644
--- a/src/common/m_derived_types.fpp
+++ b/src/common/m_derived_types.fpp
@@ -367,6 +367,14 @@ module m_derived_types
         real(wp) :: qv      !< reference energy per unit mass for SGEOS, q (see Le Metayer (2004))
         real(wp) :: qvp     !< reference entropy per unit mass for SGEOS, q' (see Le Metayer (2004))
         real(wp) :: G
+        logical :: non_newtonian  !< Non-Newtonian fluid flag
+        real(wp) :: tau0          !< Yield stress (Herschel-Bulkley model)
+        real(wp) :: K             !< Consistency index (Herschel-Bulkley model)
+        real(wp) :: nn            !< Flow behavior index (Herschel-Bulkley model)
+        real(wp) :: mu_max        !< Maximum viscosity limit (shear)
+        real(wp) :: mu_min        !< Minimum viscosity limit (shear)
+        real(wp) :: mu_bulk       !< Bulk viscosity for non-Newtonian fluids
+        real(wp) :: hb_m          !< Papanastasiou regularization parameter
     end type physical_parameters
 
     !> Derived type annexing the physical parameters required for sub-grid bubble models
diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp
index 219667a158..f842cc5bd7 100644
--- a/src/post_process/m_global_parameters.fpp
+++ b/src/post_process/m_global_parameters.fpp
@@ -434,6 +434,14 @@ contains
             fluid_pp(i)%qv = 0._wp
             fluid_pp(i)%qvp = 0._wp
             fluid_pp(i)%G = dflt_real
+            fluid_pp(i)%non_newtonian = .false.
+            fluid_pp(i)%tau0 = 0._wp
+            fluid_pp(i)%K = 0._wp
+            fluid_pp(i)%nn = 1._wp
+            fluid_pp(i)%mu_max = dflt_real
+            fluid_pp(i)%mu_min = 0._wp
+            fluid_pp(i)%mu_bulk = 0._wp
+            fluid_pp(i)%hb_m = 1000._wp
         end do
 
         ! Subgrid bubble parameters
diff --git a/src/post_process/m_mpi_proxy.fpp b/src/post_process/m_mpi_proxy.fpp
index 29e65942c6..a019d33f86 100644
--- a/src/post_process/m_mpi_proxy.fpp
+++ b/src/post_process/m_mpi_proxy.fpp
@@ -130,6 +130,10 @@ contains
             call MPI_BCAST(fluid_pp(i)%qv, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
             call MPI_BCAST(fluid_pp(i)%qvp, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
             call MPI_BCAST(fluid_pp(i)%G, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
+            call MPI_BCAST(fluid_pp(i)%non_newtonian, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
+            #:for VAR in ['tau0', 'K', 'nn', 'mu_max', 'mu_min', 'mu_bulk', 'hb_m']
+                call MPI_BCAST(fluid_pp(i)%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
+            #:endfor
         end do
 
         ! Subgrid bubble parameters
diff --git a/src/pre_process/m_global_parameters.fpp b/src/pre_process/m_global_parameters.fpp
index 11fa95e124..1df860d2b4 100644
--- a/src/pre_process/m_global_parameters.fpp
+++ b/src/pre_process/m_global_parameters.fpp
@@ -592,6 +592,14 @@ contains
             fluid_pp(i)%qv = 0._wp
             fluid_pp(i)%qvp = 0._wp
             fluid_pp(i)%G = 0._wp
+            fluid_pp(i)%non_newtonian = .false.
+            fluid_pp(i)%tau0 = 0._wp
+            fluid_pp(i)%K = 0._wp
+            fluid_pp(i)%nn = 1._wp
+            fluid_pp(i)%mu_max = dflt_real
+            fluid_pp(i)%mu_min = 0._wp
+            fluid_pp(i)%mu_bulk = 0._wp
+            fluid_pp(i)%hb_m = 1000._wp
         end do
 
         Bx0 = dflt_real
diff --git a/src/pre_process/m_mpi_proxy.fpp b/src/pre_process/m_mpi_proxy.fpp
index cbfac0571b..6225ac0222 100644
--- a/src/pre_process/m_mpi_proxy.fpp
+++ b/src/pre_process/m_mpi_proxy.fpp
@@ -143,6 +143,11 @@ contains
                 call MPI_BCAST(fluid_pp(i)%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
             #:endfor
 
+            call MPI_BCAST(fluid_pp(i)%non_newtonian, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
+            #:for VAR in ['tau0', 'K', 'nn', 'mu_max', 'mu_min', 'mu_bulk', 'hb_m']
+                call MPI_BCAST(fluid_pp(i)%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
+            #:endfor
+
             call MPI_BCAST(simplex_params%perturb_dens(i), 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
             call MPI_BCAST(simplex_params%perturb_dens_freq(i), 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
             call MPI_BCAST(simplex_params%perturb_dens_scale(i), 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
diff --git a/src/simulation/m_checker.fpp b/src/simulation/m_checker.fpp
index ed694c449d..3c44b7d7a1 100644
--- a/src/simulation/m_checker.fpp
+++ b/src/simulation/m_checker.fpp
@@ -39,6 +39,7 @@ contains
         end if
 
         call s_check_inputs_time_stepping
+        call s_check_inputs_non_newtonian
 
     end subroutine s_check_inputs
 
@@ -91,4 +92,30 @@ contains
 #endif
     end subroutine s_check_inputs_nvidia_uvm
 
+    !> Checks constraints on non-Newtonian fluid parameters
+    impure subroutine s_check_inputs_non_newtonian
+        integer :: i
+
+        do i = 1, num_fluids
+            if (fluid_pp(i)%non_newtonian) then
+                @:PROHIBIT(.not. viscous, &
+                    "Non-Newtonian fluid requires viscosity to be enabled")
+                @:PROHIBIT(fluid_pp(i)%K <= 0._wp, &
+                    "Non-Newtonian fluid consistency index K must be > 0")
+                @:PROHIBIT(fluid_pp(i)%nn <= 0._wp, &
+                    "Non-Newtonian fluid flow behavior index nn must be > 0")
+                @:PROHIBIT(fluid_pp(i)%tau0 < 0._wp, &
+                    "Non-Newtonian fluid yield stress tau0 must be >= 0")
+                @:PROHIBIT(fluid_pp(i)%mu_min < 0._wp, &
+                    "Non-Newtonian fluid mu_min must be >= 0")
+                @:PROHIBIT(fluid_pp(i)%mu_max < dflt_real .and. &
+                    fluid_pp(i)%mu_max <= fluid_pp(i)%mu_min, &
+                    "Non-Newtonian fluid mu_max must be > mu_min when set")
+                @:PROHIBIT(fluid_pp(i)%hb_m <= 0._wp, &
+                    "Non-Newtonian Papanastasiou parameter hb_m must be > 0")
+            end if
+        end do
+
+    end subroutine s_check_inputs_non_newtonian
+
 end module m_checker
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 068fcb38b1..83498bf851 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -1,1974 +1,1984 @@
-!>
-!! @file
-!! @brief Contains module m_data_output
-
-#:include 'macros.fpp'
-#:include 'case.fpp'
-
-!> @brief Writes solution data, run-time stability diagnostics (ICFL, VCFL, CCFL, Rc), and probe/center-of-mass files
-module m_data_output
-
-    use m_derived_types        !< Definitions of the derived types
-
-    use m_global_parameters    !< Definitions of the global parameters
-
-    use m_mpi_proxy            !< Message passing interface (MPI) module proxy
-
-    use m_variables_conversion !< State variables type conversion procedures
-
-    use m_compile_specific
-
-    use m_helper
-
-    use m_helper_basic         !< Functions to compare floating point numbers
-
-    use m_sim_helpers
-
-    use m_delay_file_access
-
-    use m_ibm
-
-    use m_boundary_common
-
-    implicit none
-
-    private; 
-    public :: s_initialize_data_output_module, &
-              s_open_run_time_information_file, &
-              s_open_com_files, &
-              s_open_probe_files, &
-              s_write_run_time_information, &
-              s_write_data_files, &
-              s_write_serial_data_files, &
-              s_write_parallel_data_files, &
-              s_write_com_files, &
-              s_write_probe_files, &
-              s_close_run_time_information_file, &
-              s_close_com_files, &
-              s_close_probe_files, &
-              s_finalize_data_output_module, &
-              s_write_ib_data_file
-
-    real(wp), allocatable, dimension(:, :, :) :: icfl_sf  !< ICFL stability criterion
-    real(wp), allocatable, dimension(:, :, :) :: vcfl_sf  !< VCFL stability criterion
-    real(wp), allocatable, dimension(:, :, :) :: ccfl_sf  !< CCFL stability criterion
-    real(wp), allocatable, dimension(:, :, :) :: Rc_sf  !< Rc stability criterion
-    real(wp), public, allocatable, dimension(:, :) :: c_mass
-    $:GPU_DECLARE(create='[icfl_sf,vcfl_sf,ccfl_sf,Rc_sf,c_mass]')
-
-    real(wp) :: icfl_max_loc, icfl_max_glb !< ICFL stability extrema on local and global grids
-    real(wp) :: vcfl_max_loc, vcfl_max_glb !< VCFL stability extrema on local and global grids
-    real(wp) :: ccfl_max_loc, ccfl_max_glb !< CCFL stability extrema on local and global grids
-    real(wp) :: Rc_min_loc, Rc_min_glb !< Rc   stability extrema on local and global grids
-    $:GPU_DECLARE(create='[icfl_max_loc,icfl_max_glb,vcfl_max_loc,vcfl_max_glb]')
-    $:GPU_DECLARE(create='[ccfl_max_loc,ccfl_max_glb,Rc_min_loc,Rc_min_glb]')
-
-    !> @name ICFL, VCFL, CCFL and Rc stability criteria extrema over all the time-steps
-    !> @{
-    real(wp) :: icfl_max !< ICFL criterion maximum
-    real(wp) :: vcfl_max !< VCFL criterion maximum
-    real(wp) :: ccfl_max !< CCFL criterion maximum
-    real(wp) :: Rc_min !< Rc criterion maximum
-    !> @}
-
-    type(scalar_field), allocatable, dimension(:) :: q_cons_temp_ds
-
-contains
-
-    !> Write data files. Dispatch subroutine that replaces procedure pointer.
-        !! @param q_cons_vf Conservative variables
-        !! @param q_T_sf Temperature scalar field
-        !! @param q_prim_vf Primitive variables
-        !! @param t_step Current time step
-        !! @param bc_type Boundary condition type
-        !! @param beta Eulerian void fraction from lagrangian bubbles
-    impure subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta)
-
-        type(scalar_field), &
-            dimension(sys_size), &
-            intent(inout) :: q_cons_vf
-
-        type(scalar_field), &
-            intent(inout) :: q_T_sf
-
-        type(scalar_field), &
-            dimension(sys_size), &
-            intent(inout) :: q_prim_vf
-
-        integer, intent(in) :: t_step
-
-        type(scalar_field), &
-            intent(inout), optional :: beta
-
-        type(integer_field), &
-            dimension(1:num_dims, -1:1), &
-            intent(in) :: bc_type
-
-        if (.not. parallel_io) then
-            call s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta)
-        else
-            call s_write_parallel_data_files(q_cons_vf, t_step, bc_type, beta)
-        end if
-
-    end subroutine s_write_data_files
-
-    !>  The purpose of this subroutine is to open a new or pre-
-        !!          existing run-time information file and append to it the
-        !!      basic header information relevant to current simulation.
-        !!      In general, this requires generating a table header for
-        !!      those stability criteria which will be written at every
-        !!      time-step.
-    impure subroutine s_open_run_time_information_file
-
-        character(LEN=name_len), parameter :: file_name = 'run_time.inf' !<
-            !! Name of the run-time information file
-
-        character(LEN=path_len + name_len) :: file_path !<
-            !! Relative path to a file in the case directory
-
-        character(LEN=8) :: file_date !<
-            !! Creation date of the run-time information file
-
-        ! Opening the run-time information file
-        file_path = trim(case_dir)//'/'//trim(file_name)
-
-        open (3, FILE=trim(file_path), &
-              FORM='formatted', &
-              STATUS='replace')
-
-        write (3, '(A)') 'Description: Stability information at '// &
-            'each time-step of the simulation. This'
-        write (3, '(13X,A)') 'data is composed of the inviscid '// &
-            'Courant–Friedrichs–Lewy (ICFL)'
-        write (3, '(13X,A)') 'number, the viscous CFL (VCFL) number, '// &
-            'the capillary CFL (CCFL)'
-        write (3, '(13X,A)') 'number and the cell Reynolds (Rc) '// &
-            'number. Please note that only'
-        write (3, '(13X,A)') 'those stability conditions pertinent '// &
-            'to the physics included in'
-        write (3, '(13X,A)') 'the current computation are displayed.'
-
-        call date_and_time(DATE=file_date)
-
-        write (3, '(A)') 'Date: '//file_date(5:6)//'/'// &
-            file_date(7:8)//'/'// &
-            file_date(3:4)
-
-        write (3, '(A)') ''; write (3, '(A)') ''
-
-        ! Generating table header for the stability criteria to be outputted
-        write (3, '(13X,A9,13X,A10,13X,A10,13X,A10)', advance="no") &
-            trim('Time-step'), trim('dt'), trim('Time'), trim('ICFL Max')
-
-        if (viscous) then
-            write (3, '(13X,A10,13X,A16)', advance="no") &
-                trim('VCFL Max'), trim('Rc Min')
-        end if
-
-        write (3, *) ! new line
-
-    end subroutine s_open_run_time_information_file
-
-    !>  This opens a formatted data file where the root processor
-        !!      can write out the CoM information
-    impure subroutine s_open_com_files()
-
-        character(len=path_len + 3*name_len) :: file_path !<
-            !! Relative path to the CoM file in the case directory
-        integer :: i !< Generic loop iterator
-
-        do i = 1, num_fluids
-            ! Generating the relative path to the CoM data file
-            write (file_path, '(A,I0,A)') '/fluid', i, '_com.dat'
-            file_path = trim(case_dir)//trim(file_path)
-            ! Creating the formatted data file and setting up its
-            ! structure
-            open (i + 120, file=trim(file_path), &
-                  form='formatted', &
-                  position='append', &
-                  status='unknown')
-            if (n == 0) then
-                write (i + 120, '(A)') '    Non-Dimensional Time '// &
-                    '    Total Mass '// &
-                    '    x-loc '// &
-                    '    Total Volume    '
-            elseif (p == 0) then
-                write (i + 120, '(A)') '    Non-Dimensional Time '// &
-                    '    Total Mass '// &
-                    '    x-loc '// &
-                    '    y-loc '// &
-                    '    Total Volume    '
-            else
-                write (i + 120, '(A)') '    Non-Dimensional Time '// &
-                    '    Total Mass '// &
-                    '    x-loc '// &
-                    '    y-loc '// &
-                    '    z-loc '// &
-                    '    Total Volume    '
-            end if
-        end do
-    end subroutine s_open_com_files
-
-    !>  This opens a formatted data file where the root processor
-        !!      can write out flow probe information
-    impure subroutine s_open_probe_files
-
-        character(LEN=path_len + 3*name_len) :: file_path !<
-            !! Relative path to the probe data file in the case directory
-
-        integer :: i !< Generic loop iterator
-        logical :: file_exist
-
-        do i = 1, num_probes
-            ! Generating the relative path to the data file
-            write (file_path, '(A,I0,A)') '/D/probe', i, '_prim.dat'
-            file_path = trim(case_dir)//trim(file_path)
-
-            ! Creating the formatted data file and setting up its
-            ! structure
-            inquire (file=trim(file_path), exist=file_exist)
-
-            if (file_exist) then
-                open (i + 30, FILE=trim(file_path), &
-                      FORM='formatted', &
-                      STATUS='old', &
-                      POSITION='append')
-            else
-                open (i + 30, FILE=trim(file_path), &
-                      FORM='formatted', &
-                      STATUS='unknown')
-            end if
-        end do
-
-        if (integral_wrt) then
-            do i = 1, num_integrals
-                write (file_path, '(A,I0,A)') '/D/integral', i, '_prim.dat'
-                file_path = trim(case_dir)//trim(file_path)
-
-                open (i + 70, FILE=trim(file_path), &
-                      FORM='formatted', &
-                      POSITION='append', &
-                      STATUS='unknown')
-            end do
-        end if
-
-    end subroutine s_open_probe_files
-
-    !>  The goal of the procedure is to output to the run-time
-        !!      information file the stability criteria extrema in the
-        !!      entire computational domain and at the given time-step.
-        !!      Moreover, the subroutine is also in charge of tracking
-        !!      these stability criteria extrema over all time-steps.
-        !!  @param q_prim_vf Cell-average primitive variables
-        !!  @param t_step Current time step
-    impure subroutine s_write_run_time_information(q_prim_vf, t_step)
-
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
-        integer, intent(in) :: t_step
-
-        real(wp) :: rho        !< Cell-avg. density
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3) :: alpha      !< Cell-avg. volume fraction
-            real(wp), dimension(3) :: vel        !< Cell-avg. velocity
-        #:else
-            real(wp), dimension(num_fluids) :: alpha      !< Cell-avg. volume fraction
-            real(wp), dimension(num_vels) :: vel        !< Cell-avg. velocity
-        #:endif
-        real(wp) :: vel_sum    !< Cell-avg. velocity sum
-        real(wp) :: pres       !< Cell-avg. pressure
-        real(wp) :: gamma      !< Cell-avg. sp. heat ratio
-        real(wp) :: pi_inf     !< Cell-avg. liquid stiffness function
-        real(wp) :: qv         !< Cell-avg. internal energy reference value
-        real(wp) :: c          !< Cell-avg. sound speed
-        real(wp) :: H          !< Cell-avg. enthalpy
-        real(wp), dimension(2) :: Re         !< Cell-avg. Reynolds numbers
-        integer :: j, k, l
-
-        ! Computing Stability Criteria at Current Time-step
-        $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,vel, alpha, Re, rho, vel_sum, pres, gamma, pi_inf, c, H, qv]')
-        do l = 0, p
-            do k = 0, n
-                do j = 0, m
-                    call s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, qv, j, k, l)
-
-                    call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, alpha, vel_sum, 0._wp, c, qv)
-
-                    if (viscous) then
-                        call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl_sf, vcfl_sf, Rc_sf)
-                    else
-                        call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl_sf)
-                    end if
-
-                end do
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        ! end: Computing Stability Criteria at Current Time-step
-
-        ! Determining local stability criteria extrema at current time-step
-
-#ifdef _CRAYFTN
-        $:GPU_UPDATE(host='[icfl_sf]')
-
-        if (viscous) then
-            $:GPU_UPDATE(host='[vcfl_sf,Rc_sf]')
-        end if
-
-        icfl_max_loc = maxval(icfl_sf)
-
-        if (viscous) then
-            vcfl_max_loc = maxval(vcfl_sf)
-            Rc_min_loc = minval(Rc_sf)
-        end if
-#else
-        #:call GPU_PARALLEL(copyout='[icfl_max_loc]', copyin='[icfl_sf]')
-            icfl_max_loc = maxval(icfl_sf)
-        #:endcall GPU_PARALLEL
-        if (viscous .or. dummy) then
-            #:call GPU_PARALLEL(copyout='[vcfl_max_loc, Rc_min_loc]', copyin='[vcfl_sf,Rc_sf]')
-                vcfl_max_loc = maxval(vcfl_sf)
-                Rc_min_loc = minval(Rc_sf)
-            #:endcall GPU_PARALLEL
-        end if
-#endif
-
-        ! Determining global stability criteria extrema at current time-step
-        if (num_procs > 1) then
-            call s_mpi_reduce_stability_criteria_extrema(icfl_max_loc, &
-                                                         vcfl_max_loc, &
-                                                         Rc_min_loc, &
-                                                         icfl_max_glb, &
-                                                         vcfl_max_glb, &
-                                                         Rc_min_glb)
-        else
-            icfl_max_glb = icfl_max_loc
-            if (viscous) vcfl_max_glb = vcfl_max_loc
-            if (viscous) Rc_min_glb = Rc_min_loc
-        end if
-
-        ! Determining the stability criteria extrema over all the time-steps
-        if (icfl_max_glb > icfl_max) icfl_max = icfl_max_glb
-
-        if (viscous) then
-            if (vcfl_max_glb > vcfl_max) vcfl_max = vcfl_max_glb
-            if (Rc_min_glb < Rc_min) Rc_min = Rc_min_glb
-        end if
-
-        ! Outputting global stability criteria extrema at current time-step
-        if (proc_rank == 0) then
-            write (3, '(13X,I9,13X,F10.6,13X,F10.6,13X,F10.6)', advance="no") &
-                t_step, dt, mytime, icfl_max_glb
-
-            if (viscous) then
-                write (3, '(13X,F10.6,13X,ES16.6)', advance="no") &
-                    vcfl_max_glb, &
-                    Rc_min_glb
-            end if
-
-            write (3, *) ! new line
-
-            if (.not. f_approx_equal(icfl_max_glb, icfl_max_glb)) then
-                call s_mpi_abort('ICFL is NaN. Exiting.')
-            elseif (icfl_max_glb > 1._wp) then
-                print *, 'icfl', icfl_max_glb
-                call s_mpi_abort('ICFL is greater than 1.0. Exiting.')
-            end if
-
-            if (viscous) then
-                if (.not. f_approx_equal(vcfl_max_glb, vcfl_max_glb)) then
-                    call s_mpi_abort('VCFL is NaN. Exiting.')
-                elseif (vcfl_max_glb > 1._wp) then
-                    print *, 'vcfl', vcfl_max_glb
-                    call s_mpi_abort('VCFL is greater than 1.0. Exiting.')
-                end if
-            end if
-        end if
-
-        call s_mpi_barrier()
-
-    end subroutine s_write_run_time_information
-
-    !>  The goal of this subroutine is to output the grid and
-        !!      conservative variables data files for given time-step.
-        !!  @param q_cons_vf Cell-average conservative variables
-        !!  @param q_T_sf Temperature scalar field
-        !!  @param q_prim_vf Cell-average primitive variables
-        !!  @param t_step Current time-step
-        !!  @param bc_type Boundary condition type
-        !!  @param beta Eulerian void fraction from lagrangian bubbles
-    impure subroutine s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta)
-
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        type(scalar_field), intent(inout) :: q_T_sf
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-        integer, intent(in) :: t_step
-        type(scalar_field), intent(inout), optional :: beta
-        type(integer_field), dimension(1:num_dims, -1:1), intent(in) :: bc_type
-
-        character(LEN=path_len + 2*name_len) :: t_step_dir !<
-            !! Relative path to the current time-step directory
-
-        character(LEN=path_len + 3*name_len) :: file_path !<
-            !! Relative path to the grid and conservative variables data files
-
-        logical :: file_exist !<
-            !! Logical used to check existence of current time-step directory
-
-        character(LEN=15) :: FMT
-
-        integer :: i, j, k, l, r
-
-        real(wp) :: gamma, lit_gamma, pi_inf, qv !< Temporary EOS params
-
-        ! Creating or overwriting the time-step root directory
-        write (t_step_dir, '(A,I0,A,I0)') trim(case_dir)//'/p_all'
-
-        ! Creating or overwriting the current time-step directory
-        write (t_step_dir, '(a,i0,a,i0)') trim(case_dir)//'/p_all/p', &
-            proc_rank, '/', t_step
-
-        file_path = trim(t_step_dir)//'/.'
-        call my_inquire(file_path, file_exist)
-        if (file_exist) call s_delete_directory(trim(t_step_dir))
-        call s_create_directory(trim(t_step_dir))
-
-        ! Writing the grid data file in the x-direction
-        file_path = trim(t_step_dir)//'/x_cb.dat'
-
-        open (2, FILE=trim(file_path), &
-              FORM='unformatted', &
-              STATUS='new')
-        write (2) x_cb(-1:m); close (2)
-
-        ! Writing the grid data files in the y- and z-directions
-        if (n > 0) then
-
-            file_path = trim(t_step_dir)//'/y_cb.dat'
-
-            open (2, FILE=trim(file_path), &
-                  FORM='unformatted', &
-                  STATUS='new')
-            write (2) y_cb(-1:n); close (2)
-
-            if (p > 0) then
-
-                file_path = trim(t_step_dir)//'/z_cb.dat'
-
-                open (2, FILE=trim(file_path), &
-                      FORM='unformatted', &
-                      STATUS='new')
-                write (2) z_cb(-1:p); close (2)
-
-            end if
-
-        end if
-
-        ! Writing the conservative variables data files
-        do i = 1, sys_size
-            write (file_path, '(A,I0,A)') trim(t_step_dir)//'/q_cons_vf', &
-                i, '.dat'
-
-            open (2, FILE=trim(file_path), &
-                  FORM='unformatted', &
-                  STATUS='new')
-
-            write (2) q_cons_vf(i)%sf(0:m, 0:n, 0:p); close (2)
-        end do
-
-        ! Lagrangian beta (void fraction) written as q_cons_vf(sys_size+1) to
-        ! match the parallel I/O path and allow post_process to read it.
-        if (bubbles_lagrange) then
-            write (file_path, '(A,I0,A)') trim(t_step_dir)//'/q_cons_vf', &
-                sys_size + 1, '.dat'
-
-            open (2, FILE=trim(file_path), &
-                  FORM='unformatted', &
-                  STATUS='new')
-
-            write (2) beta%sf(0:m, 0:n, 0:p); close (2)
-        end if
-
-        if (qbmm .and. .not. polytropic) then
-            do i = 1, nb
-                do r = 1, nnode
-                    write (file_path, '(A,I0,A)') trim(t_step_dir)//'/pb', &
-                        sys_size + (i - 1)*nnode + r, '.dat'
-
-                    open (2, FILE=trim(file_path), &
-                          FORM='unformatted', &
-                          STATUS='new')
-
-                    write (2) pb_ts(1)%sf(0:m, 0:n, 0:p, r, i); close (2)
-                end do
-            end do
-
-            do i = 1, nb
-                do r = 1, nnode
-                    write (file_path, '(A,I0,A)') trim(t_step_dir)//'/mv', &
-                        sys_size + (i - 1)*nnode + r, '.dat'
-
-                    open (2, FILE=trim(file_path), &
-                          FORM='unformatted', &
-                          STATUS='new')
-
-                    write (2) mv_ts(1)%sf(0:m, 0:n, 0:p, r, i); close (2)
-                end do
-            end do
-        end if
-
-        ! Writing the IB markers
-        if (ib) then
-            call s_write_serial_ib_data(t_step)
-            ! write (file_path, '(A,I0,A)') trim(t_step_dir)//'/ib.dat'
-
-            ! open (2, FILE=trim(file_path), &
-            !       FORM='unformatted', &
-            !       STATUS='new')
-
-            ! write (2) ib_markers%sf(0:m, 0:n, 0:p); close (2)
-        end if
-
-        gamma = gammas(1)
-        lit_gamma = gs_min(1)
-        pi_inf = pi_infs(1)
-        qv = qvs(1)
-
-        if (precision == 1) then
-            FMT = "(2F30.3)"
-        else
-            FMT = "(2F40.14)"
-        end if
-
-        ! writing an output directory
-        write (t_step_dir, '(A,I0,A,I0)') trim(case_dir)//'/D'
-        file_path = trim(t_step_dir)//'/.'
-
-        inquire (FILE=trim(file_path), EXIST=file_exist)
-
-        if (.not. file_exist) call s_create_directory(trim(t_step_dir))
-
-        if ((prim_vars_wrt .or. (n == 0 .and. p == 0)) .and. (.not. igr)) then
-            call s_convert_conservative_to_primitive_variables(q_cons_vf, q_T_sf, q_prim_vf, idwint)
-            do i = 1, sys_size
-                $:GPU_UPDATE(host='[q_prim_vf(i)%sf(:,:,:)]')
-            end do
-            ! q_prim_vf(bubxb) stores the value of nb needed in riemann solvers, so replace with true primitive value (=1._wp)
-            if (qbmm) then
-                q_prim_vf(bubxb)%sf = 1._wp
-            end if
-        end if
-
-        !1D
-        if (n == 0 .and. p == 0) then
-
-            if (model_eqns == 2 .and. (.not. igr)) then
-                do i = 1, sys_size
-                    write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/prim.', i, '.', proc_rank, '.', t_step, '.dat'
-
-                    open (2, FILE=trim(file_path))
-                    do j = 0, m
-                        ! todo: revisit change here
-                        if (((i >= adv_idx%beg) .and. (i <= adv_idx%end))) then
-                            write (2, FMT) x_cb(j), q_cons_vf(i)%sf(j, 0, 0)
-                        else
-                            write (2, FMT) x_cb(j), q_prim_vf(i)%sf(j, 0, 0)
-                        end if
-                    end do
-                    close (2)
-                end do
-            end if
-
-            do i = 1, sys_size
-                write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/cons.', i, '.', proc_rank, '.', t_step, '.dat'
-
-                open (2, FILE=trim(file_path))
-                do j = 0, m
-                    write (2, FMT) x_cb(j), q_cons_vf(i)%sf(j, 0, 0)
-                end do
-                close (2)
-            end do
-
-            if (qbmm .and. .not. polytropic) then
-                do i = 1, nb
-                    do r = 1, nnode
-                        write (file_path, '(A,I0,A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/pres.', i, '.', r, '.', proc_rank, '.', t_step, '.dat'
-
-                        open (2, FILE=trim(file_path))
-                        do j = 0, m
-                            write (2, FMT) x_cb(j), pb_ts(1)%sf(j, 0, 0, r, i)
-                        end do
-                        close (2)
-                    end do
-                end do
-                do i = 1, nb
-                    do r = 1, nnode
-                        write (file_path, '(A,I0,A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/mv.', i, '.', r, '.', proc_rank, '.', t_step, '.dat'
-
-                        open (2, FILE=trim(file_path))
-                        do j = 0, m
-                            write (2, FMT) x_cb(j), mv_ts(1)%sf(j, 0, 0, r, i)
-                        end do
-                        close (2)
-                    end do
-                end do
-            end if
-        end if
-
-        if (precision == 1) then
-            FMT = "(3F30.7)"
-        else
-            FMT = "(3F40.14)"
-        end if
-
-        ! 2D
-        if ((n > 0) .and. (p == 0)) then
-            do i = 1, sys_size
-                write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/cons.', i, '.', proc_rank, '.', t_step, '.dat'
-                open (2, FILE=trim(file_path))
-                do j = 0, m
-                    do k = 0, n
-                        write (2, FMT) x_cb(j), y_cb(k), q_cons_vf(i)%sf(j, k, 0)
-                    end do
-                    write (2, *)
-                end do
-                close (2)
-            end do
-
-            if (present(beta)) then
-                write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/beta.', i, '.', proc_rank, '.', t_step, '.dat'
-                open (2, FILE=trim(file_path))
-                do j = 0, m
-                    do k = 0, n
-                        write (2, FMT) x_cb(j), y_cb(k), beta%sf(j, k, 0)
-                    end do
-                    write (2, *)
-                end do
-                close (2)
-            end if
-
-            if (qbmm .and. .not. polytropic) then
-                do i = 1, nb
-                    do r = 1, nnode
-                        write (file_path, '(A,I0,A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/pres.', i, '.', r, '.', proc_rank, '.', t_step, '.dat'
-
-                        open (2, FILE=trim(file_path))
-                        do j = 0, m
-                            do k = 0, n
-                                write (2, FMT) x_cb(j), y_cb(k), pb_ts(1)%sf(j, k, 0, r, i)
-                            end do
-                        end do
-                        close (2)
-                    end do
-                end do
-                do i = 1, nb
-                    do r = 1, nnode
-                        write (file_path, '(A,I0,A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/mv.', i, '.', r, '.', proc_rank, '.', t_step, '.dat'
-
-                        open (2, FILE=trim(file_path))
-                        do j = 0, m
-                            do k = 0, n
-                                write (2, FMT) x_cb(j), y_cb(k), mv_ts(1)%sf(j, k, 0, r, i)
-                            end do
-                        end do
-                        close (2)
-                    end do
-                end do
-            end if
-
-            if (prim_vars_wrt .and. (.not. igr)) then
-                do i = 1, sys_size
-                    write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/prim.', i, '.', proc_rank, '.', t_step, '.dat'
-
-                    open (2, FILE=trim(file_path))
-
-                    do j = 0, m
-                        do k = 0, n
-                            if (((i >= cont_idx%beg) .and. (i <= cont_idx%end)) &
-                                .or. &
-                                ((i >= adv_idx%beg) .and. (i <= adv_idx%end)) &
-                                ) then
-                                write (2, FMT) x_cb(j), y_cb(k), q_cons_vf(i)%sf(j, k, 0)
-                            else
-                                write (2, FMT) x_cb(j), y_cb(k), q_prim_vf(i)%sf(j, k, 0)
-                            end if
-                        end do
-                        write (2, *)
-                    end do
-                    close (2)
-                end do
-            end if
-        end if
-
-        if (precision == 1) then
-            FMT = "(4F30.7)"
-        else
-            FMT = "(4F40.14)"
-        end if
-
-        ! 3D
-        if (p > 0) then
-            do i = 1, sys_size
-                write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/cons.', i, '.', proc_rank, '.', t_step, '.dat'
-                open (2, FILE=trim(file_path))
-                do j = 0, m
-                    do k = 0, n
-                        do l = 0, p
-                            write (2, FMT) x_cb(j), y_cb(k), z_cb(l), q_cons_vf(i)%sf(j, k, l)
-                        end do
-                        write (2, *)
-                    end do
-                    write (2, *)
-                end do
-                close (2)
-            end do
-
-            if (present(beta)) then
-                write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/beta.', i, '.', proc_rank, '.', t_step, '.dat'
-                open (2, FILE=trim(file_path))
-                do j = 0, m
-                    do k = 0, n
-                        do l = 0, p
-                            write (2, FMT) x_cb(j), y_cb(k), z_cb(l), beta%sf(j, k, l)
-                        end do
-                        write (2, *)
-                    end do
-                    write (2, *)
-                end do
-                close (2)
-            end if
-
-            if (qbmm .and. .not. polytropic) then
-                do i = 1, nb
-                    do r = 1, nnode
-                        write (file_path, '(A,I0,A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/pres.', i, '.', r, '.', proc_rank, '.', t_step, '.dat'
-
-                        open (2, FILE=trim(file_path))
-                        do j = 0, m
-                            do k = 0, n
-                                do l = 0, p
-                                    write (2, FMT) x_cb(j), y_cb(k), z_cb(l), pb_ts(1)%sf(j, k, l, r, i)
-                                end do
-                            end do
-                        end do
-                        close (2)
-                    end do
-                end do
-                do i = 1, nb
-                    do r = 1, nnode
-                        write (file_path, '(A,I0,A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/mv.', i, '.', r, '.', proc_rank, '.', t_step, '.dat'
-
-                        open (2, FILE=trim(file_path))
-                        do j = 0, m
-                            do k = 0, n
-                                do l = 0, p
-                                    write (2, FMT) x_cb(j), y_cb(k), z_cb(l), mv_ts(1)%sf(j, k, l, r, i)
-                                end do
-                            end do
-                        end do
-                        close (2)
-                    end do
-                end do
-            end if
-
-            if (prim_vars_wrt .and. (.not. igr)) then
-                do i = 1, sys_size
-                    write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/prim.', i, '.', proc_rank, '.', t_step, '.dat'
-
-                    open (2, FILE=trim(file_path))
-
-                    do j = 0, m
-                        do k = 0, n
-                            do l = 0, p
-                                if (((i >= cont_idx%beg) .and. (i <= cont_idx%end)) &
-                                    .or. &
-                                    ((i >= adv_idx%beg) .and. (i <= adv_idx%end)) &
-                                    .or. &
-                                    ((i >= chemxb) .and. (i <= chemxe)) &
-                                    ) then
-                                    write (2, FMT) x_cb(j), y_cb(k), z_cb(l), q_cons_vf(i)%sf(j, k, l)
-                                else
-                                    write (2, FMT) x_cb(j), y_cb(k), z_cb(l), q_prim_vf(i)%sf(j, k, l)
-                                end if
-                            end do
-                            write (2, *)
-                        end do
-                        write (2, *)
-                    end do
-                    close (2)
-                end do
-            end if
-        end if
-
-    end subroutine s_write_serial_data_files
-
-    !>  The goal of this subroutine is to output the grid and
-        !!      conservative variables data files for given time-step.
-        !!  @param q_cons_vf Cell-average conservative variables
-        !!  @param t_step Current time-step
-        !!  @param bc_type Boundary condition type
-        !!  @param beta Eulerian void fraction from lagrangian bubbles
-    impure subroutine s_write_parallel_data_files(q_cons_vf, t_step, bc_type, beta)
-
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in) :: t_step
-        type(scalar_field), intent(inout), optional :: beta
-        type(integer_field), &
-            dimension(1:num_dims, -1:1), &
-            intent(in) :: bc_type
-
-#ifdef MFC_MPI
-
-        integer :: ifile, ierr, data_size
-        integer, dimension(MPI_STATUS_SIZE) :: status
-        integer(kind=MPI_OFFSET_kind) :: disp
-        integer(kind=MPI_OFFSET_kind) :: m_MOK, n_MOK, p_MOK
-        integer(kind=MPI_OFFSET_kind) :: WP_MOK, var_MOK, str_MOK
-        integer(kind=MPI_OFFSET_kind) :: NVARS_MOK
-        integer(kind=MPI_OFFSET_kind) :: MOK
-
-        character(LEN=path_len + 2*name_len) :: file_loc
-        logical :: file_exist, dir_check
-        character(len=10) :: t_step_string
-
-        integer :: i !< Generic loop iterator
-
-        integer :: alt_sys !< Altered system size for the lagrangian subgrid bubble model
-
-        ! Down sampling variables
-        integer :: m_ds, n_ds, p_ds
-        integer :: m_glb_ds, n_glb_ds, p_glb_ds
-        integer :: m_glb_save, n_glb_save, p_glb_save ! Global save size
-
-        if (down_sample) then
-            call s_downsample_data(q_cons_vf, q_cons_temp_ds, &
-                                   m_ds, n_ds, p_ds, m_glb_ds, n_glb_ds, p_glb_ds)
-        end if
-
-        if (present(beta)) then
-            alt_sys = sys_size + 1
-        else
-            alt_sys = sys_size
-        end if
-
-        if (file_per_process) then
-
-            call s_int_to_str(t_step, t_step_string)
-
-            ! Initialize MPI data I/O
-            if (down_sample) then
-                call s_initialize_mpi_data_ds(q_cons_temp_ds)
-            else
-                if (ib) then
-                    call s_initialize_mpi_data(q_cons_vf, ib_markers)
-                else
-                    call s_initialize_mpi_data(q_cons_vf)
-                end if
-            end if
-
-            if (proc_rank == 0) then
-                file_loc = trim(case_dir)//'/restart_data/lustre_'//trim(t_step_string)
-                call my_inquire(file_loc, dir_check)
-                if (dir_check .neqv. .true.) then
-                    call s_create_directory(trim(file_loc))
-                end if
-                call s_create_directory(trim(file_loc))
-            end if
-            call s_mpi_barrier()
-            call DelayFileAccess(proc_rank)
-
-            ! Initialize MPI data I/O
-            call s_initialize_mpi_data(q_cons_vf)
-
-            ! Open the file to write all flow variables
-            write (file_loc, '(I0,A,i7.7,A)') t_step, '_', proc_rank, '.dat'
-            file_loc = trim(case_dir)//'/restart_data/lustre_'//trim(t_step_string)//trim(mpiiofs)//trim(file_loc)
-            inquire (FILE=trim(file_loc), EXIST=file_exist)
-            if (file_exist .and. proc_rank == 0) then
-                call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr)
-            end if
-            call MPI_FILE_OPEN(MPI_COMM_SELF, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), &
-                               mpi_info_int, ifile, ierr)
-
-            if (down_sample) then
-                ! Size of local arrays
-                data_size = (m_ds + 3)*(n_ds + 3)*(p_ds + 3)
-                m_glb_save = m_glb_ds + 1
-                n_glb_save = n_glb_ds + 1
-                p_glb_save = p_glb_ds + 1
-            else
-                ! Size of local arrays
-                data_size = (m + 1)*(n + 1)*(p + 1)
-                m_glb_save = m_glb + 1
-                n_glb_save = n_glb + 1
-                p_glb_save = p_glb + 1
-            end if
-
-            ! Resize some integers so MPI can write even the biggest files
-            m_MOK = int(m_glb_save + 1, MPI_OFFSET_KIND)
-            n_MOK = int(n_glb_save + 1, MPI_OFFSET_KIND)
-            p_MOK = int(p_glb_save + 1, MPI_OFFSET_KIND)
-            WP_MOK = int(8._wp, MPI_OFFSET_KIND)
-            MOK = int(1._wp, MPI_OFFSET_KIND)
-            str_MOK = int(name_len, MPI_OFFSET_KIND)
-            NVARS_MOK = int(sys_size, MPI_OFFSET_KIND)
-
-            if (bubbles_euler) then
-                ! Write the data for each variable
-                do i = 1, sys_size
-                    var_MOK = int(i, MPI_OFFSET_KIND)
-
-                    call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, &
-                                            mpi_io_p, status, ierr)
-                end do
-                !Write pb and mv for non-polytropic qbmm
-                if (qbmm .and. .not. polytropic) then
-                    do i = sys_size + 1, sys_size + 2*nb*nnode
-                        var_MOK = int(i, MPI_OFFSET_KIND)
-
-                        call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, &
-                                                mpi_io_p, status, ierr)
-                    end do
-                end if
-            else
-                if (down_sample) then
-                    do i = 1, sys_size !TODO: check if correct (sys_size
-                        var_MOK = int(i, MPI_OFFSET_KIND)
-
-                        call MPI_FILE_WRITE_ALL(ifile, q_cons_temp_ds(i)%sf, data_size*mpi_io_type, &
-                                                mpi_io_p, status, ierr)
-                    end do
-                else
-                    do i = 1, sys_size !TODO: check if correct (sys_size
-                        var_MOK = int(i, MPI_OFFSET_KIND)
-
-                        call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, &
-                                                mpi_io_p, status, ierr)
-                    end do
-                end if
-            end if
-
-            call MPI_FILE_CLOSE(ifile, ierr)
-        else
-            ! Initialize MPI data I/O
-
-            if (ib) then
-                call s_initialize_mpi_data(q_cons_vf, ib_markers)
-            elseif (present(beta)) then
-                call s_initialize_mpi_data(q_cons_vf, beta=beta)
-            else
-                call s_initialize_mpi_data(q_cons_vf)
-            end if
-
-            write (file_loc, '(I0,A)') t_step, '.dat'
-            file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
-            inquire (FILE=trim(file_loc), EXIST=file_exist)
-            if (file_exist .and. proc_rank == 0) then
-                call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr)
-            end if
-            call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), &
-                               mpi_info_int, ifile, ierr)
-
-            ! Size of local arrays
-            data_size = (m + 1)*(n + 1)*(p + 1)
-
-            ! Resize some integers so MPI can write even the biggest files
-            m_MOK = int(m_glb + 1, MPI_OFFSET_KIND)
-            n_MOK = int(n_glb + 1, MPI_OFFSET_KIND)
-            p_MOK = int(p_glb + 1, MPI_OFFSET_KIND)
-            WP_MOK = int(8._wp, MPI_OFFSET_KIND)
-            MOK = int(1._wp, MPI_OFFSET_KIND)
-            str_MOK = int(name_len, MPI_OFFSET_KIND)
-            NVARS_MOK = int(alt_sys, MPI_OFFSET_KIND)
-
-            if (bubbles_euler) then
-                ! Write the data for each variable
-                do i = 1, sys_size
-                    var_MOK = int(i, MPI_OFFSET_KIND)
-
-                    ! Initial displacement to skip at beginning of file
-                    disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
-
-                    call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), &
-                                           'native', mpi_info_int, ierr)
-                    call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, &
-                                            mpi_io_p, status, ierr)
-                end do
-                !Write pb and mv for non-polytropic qbmm
-                if (qbmm .and. .not. polytropic) then
-                    do i = sys_size + 1, sys_size + 2*nb*nnode
-                        var_MOK = int(i, MPI_OFFSET_KIND)
-
-                        ! Initial displacement to skip at beginning of file
-                        disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
-
-                        call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), &
-                                               'native', mpi_info_int, ierr)
-                        call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, &
-                                                mpi_io_p, status, ierr)
-                    end do
-                end if
-            else
-                do i = 1, sys_size !TODO: check if correct (sys_size
-                    var_MOK = int(i, MPI_OFFSET_KIND)
-
-                    ! Initial displacement to skip at beginning of file
-                    disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
-
-                    call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), &
-                                           'native', mpi_info_int, ierr)
-                    call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, &
-                                            mpi_io_p, status, ierr)
-                end do
-            end if
-
-            ! Correction for the lagrangian subgrid bubble model
-            if (present(beta)) then
-                var_MOK = int(sys_size + 1, MPI_OFFSET_KIND)
-
-                ! Initial displacement to skip at beginning of file
-                disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
-
-                call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(sys_size + 1), &
-                                       'native', mpi_info_int, ierr)
-                call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(sys_size + 1)%sf, data_size*mpi_io_type, &
-                                        mpi_io_p, status, ierr)
-            end if
-
-            call MPI_FILE_CLOSE(ifile, ierr)
-
-            !Write ib data
-            if (ib) then
-                call s_write_parallel_ib_data(t_step)
-                ! write (file_loc, '(A)') 'ib.dat'
-                ! file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
-                ! call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), &
-                !                    mpi_info_int, ifile, ierr)
-
-                ! var_MOK = int(sys_size + 1, MPI_OFFSET_KIND)
-                ! disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1 + int(t_step/t_step_save))
-
-                ! call MPI_FILE_SET_VIEW(ifile, disp, MPI_INTEGER, MPI_IO_IB_DATA%view, &
-                !                        'native', mpi_info_int, ierr)
-                ! call MPI_FILE_WRITE_ALL(ifile, MPI_IO_IB_DATA%var%sf, data_size, &
-                !                         MPI_INTEGER, status, ierr)
-                ! call MPI_FILE_CLOSE(ifile, ierr)
-            end if
-
-        end if
-#endif
-
-    end subroutine s_write_parallel_data_files
-
-    !> @brief Writes immersed boundary marker data to a serial (per-processor) unformatted file.
-    subroutine s_write_serial_ib_data(time_step)
-
-        integer, intent(in) :: time_step
-        character(LEN=path_len + 2*name_len) :: file_path
-        character(LEN=path_len + 2*name_len) :: t_step_dir
-
-        ! Creating or overwriting the time-step root directory
-        write (t_step_dir, '(A,I0,A,I0)') trim(case_dir)//'/p_all'
-        write (t_step_dir, '(a,i0,a,i0)') trim(case_dir)//'/p_all/p', &
-            proc_rank, '/', time_step
-        write (file_path, '(A,I0,A)') trim(t_step_dir)//'/ib.dat'
-
-        open (2, FILE=trim(file_path), &
-              FORM='unformatted', &
-              STATUS='new')
-
-        $:GPU_UPDATE(host='[ib_markers%sf]')
-        write (2) ib_markers%sf(0:m, 0:n, 0:p); close (2)
-
-    end subroutine
-
-    !> @brief Writes immersed boundary marker data in parallel using MPI I/O.
-    subroutine s_write_parallel_ib_data(time_step)
-
-        integer, intent(in) :: time_step
-
-#ifdef MFC_MPI
-
-        character(LEN=path_len + 2*name_len) :: file_loc
-        integer(kind=MPI_OFFSET_kind) :: disp
-        integer(kind=MPI_OFFSET_kind) :: m_MOK, n_MOK, p_MOK
-        integer(kind=MPI_OFFSET_kind) :: WP_MOK, var_MOK, MOK
-        integer :: ifile, ierr, data_size
-        integer, dimension(MPI_STATUS_SIZE) :: status
-
-        $:GPU_UPDATE(host='[ib_markers%sf]')
-
-        ! Size of local arrays
-        data_size = (m + 1)*(n + 1)*(p + 1)
-        m_MOK = int(m_glb + 1, MPI_OFFSET_KIND)
-        n_MOK = int(n_glb + 1, MPI_OFFSET_KIND)
-        p_MOK = int(p_glb + 1, MPI_OFFSET_KIND)
-        WP_MOK = int(8._wp, MPI_OFFSET_KIND)
-        MOK = int(1._wp, MPI_OFFSET_KIND)
-
-        write (file_loc, '(A)') 'ib.dat'
-        file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
-        call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), &
-                           mpi_info_int, ifile, ierr)
-
-        var_MOK = int(sys_size + 1, MPI_OFFSET_KIND)
-        disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1 + int(time_step/t_step_save))
-        if (time_step == 0) disp = 0
-
-        call MPI_FILE_SET_VIEW(ifile, disp, MPI_INTEGER, MPI_IO_IB_DATA%view, &
-                               'native', mpi_info_int, ierr)
-        call MPI_FILE_WRITE_ALL(ifile, MPI_IO_IB_DATA%var%sf, data_size, &
-                                MPI_INTEGER, status, ierr)
-        call MPI_FILE_CLOSE(ifile, ierr)
-
-#endif
-
-    end subroutine s_write_parallel_ib_data
-
-    !> @brief Dispatches immersed boundary data output to the serial or parallel writer.
-    subroutine s_write_ib_data_file(time_step)
-
-        integer, intent(in) :: time_step
-
-        if (parallel_io) then
-            call s_write_parallel_ib_data(time_step)
-        else
-            call s_write_serial_ib_data(time_step)
-        end if
-
-    end subroutine
-
-    !>  This writes a formatted data file where the root processor
-    !!      can write out the CoM information
-    !!  @param t_step Current time-step
-    !!  @param c_mass_in Center of mass information
-    impure subroutine s_write_com_files(t_step, c_mass_in)
-
-        integer, intent(in) :: t_step
-        real(wp), dimension(num_fluids, 5), intent(in) :: c_mass_in
-        integer :: i !< Generic loop iterator
-        real(wp) :: nondim_time !< Non-dimensional time
-
-        ! Non-dimensional time calculation
-        if (t_step_old /= dflt_int) then
-            nondim_time = real(t_step + t_step_old, wp)*dt
-        else
-            nondim_time = real(t_step, wp)*dt
-        end if
-
-        if (proc_rank == 0) then
-            if (n == 0) then ! 1D simulation
-                do i = 1, num_fluids ! Loop through fluids
-                    write (i + 120, '(6X,4F24.12)') &
-                        nondim_time, &
-                        c_mass_in(i, 1), &
-                        c_mass_in(i, 2), &
-                        c_mass_in(i, 5)
-                end do
-            elseif (p == 0) then ! 2D simulation
-                do i = 1, num_fluids ! Loop through fluids
-                    write (i + 120, '(6X,5F24.12)') &
-                        nondim_time, &
-                        c_mass_in(i, 1), &
-                        c_mass_in(i, 2), &
-                        c_mass_in(i, 3), &
-                        c_mass_in(i, 5)
-                end do
-            else ! 3D simulation
-                do i = 1, num_fluids ! Loop through fluids
-                    write (i + 120, '(6X,6F24.12)') &
-                        nondim_time, &
-                        c_mass_in(i, 1), &
-                        c_mass_in(i, 2), &
-                        c_mass_in(i, 3), &
-                        c_mass_in(i, 4), &
-                        c_mass_in(i, 5)
-                end do
-            end if
-        end if
-
-    end subroutine s_write_com_files
-
-    !>  This writes a formatted data file for the flow probe information
-        !!  @param t_step Current time-step
-        !!  @param q_cons_vf Conservative variables
-        !!  @param accel_mag Acceleration magnitude information
-    impure subroutine s_write_probe_files(t_step, q_cons_vf, accel_mag)
-
-        integer, intent(in) :: t_step
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-        real(wp), dimension(0:m, 0:n, 0:p), intent(in) :: accel_mag
-
-        real(wp), dimension(-1:m) :: distx
-        real(wp), dimension(-1:n) :: disty
-        real(wp), dimension(-1:p) :: distz
-
-        ! The cell-averaged partial densities, density, velocity, pressure,
-        ! volume fractions, specific heat ratio function, liquid stiffness
-        ! function, and sound speed.
-        real(wp) :: lit_gamma, nbub
-        real(wp) :: rho
-        real(wp), dimension(num_vels) :: vel
-        real(wp) :: pres
-        real(wp) :: ptilde
-        real(wp) :: ptot
-        real(wp) :: alf
-        real(wp) :: alfgr
-        real(wp), dimension(num_fluids) :: alpha
-        real(wp) :: gamma
-        real(wp) :: pi_inf
-        real(wp) :: qv
-        real(wp) :: c
-        real(wp) :: M00, M10, M01, M20, M11, M02
-        real(wp) :: varR, varV
-        real(wp), dimension(Nb) :: nR, R, nRdot, Rdot
-        real(wp) :: nR3
-        real(wp) :: accel
-        real(wp) :: int_pres
-        real(wp) :: max_pres
-        real(wp), dimension(2) :: Re
-        real(wp), dimension(6) :: tau_e
-        real(wp) :: G_local
-        real(wp) :: dyn_p, T
-        real(wp) :: damage_state
-
-        integer :: i, j, k, l, s, d !< Generic loop iterator
-
-        real(wp) :: nondim_time !< Non-dimensional time
-
-        real(wp) :: tmp !<
-            !! Temporary variable to store quantity for mpi_allreduce
-
-        integer :: npts !< Number of included integral points
-        real(wp) :: rad, thickness !< For integral quantities
-        logical :: trigger !< For integral quantities
-
-        real(wp) :: rhoYks(1:num_species)
-
-        T = dflt_T_guess
-
-        ! Non-dimensional time calculation
-        if (time_stepper == 23) then
-            nondim_time = mytime
-        else
-            if (t_step_old /= dflt_int) then
-                nondim_time = real(t_step + t_step_old, wp)*dt
-            else
-                nondim_time = real(t_step, wp)*dt
-            end if
-        end if
-
-        do i = 1, num_probes
-            ! Zeroing out flow variables for all processors
-            rho = 0._wp
-            do s = 1, num_vels
-                vel(s) = 0._wp
-            end do
-            pres = 0._wp
-            gamma = 0._wp
-            pi_inf = 0._wp
-            qv = 0._wp
-            c = 0._wp
-            accel = 0._wp
-            nR = 0._wp; R = 0._wp
-            nRdot = 0._wp; Rdot = 0._wp
-            nbub = 0._wp
-            M00 = 0._wp
-            M10 = 0._wp
-            M01 = 0._wp
-            M20 = 0._wp
-            M11 = 0._wp
-            M02 = 0._wp
-            varR = 0._wp; varV = 0._wp
-            alf = 0._wp
-            do s = 1, (num_dims*(num_dims + 1))/2
-                tau_e(s) = 0._wp
-            end do
-            damage_state = 0._wp
-
-            ! Find probe location in terms of indices on a
-            ! specific processor
-            if (n == 0) then ! 1D simulation
-                if ((probe(i)%x >= x_cb(-1)) .and. (probe(i)%x <= x_cb(m))) then
-                    do s = -1, m
-                        distx(s) = x_cb(s) - probe(i)%x
-                        if (distx(s) < 0._wp) distx(s) = 1000._wp
-                    end do
-                    j = minloc(distx, 1)
-                    if (j == 1) j = 2 ! Pick first point if probe is at edge
-                    k = 0
-                    l = 0
-
-                    if (chemistry) then
-                        do d = 1, num_species
-                            rhoYks(d) = q_cons_vf(chemxb + d - 1)%sf(j - 2, k, l)
-                        end do
-                    end if
-
-                    ! Computing/Sharing necessary state variables
-                    if (elasticity) then
-                        call s_convert_to_mixture_variables(q_cons_vf, j - 2, k, l, &
-                                                            rho, gamma, pi_inf, qv, &
-                                                            Re, G_local, fluid_pp(:)%G)
-                    else
-                        call s_convert_to_mixture_variables(q_cons_vf, j - 2, k, l, &
-                                                            rho, gamma, pi_inf, qv)
-                    end if
-                    do s = 1, num_vels
-                        vel(s) = q_cons_vf(cont_idx%end + s)%sf(j - 2, k, l)/rho
-                    end do
-
-                    dyn_p = 0.5_wp*rho*dot_product(vel, vel)
-
-                    if (elasticity) then
-                        if (cont_damage) then
-                            damage_state = q_cons_vf(damage_idx)%sf(j - 2, k, l)
-                            G_local = G_local*max((1._wp - damage_state), 0._wp)
-                        end if
-
-                        call s_compute_pressure( &
-                            q_cons_vf(1)%sf(j - 2, k, l), &
-                            q_cons_vf(alf_idx)%sf(j - 2, k, l), &
-                            dyn_p, pi_inf, gamma, rho, qv, rhoYks(:), pres, T, &
-                            q_cons_vf(stress_idx%beg)%sf(j - 2, k, l), &
-                            q_cons_vf(mom_idx%beg)%sf(j - 2, k, l), G_local)
-                    else
-                        call s_compute_pressure( &
-                            q_cons_vf(E_idx)%sf(j - 2, k, l), &
-                            q_cons_vf(alf_idx)%sf(j - 2, k, l), &
-                            dyn_p, pi_inf, gamma, rho, qv, rhoYks, pres, T)
-                    end if
-
-                    if (model_eqns == 4) then
-                        lit_gamma = gammas(1)
-                    else if (elasticity) then
-                        tau_e(1) = q_cons_vf(stress_idx%end)%sf(j - 2, k, l)/rho
-                    end if
-
-                    if (bubbles_euler) then
-                        alf = q_cons_vf(alf_idx)%sf(j - 2, k, l)
-                        if (num_fluids == 3) then
-                            alfgr = q_cons_vf(alf_idx - 1)%sf(j - 2, k, l)
-                        end if
-                        do s = 1, nb
-                            nR(s) = q_cons_vf(bub_idx%rs(s))%sf(j - 2, k, l)
-                            nRdot(s) = q_cons_vf(bub_idx%vs(s))%sf(j - 2, k, l)
-                        end do
-
-                        if (adv_n) then
-                            nbub = q_cons_vf(n_idx)%sf(j - 2, k, l)
-                        else
-                            nR3 = 0._wp
-                            do s = 1, nb
-                                nR3 = nR3 + weight(s)*(nR(s)**3._wp)
-                            end do
-
-                            nbub = sqrt((4._wp*pi/3._wp)*nR3/alf)
-                        end if
-#ifdef DEBUG
-                        print *, 'In probe, nbub: ', nbub
-#endif
-                        if (qbmm) then
-                            M00 = q_cons_vf(bub_idx%moms(1, 1))%sf(j - 2, k, l)/nbub
-                            M10 = q_cons_vf(bub_idx%moms(1, 2))%sf(j - 2, k, l)/nbub
-                            M01 = q_cons_vf(bub_idx%moms(1, 3))%sf(j - 2, k, l)/nbub
-                            M20 = q_cons_vf(bub_idx%moms(1, 4))%sf(j - 2, k, l)/nbub
-                            M11 = q_cons_vf(bub_idx%moms(1, 5))%sf(j - 2, k, l)/nbub
-                            M02 = q_cons_vf(bub_idx%moms(1, 6))%sf(j - 2, k, l)/nbub
-
-                            M10 = M10/M00
-                            M01 = M01/M00
-                            M20 = M20/M00
-                            M11 = M11/M00
-                            M02 = M02/M00
-
-                            varR = M20 - M10**2._wp
-                            varV = M02 - M01**2._wp
-                        end if
-                        R(:) = nR(:)/nbub
-                        Rdot(:) = nRdot(:)/nbub
-
-                        ptilde = ptil(j - 2, k, l)
-                        ptot = pres - ptilde
-                    end if
-
-                    ! Compute mixture sound Speed
-                    call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, &
-                                                  ((gamma + 1._wp)*pres + pi_inf)/rho, alpha, 0._wp, 0._wp, c, qv)
-
-                    accel = accel_mag(j - 2, k, l)
-                end if
-            elseif (p == 0) then ! 2D simulation
-                if (chemistry) then
-                    do d = 1, num_species
-                        rhoYks(d) = q_cons_vf(chemxb + d - 1)%sf(j - 2, k - 2, l)
-                    end do
-                end if
-
-                if ((probe(i)%x >= x_cb(-1)) .and. (probe(i)%x <= x_cb(m))) then
-                    if ((probe(i)%y >= y_cb(-1)) .and. (probe(i)%y <= y_cb(n))) then
-                        do s = -1, m
-                            distx(s) = x_cb(s) - probe(i)%x
-                            if (distx(s) < 0._wp) distx(s) = 1000._wp
-                        end do
-                        do s = -1, n
-                            disty(s) = y_cb(s) - probe(i)%y
-                            if (disty(s) < 0._wp) disty(s) = 1000._wp
-                        end do
-                        j = minloc(distx, 1)
-                        k = minloc(disty, 1)
-                        if (j == 1) j = 2 ! Pick first point if probe is at edge
-                        if (k == 1) k = 2 ! Pick first point if probe is at edge
-                        l = 0
-
-                        ! Computing/Sharing necessary state variables
-                        call s_convert_to_mixture_variables(q_cons_vf, j - 2, k - 2, l, &
-                                                            rho, gamma, pi_inf, qv, &
-                                                            Re, G_local, fluid_pp(:)%G)
-                        do s = 1, num_vels
-                            vel(s) = q_cons_vf(cont_idx%end + s)%sf(j - 2, k - 2, l)/rho
-                        end do
-
-                        dyn_p = 0.5_wp*rho*dot_product(vel, vel)
-
-                        if (elasticity) then
-                            if (cont_damage) then
-                                damage_state = q_cons_vf(damage_idx)%sf(j - 2, k - 2, l)
-                                G_local = G_local*max((1._wp - damage_state), 0._wp)
-                            end if
-
-                            call s_compute_pressure( &
-                                q_cons_vf(1)%sf(j - 2, k - 2, l), &
-                                q_cons_vf(alf_idx)%sf(j - 2, k - 2, l), &
-                                dyn_p, pi_inf, gamma, rho, qv, &
-                                rhoYks, &
-                                pres, &
-                                T, &
-                                q_cons_vf(stress_idx%beg)%sf(j - 2, k - 2, l), &
-                                q_cons_vf(mom_idx%beg)%sf(j - 2, k - 2, l), G_local)
-                        else
-                            call s_compute_pressure(q_cons_vf(E_idx)%sf(j - 2, k - 2, l), &
-                                                    q_cons_vf(alf_idx)%sf(j - 2, k - 2, l), &
-                                                    dyn_p, pi_inf, gamma, rho, qv, &
-                                                    rhoYks, pres, T)
-                        end if
-
-                        if (model_eqns == 4) then
-                            lit_gamma = gs_min(1)
-                        else if (elasticity) then
-                            do s = 1, 3
-                                tau_e(s) = q_cons_vf(s)%sf(j - 2, k - 2, l)/rho
-                            end do
-                        end if
-
-                        if (bubbles_euler) then
-                            alf = q_cons_vf(alf_idx)%sf(j - 2, k - 2, l)
-                            do s = 1, nb
-                                nR(s) = q_cons_vf(bub_idx%rs(s))%sf(j - 2, k - 2, l)
-                                nRdot(s) = q_cons_vf(bub_idx%vs(s))%sf(j - 2, k - 2, l)
-                            end do
-
-                            if (adv_n) then
-                                nbub = q_cons_vf(n_idx)%sf(j - 2, k - 2, l)
-                            else
-                                nR3 = 0._wp
-                                do s = 1, nb
-                                    nR3 = nR3 + weight(s)*(nR(s)**3._wp)
-                                end do
-
-                                nbub = sqrt((4._wp*pi/3._wp)*nR3/alf)
-                            end if
-
-                            R(:) = nR(:)/nbub
-                            Rdot(:) = nRdot(:)/nbub
-                        end if
-                        ! Compute mixture sound speed
-                        call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, &
-                                                      ((gamma + 1._wp)*pres + pi_inf)/rho, alpha, 0._wp, 0._wp, c, qv)
-
-                    end if
-                end if
-            else ! 3D
-                if ((probe(i)%x >= x_cb(-1)) .and. (probe(i)%x <= x_cb(m))) then
-                    if ((probe(i)%y >= y_cb(-1)) .and. (probe(i)%y <= y_cb(n))) then
-                        if ((probe(i)%z >= z_cb(-1)) .and. (probe(i)%z <= z_cb(p))) then
-                            do s = -1, m
-                                distx(s) = x_cb(s) - probe(i)%x
-                                if (distx(s) < 0._wp) distx(s) = 1000._wp
-                            end do
-                            do s = -1, n
-                                disty(s) = y_cb(s) - probe(i)%y
-                                if (disty(s) < 0._wp) disty(s) = 1000._wp
-                            end do
-                            do s = -1, p
-                                distz(s) = z_cb(s) - probe(i)%z
-                                if (distz(s) < 0._wp) distz(s) = 1000._wp
-                            end do
-                            j = minloc(distx, 1)
-                            k = minloc(disty, 1)
-                            l = minloc(distz, 1)
-                            if (j == 1) j = 2 ! Pick first point if probe is at edge
-                            if (k == 1) k = 2 ! Pick first point if probe is at edge
-                            if (l == 1) l = 2 ! Pick first point if probe is at edge
-
-                            ! Computing/Sharing necessary state variables
-                            call s_convert_to_mixture_variables(q_cons_vf, j - 2, k - 2, l - 2, &
-                                                                rho, gamma, pi_inf, qv, &
-                                                                Re, G_local, fluid_pp(:)%G)
-                            do s = 1, num_vels
-                                vel(s) = q_cons_vf(cont_idx%end + s)%sf(j - 2, k - 2, l - 2)/rho
-                            end do
-
-                            dyn_p = 0.5_wp*rho*dot_product(vel, vel)
-
-                            if (chemistry) then
-                                do d = 1, num_species
-                                    rhoYks(d) = q_cons_vf(chemxb + d - 1)%sf(j - 2, k - 2, l - 2)
-                                end do
-                            end if
-
-                            if (elasticity) then
-                                if (cont_damage) then
-                                    damage_state = q_cons_vf(damage_idx)%sf(j - 2, k - 2, l - 2)
-                                    G_local = G_local*max((1._wp - damage_state), 0._wp)
-                                end if
-
-                                call s_compute_pressure( &
-                                    q_cons_vf(1)%sf(j - 2, k - 2, l - 2), &
-                                    q_cons_vf(alf_idx)%sf(j - 2, k - 2, l - 2), &
-                                    dyn_p, pi_inf, gamma, rho, qv, &
-                                    rhoYks, pres, T, &
-                                    q_cons_vf(stress_idx%beg)%sf(j - 2, k - 2, l - 2), &
-                                    q_cons_vf(mom_idx%beg)%sf(j - 2, k - 2, l - 2), G_local)
-                            else
-                                call s_compute_pressure(q_cons_vf(E_idx)%sf(j - 2, k - 2, l - 2), &
-                                                        q_cons_vf(alf_idx)%sf(j - 2, k - 2, l - 2), &
-                                                        dyn_p, pi_inf, gamma, rho, qv, &
-                                                        rhoYks, pres, T)
-                            end if
-
-                            ! Compute mixture sound speed
-                            call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, &
-                                                          ((gamma + 1._wp)*pres + pi_inf)/rho, alpha, 0._wp, 0._wp, c, qv)
-
-                            accel = accel_mag(j - 2, k - 2, l - 2)
-                        end if
-                    end if
-                end if
-            end if
-            if (num_procs > 1) then
-                #:for VAR in ['rho','pres','gamma','pi_inf','qv','c','accel']
-                    tmp = ${VAR}$
-                    call s_mpi_allreduce_sum(tmp, ${VAR}$)
-                #:endfor
-
-                do s = 1, num_vels
-                    tmp = vel(s)
-                    call s_mpi_allreduce_sum(tmp, vel(s))
-                end do
-
-                if (bubbles_euler) then
-                    #:for VAR in ['alf','alfgr','nbub','nR(1)','nRdot(1)','M00','R(1)','Rdot(1)','ptilde','ptot']
-                        tmp = ${VAR}$
-                        call s_mpi_allreduce_sum(tmp, ${VAR}$)
-                    #:endfor
-
-                    if (qbmm) then
-                        #:for VAR in ['varR','varV','M10','M01','M20','M02']
-                            tmp = ${VAR}$
-                            call s_mpi_allreduce_sum(tmp, ${VAR}$)
-                        #:endfor
-                    end if
-                end if
-
-                if (elasticity) then
-                    do s = 1, (num_dims*(num_dims + 1))/2
-                        tmp = tau_e(s)
-                        call s_mpi_allreduce_sum(tmp, tau_e(s))
-                    end do
-                end if
-
-                if (cont_damage) then
-                    tmp = damage_state
-                    call s_mpi_allreduce_sum(tmp, damage_state)
-                end if
-            end if
-            if (proc_rank == 0) then
-                if (n == 0) then
-                    if (bubbles_euler .and. (num_fluids <= 2)) then
-                        if (qbmm) then
-                            write (i + 30, '(6x,f12.6,14f28.16)') &
-                                nondim_time, &
-                                rho, &
-                                vel(1), &
-                                pres, &
-                                alf, &
-                                R(1), &
-                                Rdot(1), &
-                                nR(1), &
-                                nRdot(1), &
-                                varR, &
-                                varV, &
-                                M10, &
-                                M01, &
-                                M20, &
-                                M02
-                        else
-                            write (i + 30, '(6x,f12.6,8f24.8)') &
-                                nondim_time, &
-                                rho, &
-                                vel(1), &
-                                pres, &
-                                alf, &
-                                R(1), &
-                                Rdot(1), &
-                                nR(1), &
-                                nRdot(1)
-                            ! ptilde, &
-                            ! ptot
-                        end if
-                    else if (bubbles_euler .and. (num_fluids == 3)) then
-                        write (i + 30, '(6x,f12.6,f24.8,f24.8,f24.8,f24.8,f24.8,'// &
-                               'f24.8,f24.8,f24.8,f24.8,f24.8, f24.8)') &
-                            nondim_time, &
-                            rho, &
-                            vel(1), &
-                            pres, &
-                            alf, &
-                            alfgr, &
-                            nR(1), &
-                            nRdot(1), &
-                            R(1), &
-                            Rdot(1), &
-                            ptilde, &
-                            ptot
-                    else if (bubbles_euler .and. num_fluids == 4) then
-                        write (i + 30, '(6x,f12.6,f24.8,f24.8,f24.8,f24.8,'// &
-                               'f24.8,f24.8,f24.8,f24.8,f24.8,f24.8,f24.8,f24.8,f24.8)') &
-                            nondim_time, &
-                            q_cons_vf(1)%sf(j - 2, 0, 0), &
-                            q_cons_vf(2)%sf(j - 2, 0, 0), &
-                            q_cons_vf(3)%sf(j - 2, 0, 0), &
-                            q_cons_vf(4)%sf(j - 2, 0, 0), &
-                            q_cons_vf(5)%sf(j - 2, 0, 0), &
-                            q_cons_vf(6)%sf(j - 2, 0, 0), &
-                            q_cons_vf(7)%sf(j - 2, 0, 0), &
-                            q_cons_vf(8)%sf(j - 2, 0, 0), &
-                            q_cons_vf(9)%sf(j - 2, 0, 0), &
-                            q_cons_vf(10)%sf(j - 2, 0, 0), &
-                            nbub, &
-                            R(1), &
-                            Rdot(1)
-                    else
-                        write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8)') &
-                            nondim_time, &
-                            rho, &
-                            vel(1), &
-                            pres
-                    end if
-                elseif (p == 0) then
-                    if (bubbles_euler) then
-                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                            write (i + 30, '(6X,10F24.8)') &
-                                nondim_time, &
-                                rho, &
-                                vel(1), &
-                                vel(2), &
-                                pres, &
-                                alf, &
-                                nR(1), &
-                                nRdot(1), &
-                                R(1), &
-                                Rdot(1)
-                        #:endif
-                    else if (elasticity) then
-                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                            write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8,F24.8,'// &
-                                   'F24.8,F24.8,F24.8)') &
-                                nondim_time, &
-                                rho, &
-                                vel(1), &
-                                vel(2), &
-                                pres, &
-                                tau_e(1), &
-                                tau_e(2), &
-                                tau_e(3)
-                        #:endif
-                    else
-                        write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8)') &
-                            nondim_time, &
-                            rho, &
-                            vel(1), &
-                            pres
-                        print *, 'time =', nondim_time, 'rho =', rho, 'pres =', pres
-                    end if
-                else
-                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                        write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8,F24.8,'// &
-                               'F24.8,F24.8,F24.8,F24.8,F24.8,'// &
-                               'F24.8)') &
-                            nondim_time, &
-                            rho, &
-                            vel(1), &
-                            vel(2), &
-                            vel(3), &
-                            pres, &
-                            gamma, &
-                            pi_inf, &
-                            qv, &
-                            c, &
-                            accel
-                    #:endif
-                end if
-            end if
-        end do
-
-        if (integral_wrt .and. bubbles_euler) then
-            if (n == 0) then ! 1D simulation
-                do i = 1, num_integrals
-                    int_pres = 0._wp
-                    max_pres = 0._wp
-                    k = 0; l = 0
-                    npts = 0
-                    do j = 1, m
-                        pres = 0._wp
-                        do s = 1, num_vels
-                            vel(s) = 0._wp
-                        end do
-                        rho = 0._wp
-                        pres = 0._wp
-                        gamma = 0._wp
-                        pi_inf = 0._wp
-                        qv = 0._wp
-
-                        if ((integral(i)%xmin <= x_cb(j)) .and. (integral(i)%xmax >= x_cb(j))) then
-                            npts = npts + 1
-                            call s_convert_to_mixture_variables(q_cons_vf, j, k, l, &
-                                                                rho, gamma, pi_inf, qv, Re)
-                            do s = 1, num_vels
-                                vel(s) = q_cons_vf(cont_idx%end + s)%sf(j, k, l)/rho
-                            end do
-
-                            pres = ( &
-                                   (q_cons_vf(E_idx)%sf(j, k, l) - &
-                                    0.5_wp*(q_cons_vf(mom_idx%beg)%sf(j, k, l)**2._wp)/rho)/ &
-                                   (1._wp - q_cons_vf(alf_idx)%sf(j, k, l)) - &
-                                   pi_inf - qv &
-                                   )/gamma
-                            int_pres = int_pres + (pres - 1._wp)**2._wp
-                        end if
-                    end do
-                    int_pres = sqrt(int_pres/(1._wp*npts))
-
-                    if (num_procs > 1) then
-                        tmp = int_pres
-                        call s_mpi_allreduce_sum(tmp, int_pres)
-                    end if
-
-                    if (proc_rank == 0) then
-                        if (bubbles_euler .and. (num_fluids <= 2)) then
-                            write (i + 70, '(6x,f12.6,f24.8)') &
-                                nondim_time, int_pres
-                        end if
-                    end if
-                end do
-            elseif (p == 0) then
-                if (num_integrals /= 3) then
-                    call s_mpi_abort('Incorrect number of integrals')
-                end if
-
-                rad = integral(1)%xmax
-                thickness = integral(1)%xmin
-
-                do i = 1, num_integrals
-                    int_pres = 0._wp
-                    max_pres = 0._wp
-                    l = 0
-                    npts = 0
-                    do j = 1, m
-                        do k = 1, n
-                            trigger = .false.
-                            if (i == 1) then
-                                !inner portion
-                                if (sqrt(x_cb(j)**2._wp + y_cb(k)**2._wp) < (rad - 0.5_wp*thickness)) &
-                                    trigger = .true.
-                            elseif (i == 2) then
-                                !net region
-                                if (sqrt(x_cb(j)**2._wp + y_cb(k)**2._wp) > (rad - 0.5_wp*thickness) .and. &
-                                    sqrt(x_cb(j)**2._wp + y_cb(k)**2._wp) < (rad + 0.5_wp*thickness)) &
-                                    trigger = .true.
-                            elseif (i == 3) then
-                                !everything else
-                                if (sqrt(x_cb(j)**2._wp + y_cb(k)**2._wp) > (rad + 0.5_wp*thickness)) &
-                                    trigger = .true.
-                            end if
-
-                            pres = 0._wp
-                            do s = 1, num_vels
-                                vel(s) = 0._wp
-                            end do
-                            rho = 0._wp
-                            pres = 0._wp
-                            gamma = 0._wp
-                            pi_inf = 0._wp
-                            qv = 0._wp
-
-                            if (trigger) then
-                                npts = npts + 1
-                                call s_convert_to_mixture_variables(q_cons_vf, j, k, l, &
-                                                                    rho, gamma, pi_inf, qv, Re)
-                                do s = 1, num_vels
-                                    vel(s) = q_cons_vf(cont_idx%end + s)%sf(j, k, l)/rho
-                                end do
-
-                                pres = ( &
-                                       (q_cons_vf(E_idx)%sf(j, k, l) - &
-                                        0.5_wp*(q_cons_vf(mom_idx%beg)%sf(j, k, l)**2._wp)/rho)/ &
-                                       (1._wp - q_cons_vf(alf_idx)%sf(j, k, l)) - &
-                                       pi_inf - qv &
-                                       )/gamma
-                                int_pres = int_pres + abs(pres - 1._wp)
-                                max_pres = max(max_pres, abs(pres - 1._wp))
-                            end if
-
-                        end do
-                    end do
-
-                    if (npts > 0) then
-                        int_pres = int_pres/(1._wp*npts)
-                    else
-                        int_pres = 0._wp
-                    end if
-
-                    if (num_procs > 1) then
-                        tmp = int_pres
-                        call s_mpi_allreduce_sum(tmp, int_pres)
-
-                        tmp = max_pres
-                        call s_mpi_allreduce_max(tmp, max_pres)
-                    end if
-
-                    if (proc_rank == 0) then
-                        if (bubbles_euler .and. (num_fluids <= 2)) then
-                            write (i + 70, '(6x,f12.6,f24.8,f24.8)') &
-                                nondim_time, int_pres, max_pres
-                        end if
-                    end if
-                end do
-            end if
-        end if
-
-    end subroutine s_write_probe_files
-
-    !>  The goal of this subroutine is to write to the run-time
-        !!      information file basic footer information applicable to
-        !!      the current computation and to close the file when done.
-        !!      The footer contains the stability criteria extrema over
-        !!      all of the time-steps and the simulation run-time.
-    impure subroutine s_close_run_time_information_file
-
-        real(wp) :: run_time !< Run-time of the simulation
-
-        ! Writing the footer of and closing the run-time information file
-        write (3, '(A)') '    '
-        write (3, '(A)') ''
-
-        write (3, '(A,F9.6)') 'ICFL Max: ', icfl_max
-        if (viscous) write (3, '(A,F9.6)') 'VCFL Max: ', vcfl_max
-        if (viscous) write (3, '(A,F10.6)') 'Rc Min: ', Rc_min
-
-        call cpu_time(run_time)
-
-        write (3, '(A)') ''
-        write (3, '(A,I0,A)') 'Run-time: ', int(anint(run_time)), 's'
-        write (3, '(A)') '    '
-        close (3)
-
-    end subroutine s_close_run_time_information_file
-
-    !> Closes communication files
-    impure subroutine s_close_com_files()
-
-        integer :: i !< Generic loop iterator
-        do i = 1, num_fluids
-            close (i + 120)
-        end do
-
-    end subroutine s_close_com_files
-
-    !> Closes probe files
-    impure subroutine s_close_probe_files
-
-        integer :: i !< Generic loop iterator
-
-        do i = 1, num_probes
-            close (i + 30)
-        end do
-
-    end subroutine s_close_probe_files
-
-    !>  The computation of parameters, the allocation of memory,
-        !!      the association of pointers and/or the execution of any
-        !!      other procedures that are necessary to setup the module.
-    impure subroutine s_initialize_data_output_module
-
-        integer :: i, m_ds, n_ds, p_ds
-
-        ! Allocating/initializing ICFL, VCFL, CCFL and Rc stability criteria
-        if (run_time_info) then
-            @:ALLOCATE(icfl_sf(0:m, 0:n, 0:p))
-            icfl_max = 0._wp
-
-            if (viscous) then
-                @:ALLOCATE(vcfl_sf(0:m, 0:n, 0:p))
-                @:ALLOCATE(Rc_sf  (0:m, 0:n, 0:p))
-
-                vcfl_max = 0._wp
-                Rc_min = 1.e3_wp
-            end if
-        end if
-
-        if (probe_wrt) then
-            @:ALLOCATE(c_mass(num_fluids,5))
-        end if
-
-        if (down_sample) then
-            m_ds = int((m + 1)/3) - 1
-            n_ds = int((n + 1)/3) - 1
-            p_ds = int((p + 1)/3) - 1
-
-            allocate (q_cons_temp_ds(1:sys_size))
-            do i = 1, sys_size
-                allocate (q_cons_temp_ds(i)%sf(-1:m_ds + 1, -1:n_ds + 1, -1:p_ds + 1))
-            end do
-        end if
-
-    end subroutine s_initialize_data_output_module
-
-    !> Module deallocation and/or disassociation procedures
-    impure subroutine s_finalize_data_output_module
-
-        integer :: i
-
-        if (probe_wrt) then
-            @:DEALLOCATE(c_mass)
-        end if
-
-        if (run_time_info) then
-            ! Deallocating the ICFL, VCFL, CCFL, and Rc stability criteria
-            @:DEALLOCATE(icfl_sf)
-            if (viscous) then
-                @:DEALLOCATE(vcfl_sf, Rc_sf)
-            end if
-        end if
-
-        if (down_sample) then
-            do i = 1, sys_size
-                deallocate (q_cons_temp_ds(i)%sf)
-            end do
-            deallocate (q_cons_temp_ds)
-        end if
-
-    end subroutine s_finalize_data_output_module
-
-end module m_data_output
+!>
+!! @file
+!! @brief Contains module m_data_output
+
+#:include 'macros.fpp'
+#:include 'case.fpp'
+
+!> @brief Writes solution data, run-time stability diagnostics (ICFL, VCFL, CCFL, Rc), and probe/center-of-mass files
+module m_data_output
+
+    use m_derived_types        !< Definitions of the derived types
+
+    use m_global_parameters    !< Definitions of the global parameters
+
+    use m_mpi_proxy            !< Message passing interface (MPI) module proxy
+
+    use m_variables_conversion !< State variables type conversion procedures
+
+    use m_compile_specific
+
+    use m_helper
+
+    use m_helper_basic         !< Functions to compare floating point numbers
+
+    use m_sim_helpers
+
+    use m_delay_file_access
+
+    use m_ibm
+
+    use m_boundary_common
+
+    use m_re_visc              !< Non-Newtonian viscosity computations
+
+    implicit none
+
+    private; 
+    public :: s_initialize_data_output_module, &
+              s_open_run_time_information_file, &
+              s_open_com_files, &
+              s_open_probe_files, &
+              s_write_run_time_information, &
+              s_write_data_files, &
+              s_write_serial_data_files, &
+              s_write_parallel_data_files, &
+              s_write_com_files, &
+              s_write_probe_files, &
+              s_close_run_time_information_file, &
+              s_close_com_files, &
+              s_close_probe_files, &
+              s_finalize_data_output_module, &
+              s_write_ib_data_file
+
+    real(wp), allocatable, dimension(:, :, :) :: icfl_sf  !< ICFL stability criterion
+    real(wp), allocatable, dimension(:, :, :) :: vcfl_sf  !< VCFL stability criterion
+    real(wp), allocatable, dimension(:, :, :) :: ccfl_sf  !< CCFL stability criterion
+    real(wp), allocatable, dimension(:, :, :) :: Rc_sf  !< Rc stability criterion
+    real(wp), public, allocatable, dimension(:, :) :: c_mass
+    $:GPU_DECLARE(create='[icfl_sf,vcfl_sf,ccfl_sf,Rc_sf,c_mass]')
+
+    real(wp) :: icfl_max_loc, icfl_max_glb !< ICFL stability extrema on local and global grids
+    real(wp) :: vcfl_max_loc, vcfl_max_glb !< VCFL stability extrema on local and global grids
+    real(wp) :: ccfl_max_loc, ccfl_max_glb !< CCFL stability extrema on local and global grids
+    real(wp) :: Rc_min_loc, Rc_min_glb !< Rc   stability extrema on local and global grids
+    $:GPU_DECLARE(create='[icfl_max_loc,icfl_max_glb,vcfl_max_loc,vcfl_max_glb]')
+    $:GPU_DECLARE(create='[ccfl_max_loc,ccfl_max_glb,Rc_min_loc,Rc_min_glb]')
+
+    !> @name ICFL, VCFL, CCFL and Rc stability criteria extrema over all the time-steps
+    !> @{
+    real(wp) :: icfl_max !< ICFL criterion maximum
+    real(wp) :: vcfl_max !< VCFL criterion maximum
+    real(wp) :: ccfl_max !< CCFL criterion maximum
+    real(wp) :: Rc_min !< Rc criterion maximum
+    !> @}
+
+    type(scalar_field), allocatable, dimension(:) :: q_cons_temp_ds
+
+contains
+
+    !> Write data files. Dispatch subroutine that replaces procedure pointer.
+        !! @param q_cons_vf Conservative variables
+        !! @param q_T_sf Temperature scalar field
+        !! @param q_prim_vf Primitive variables
+        !! @param t_step Current time step
+        !! @param bc_type Boundary condition type
+        !! @param beta Eulerian void fraction from lagrangian bubbles
+    impure subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta)
+
+        type(scalar_field), &
+            dimension(sys_size), &
+            intent(inout) :: q_cons_vf
+
+        type(scalar_field), &
+            intent(inout) :: q_T_sf
+
+        type(scalar_field), &
+            dimension(sys_size), &
+            intent(inout) :: q_prim_vf
+
+        integer, intent(in) :: t_step
+
+        type(scalar_field), &
+            intent(inout), optional :: beta
+
+        type(integer_field), &
+            dimension(1:num_dims, -1:1), &
+            intent(in) :: bc_type
+
+        if (.not. parallel_io) then
+            call s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta)
+        else
+            call s_write_parallel_data_files(q_cons_vf, t_step, bc_type, beta)
+        end if
+
+    end subroutine s_write_data_files
+
+    !>  The purpose of this subroutine is to open a new or pre-
+        !!          existing run-time information file and append to it the
+        !!      basic header information relevant to current simulation.
+        !!      In general, this requires generating a table header for
+        !!      those stability criteria which will be written at every
+        !!      time-step.
+    impure subroutine s_open_run_time_information_file
+
+        character(LEN=name_len), parameter :: file_name = 'run_time.inf' !<
+            !! Name of the run-time information file
+
+        character(LEN=path_len + name_len) :: file_path !<
+            !! Relative path to a file in the case directory
+
+        character(LEN=8) :: file_date !<
+            !! Creation date of the run-time information file
+
+        ! Opening the run-time information file
+        file_path = trim(case_dir)//'/'//trim(file_name)
+
+        open (3, FILE=trim(file_path), &
+              FORM='formatted', &
+              STATUS='replace')
+
+        write (3, '(A)') 'Description: Stability information at '// &
+            'each time-step of the simulation. This'
+        write (3, '(13X,A)') 'data is composed of the inviscid '// &
+            'Courant–Friedrichs–Lewy (ICFL)'
+        write (3, '(13X,A)') 'number, the viscous CFL (VCFL) number, '// &
+            'the capillary CFL (CCFL)'
+        write (3, '(13X,A)') 'number and the cell Reynolds (Rc) '// &
+            'number. Please note that only'
+        write (3, '(13X,A)') 'those stability conditions pertinent '// &
+            'to the physics included in'
+        write (3, '(13X,A)') 'the current computation are displayed.'
+
+        call date_and_time(DATE=file_date)
+
+        write (3, '(A)') 'Date: '//file_date(5:6)//'/'// &
+            file_date(7:8)//'/'// &
+            file_date(3:4)
+
+        write (3, '(A)') ''; write (3, '(A)') ''
+
+        ! Generating table header for the stability criteria to be outputted
+        write (3, '(13X,A9,13X,A10,13X,A10,13X,A10)', advance="no") &
+            trim('Time-step'), trim('dt'), trim('Time'), trim('ICFL Max')
+
+        if (viscous) then
+            write (3, '(13X,A10,13X,A16)', advance="no") &
+                trim('VCFL Max'), trim('Rc Min')
+        end if
+
+        write (3, *) ! new line
+
+    end subroutine s_open_run_time_information_file
+
+    !>  This opens a formatted data file where the root processor
+        !!      can write out the CoM information
+    impure subroutine s_open_com_files()
+
+        character(len=path_len + 3*name_len) :: file_path !<
+            !! Relative path to the CoM file in the case directory
+        integer :: i !< Generic loop iterator
+
+        do i = 1, num_fluids
+            ! Generating the relative path to the CoM data file
+            write (file_path, '(A,I0,A)') '/fluid', i, '_com.dat'
+            file_path = trim(case_dir)//trim(file_path)
+            ! Creating the formatted data file and setting up its
+            ! structure
+            open (i + 120, file=trim(file_path), &
+                  form='formatted', &
+                  position='append', &
+                  status='unknown')
+            if (n == 0) then
+                write (i + 120, '(A)') '    Non-Dimensional Time '// &
+                    '    Total Mass '// &
+                    '    x-loc '// &
+                    '    Total Volume    '
+            elseif (p == 0) then
+                write (i + 120, '(A)') '    Non-Dimensional Time '// &
+                    '    Total Mass '// &
+                    '    x-loc '// &
+                    '    y-loc '// &
+                    '    Total Volume    '
+            else
+                write (i + 120, '(A)') '    Non-Dimensional Time '// &
+                    '    Total Mass '// &
+                    '    x-loc '// &
+                    '    y-loc '// &
+                    '    z-loc '// &
+                    '    Total Volume    '
+            end if
+        end do
+    end subroutine s_open_com_files
+
+    !>  This opens a formatted data file where the root processor
+        !!      can write out flow probe information
+    impure subroutine s_open_probe_files
+
+        character(LEN=path_len + 3*name_len) :: file_path !<
+            !! Relative path to the probe data file in the case directory
+
+        integer :: i !< Generic loop iterator
+        logical :: file_exist
+
+        do i = 1, num_probes
+            ! Generating the relative path to the data file
+            write (file_path, '(A,I0,A)') '/D/probe', i, '_prim.dat'
+            file_path = trim(case_dir)//trim(file_path)
+
+            ! Creating the formatted data file and setting up its
+            ! structure
+            inquire (file=trim(file_path), exist=file_exist)
+
+            if (file_exist) then
+                open (i + 30, FILE=trim(file_path), &
+                      FORM='formatted', &
+                      STATUS='old', &
+                      POSITION='append')
+            else
+                open (i + 30, FILE=trim(file_path), &
+                      FORM='formatted', &
+                      STATUS='unknown')
+            end if
+        end do
+
+        if (integral_wrt) then
+            do i = 1, num_integrals
+                write (file_path, '(A,I0,A)') '/D/integral', i, '_prim.dat'
+                file_path = trim(case_dir)//trim(file_path)
+
+                open (i + 70, FILE=trim(file_path), &
+                      FORM='formatted', &
+                      POSITION='append', &
+                      STATUS='unknown')
+            end do
+        end if
+
+    end subroutine s_open_probe_files
+
+    !>  The goal of the procedure is to output to the run-time
+        !!      information file the stability criteria extrema in the
+        !!      entire computational domain and at the given time-step.
+        !!      Moreover, the subroutine is also in charge of tracking
+        !!      these stability criteria extrema over all time-steps.
+        !!  @param q_prim_vf Cell-average primitive variables
+        !!  @param t_step Current time step
+    impure subroutine s_write_run_time_information(q_prim_vf, t_step)
+
+        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
+        integer, intent(in) :: t_step
+
+        real(wp) :: rho        !< Cell-avg. density
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha      !< Cell-avg. volume fraction
+            real(wp), dimension(3) :: vel        !< Cell-avg. velocity
+            real(wp), dimension(3, 2) :: Re_visc_per_phase  !< Per-phase Re_visc
+        #:else
+            real(wp), dimension(num_fluids) :: alpha      !< Cell-avg. volume fraction
+            real(wp), dimension(num_vels) :: vel        !< Cell-avg. velocity
+            real(wp), dimension(num_fluids, 2) :: Re_visc_per_phase  !< Per-phase Re_visc
+        #:endif
+        real(wp) :: vel_sum    !< Cell-avg. velocity sum
+        real(wp) :: pres       !< Cell-avg. pressure
+        real(wp) :: gamma      !< Cell-avg. sp. heat ratio
+        real(wp) :: pi_inf     !< Cell-avg. liquid stiffness function
+        real(wp) :: qv         !< Cell-avg. internal energy reference value
+        real(wp) :: c          !< Cell-avg. sound speed
+        real(wp) :: H          !< Cell-avg. enthalpy
+        real(wp), dimension(2) :: Re         !< Cell-avg. Reynolds numbers
+        integer :: j, k, l
+
+        ! Computing Stability Criteria at Current Time-step
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,vel, alpha, Re, Re_visc_per_phase, rho, vel_sum, pres, gamma, pi_inf, c, H, qv]')
+        do l = 0, p
+            do k = 0, n
+                do j = 0, m
+                    call s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, qv, j, k, l)
+
+                    ! For non-Newtonian fluids, compute variable Re based on shear rate
+                    if (any_non_newtonian) then
+                        call s_compute_re_visc(q_prim_vf, alpha, j, k, l, Re_visc_per_phase)
+                        call s_compute_mixture_re(alpha, Re_visc_per_phase, Re)
+                    end if
+
+                    call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, alpha, vel_sum, 0._wp, c, qv)
+
+                    if (viscous) then
+                        call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl_sf, vcfl_sf, Rc_sf)
+                    else
+                        call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl_sf)
+                    end if
+
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        ! end: Computing Stability Criteria at Current Time-step
+
+        ! Determining local stability criteria extrema at current time-step
+
+#ifdef _CRAYFTN
+        $:GPU_UPDATE(host='[icfl_sf]')
+
+        if (viscous) then
+            $:GPU_UPDATE(host='[vcfl_sf,Rc_sf]')
+        end if
+
+        icfl_max_loc = maxval(icfl_sf)
+
+        if (viscous) then
+            vcfl_max_loc = maxval(vcfl_sf)
+            Rc_min_loc = minval(Rc_sf)
+        end if
+#else
+        #:call GPU_PARALLEL(copyout='[icfl_max_loc]', copyin='[icfl_sf]')
+            icfl_max_loc = maxval(icfl_sf)
+        #:endcall GPU_PARALLEL
+        if (viscous .or. dummy) then
+            #:call GPU_PARALLEL(copyout='[vcfl_max_loc, Rc_min_loc]', copyin='[vcfl_sf,Rc_sf]')
+                vcfl_max_loc = maxval(vcfl_sf)
+                Rc_min_loc = minval(Rc_sf)
+            #:endcall GPU_PARALLEL
+        end if
+#endif
+
+        ! Determining global stability criteria extrema at current time-step
+        if (num_procs > 1) then
+            call s_mpi_reduce_stability_criteria_extrema(icfl_max_loc, &
+                                                         vcfl_max_loc, &
+                                                         Rc_min_loc, &
+                                                         icfl_max_glb, &
+                                                         vcfl_max_glb, &
+                                                         Rc_min_glb)
+        else
+            icfl_max_glb = icfl_max_loc
+            if (viscous) vcfl_max_glb = vcfl_max_loc
+            if (viscous) Rc_min_glb = Rc_min_loc
+        end if
+
+        ! Determining the stability criteria extrema over all the time-steps
+        if (icfl_max_glb > icfl_max) icfl_max = icfl_max_glb
+
+        if (viscous) then
+            if (vcfl_max_glb > vcfl_max) vcfl_max = vcfl_max_glb
+            if (Rc_min_glb < Rc_min) Rc_min = Rc_min_glb
+        end if
+
+        ! Outputting global stability criteria extrema at current time-step
+        if (proc_rank == 0) then
+            write (3, '(13X,I9,13X,F10.6,13X,F10.6,13X,F10.6)', advance="no") &
+                t_step, dt, mytime, icfl_max_glb
+
+            if (viscous) then
+                write (3, '(13X,F10.6,13X,ES16.6)', advance="no") &
+                    vcfl_max_glb, &
+                    Rc_min_glb
+            end if
+
+            write (3, *) ! new line
+
+            if (.not. f_approx_equal(icfl_max_glb, icfl_max_glb)) then
+                call s_mpi_abort('ICFL is NaN. Exiting.')
+            elseif (icfl_max_glb > 1._wp) then
+                print *, 'icfl', icfl_max_glb
+                call s_mpi_abort('ICFL is greater than 1.0. Exiting.')
+            end if
+
+            if (viscous) then
+                if (.not. f_approx_equal(vcfl_max_glb, vcfl_max_glb)) then
+                    call s_mpi_abort('VCFL is NaN. Exiting.')
+                elseif (vcfl_max_glb > 1._wp) then
+                    print *, 'vcfl', vcfl_max_glb
+                    call s_mpi_abort('VCFL is greater than 1.0. Exiting.')
+                end if
+            end if
+        end if
+
+        call s_mpi_barrier()
+
+    end subroutine s_write_run_time_information
+
+    !>  The goal of this subroutine is to output the grid and
+        !!      conservative variables data files for given time-step.
+        !!  @param q_cons_vf Cell-average conservative variables
+        !!  @param q_T_sf Temperature scalar field
+        !!  @param q_prim_vf Cell-average primitive variables
+        !!  @param t_step Current time-step
+        !!  @param bc_type Boundary condition type
+        !!  @param beta Eulerian void fraction from lagrangian bubbles
+    impure subroutine s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta)
+
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        type(scalar_field), intent(inout) :: q_T_sf
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
+        integer, intent(in) :: t_step
+        type(scalar_field), intent(inout), optional :: beta
+        type(integer_field), dimension(1:num_dims, -1:1), intent(in) :: bc_type
+
+        character(LEN=path_len + 2*name_len) :: t_step_dir !<
+            !! Relative path to the current time-step directory
+
+        character(LEN=path_len + 3*name_len) :: file_path !<
+            !! Relative path to the grid and conservative variables data files
+
+        logical :: file_exist !<
+            !! Logical used to check existence of current time-step directory
+
+        character(LEN=15) :: FMT
+
+        integer :: i, j, k, l, r
+
+        real(wp) :: gamma, lit_gamma, pi_inf, qv !< Temporary EOS params
+
+        ! Creating or overwriting the time-step root directory
+        write (t_step_dir, '(A,I0,A,I0)') trim(case_dir)//'/p_all'
+
+        ! Creating or overwriting the current time-step directory
+        write (t_step_dir, '(a,i0,a,i0)') trim(case_dir)//'/p_all/p', &
+            proc_rank, '/', t_step
+
+        file_path = trim(t_step_dir)//'/.'
+        call my_inquire(file_path, file_exist)
+        if (file_exist) call s_delete_directory(trim(t_step_dir))
+        call s_create_directory(trim(t_step_dir))
+
+        ! Writing the grid data file in the x-direction
+        file_path = trim(t_step_dir)//'/x_cb.dat'
+
+        open (2, FILE=trim(file_path), &
+              FORM='unformatted', &
+              STATUS='new')
+        write (2) x_cb(-1:m); close (2)
+
+        ! Writing the grid data files in the y- and z-directions
+        if (n > 0) then
+
+            file_path = trim(t_step_dir)//'/y_cb.dat'
+
+            open (2, FILE=trim(file_path), &
+                  FORM='unformatted', &
+                  STATUS='new')
+            write (2) y_cb(-1:n); close (2)
+
+            if (p > 0) then
+
+                file_path = trim(t_step_dir)//'/z_cb.dat'
+
+                open (2, FILE=trim(file_path), &
+                      FORM='unformatted', &
+                      STATUS='new')
+                write (2) z_cb(-1:p); close (2)
+
+            end if
+
+        end if
+
+        ! Writing the conservative variables data files
+        do i = 1, sys_size
+            write (file_path, '(A,I0,A)') trim(t_step_dir)//'/q_cons_vf', &
+                i, '.dat'
+
+            open (2, FILE=trim(file_path), &
+                  FORM='unformatted', &
+                  STATUS='new')
+
+            write (2) q_cons_vf(i)%sf(0:m, 0:n, 0:p); close (2)
+        end do
+
+        ! Lagrangian beta (void fraction) written as q_cons_vf(sys_size+1) to
+        ! match the parallel I/O path and allow post_process to read it.
+        if (bubbles_lagrange) then
+            write (file_path, '(A,I0,A)') trim(t_step_dir)//'/q_cons_vf', &
+                sys_size + 1, '.dat'
+
+            open (2, FILE=trim(file_path), &
+                  FORM='unformatted', &
+                  STATUS='new')
+
+            write (2) beta%sf(0:m, 0:n, 0:p); close (2)
+        end if
+
+        if (qbmm .and. .not. polytropic) then
+            do i = 1, nb
+                do r = 1, nnode
+                    write (file_path, '(A,I0,A)') trim(t_step_dir)//'/pb', &
+                        sys_size + (i - 1)*nnode + r, '.dat'
+
+                    open (2, FILE=trim(file_path), &
+                          FORM='unformatted', &
+                          STATUS='new')
+
+                    write (2) pb_ts(1)%sf(0:m, 0:n, 0:p, r, i); close (2)
+                end do
+            end do
+
+            do i = 1, nb
+                do r = 1, nnode
+                    write (file_path, '(A,I0,A)') trim(t_step_dir)//'/mv', &
+                        sys_size + (i - 1)*nnode + r, '.dat'
+
+                    open (2, FILE=trim(file_path), &
+                          FORM='unformatted', &
+                          STATUS='new')
+
+                    write (2) mv_ts(1)%sf(0:m, 0:n, 0:p, r, i); close (2)
+                end do
+            end do
+        end if
+
+        ! Writing the IB markers
+        if (ib) then
+            call s_write_serial_ib_data(t_step)
+            ! write (file_path, '(A,I0,A)') trim(t_step_dir)//'/ib.dat'
+
+            ! open (2, FILE=trim(file_path), &
+            !       FORM='unformatted', &
+            !       STATUS='new')
+
+            ! write (2) ib_markers%sf(0:m, 0:n, 0:p); close (2)
+        end if
+
+        gamma = gammas(1)
+        lit_gamma = gs_min(1)
+        pi_inf = pi_infs(1)
+        qv = qvs(1)
+
+        if (precision == 1) then
+            FMT = "(2F30.3)"
+        else
+            FMT = "(2F40.14)"
+        end if
+
+        ! writing an output directory
+        write (t_step_dir, '(A,I0,A,I0)') trim(case_dir)//'/D'
+        file_path = trim(t_step_dir)//'/.'
+
+        inquire (FILE=trim(file_path), EXIST=file_exist)
+
+        if (.not. file_exist) call s_create_directory(trim(t_step_dir))
+
+        if ((prim_vars_wrt .or. (n == 0 .and. p == 0)) .and. (.not. igr)) then
+            call s_convert_conservative_to_primitive_variables(q_cons_vf, q_T_sf, q_prim_vf, idwint)
+            do i = 1, sys_size
+                $:GPU_UPDATE(host='[q_prim_vf(i)%sf(:,:,:)]')
+            end do
+            ! q_prim_vf(bubxb) stores the value of nb needed in riemann solvers, so replace with true primitive value (=1._wp)
+            if (qbmm) then
+                q_prim_vf(bubxb)%sf = 1._wp
+            end if
+        end if
+
+        !1D
+        if (n == 0 .and. p == 0) then
+
+            if (model_eqns == 2 .and. (.not. igr)) then
+                do i = 1, sys_size
+                    write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/prim.', i, '.', proc_rank, '.', t_step, '.dat'
+
+                    open (2, FILE=trim(file_path))
+                    do j = 0, m
+                        ! todo: revisit change here
+                        if (((i >= adv_idx%beg) .and. (i <= adv_idx%end))) then
+                            write (2, FMT) x_cb(j), q_cons_vf(i)%sf(j, 0, 0)
+                        else
+                            write (2, FMT) x_cb(j), q_prim_vf(i)%sf(j, 0, 0)
+                        end if
+                    end do
+                    close (2)
+                end do
+            end if
+
+            do i = 1, sys_size
+                write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/cons.', i, '.', proc_rank, '.', t_step, '.dat'
+
+                open (2, FILE=trim(file_path))
+                do j = 0, m
+                    write (2, FMT) x_cb(j), q_cons_vf(i)%sf(j, 0, 0)
+                end do
+                close (2)
+            end do
+
+            if (qbmm .and. .not. polytropic) then
+                do i = 1, nb
+                    do r = 1, nnode
+                        write (file_path, '(A,I0,A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/pres.', i, '.', r, '.', proc_rank, '.', t_step, '.dat'
+
+                        open (2, FILE=trim(file_path))
+                        do j = 0, m
+                            write (2, FMT) x_cb(j), pb_ts(1)%sf(j, 0, 0, r, i)
+                        end do
+                        close (2)
+                    end do
+                end do
+                do i = 1, nb
+                    do r = 1, nnode
+                        write (file_path, '(A,I0,A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/mv.', i, '.', r, '.', proc_rank, '.', t_step, '.dat'
+
+                        open (2, FILE=trim(file_path))
+                        do j = 0, m
+                            write (2, FMT) x_cb(j), mv_ts(1)%sf(j, 0, 0, r, i)
+                        end do
+                        close (2)
+                    end do
+                end do
+            end if
+        end if
+
+        if (precision == 1) then
+            FMT = "(3F30.7)"
+        else
+            FMT = "(3F40.14)"
+        end if
+
+        ! 2D
+        if ((n > 0) .and. (p == 0)) then
+            do i = 1, sys_size
+                write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/cons.', i, '.', proc_rank, '.', t_step, '.dat'
+                open (2, FILE=trim(file_path))
+                do j = 0, m
+                    do k = 0, n
+                        write (2, FMT) x_cb(j), y_cb(k), q_cons_vf(i)%sf(j, k, 0)
+                    end do
+                    write (2, *)
+                end do
+                close (2)
+            end do
+
+            if (present(beta)) then
+                write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/beta.', i, '.', proc_rank, '.', t_step, '.dat'
+                open (2, FILE=trim(file_path))
+                do j = 0, m
+                    do k = 0, n
+                        write (2, FMT) x_cb(j), y_cb(k), beta%sf(j, k, 0)
+                    end do
+                    write (2, *)
+                end do
+                close (2)
+            end if
+
+            if (qbmm .and. .not. polytropic) then
+                do i = 1, nb
+                    do r = 1, nnode
+                        write (file_path, '(A,I0,A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/pres.', i, '.', r, '.', proc_rank, '.', t_step, '.dat'
+
+                        open (2, FILE=trim(file_path))
+                        do j = 0, m
+                            do k = 0, n
+                                write (2, FMT) x_cb(j), y_cb(k), pb_ts(1)%sf(j, k, 0, r, i)
+                            end do
+                        end do
+                        close (2)
+                    end do
+                end do
+                do i = 1, nb
+                    do r = 1, nnode
+                        write (file_path, '(A,I0,A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/mv.', i, '.', r, '.', proc_rank, '.', t_step, '.dat'
+
+                        open (2, FILE=trim(file_path))
+                        do j = 0, m
+                            do k = 0, n
+                                write (2, FMT) x_cb(j), y_cb(k), mv_ts(1)%sf(j, k, 0, r, i)
+                            end do
+                        end do
+                        close (2)
+                    end do
+                end do
+            end if
+
+            if (prim_vars_wrt .and. (.not. igr)) then
+                do i = 1, sys_size
+                    write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/prim.', i, '.', proc_rank, '.', t_step, '.dat'
+
+                    open (2, FILE=trim(file_path))
+
+                    do j = 0, m
+                        do k = 0, n
+                            if (((i >= cont_idx%beg) .and. (i <= cont_idx%end)) &
+                                .or. &
+                                ((i >= adv_idx%beg) .and. (i <= adv_idx%end)) &
+                                ) then
+                                write (2, FMT) x_cb(j), y_cb(k), q_cons_vf(i)%sf(j, k, 0)
+                            else
+                                write (2, FMT) x_cb(j), y_cb(k), q_prim_vf(i)%sf(j, k, 0)
+                            end if
+                        end do
+                        write (2, *)
+                    end do
+                    close (2)
+                end do
+            end if
+        end if
+
+        if (precision == 1) then
+            FMT = "(4F30.7)"
+        else
+            FMT = "(4F40.14)"
+        end if
+
+        ! 3D
+        if (p > 0) then
+            do i = 1, sys_size
+                write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/cons.', i, '.', proc_rank, '.', t_step, '.dat'
+                open (2, FILE=trim(file_path))
+                do j = 0, m
+                    do k = 0, n
+                        do l = 0, p
+                            write (2, FMT) x_cb(j), y_cb(k), z_cb(l), q_cons_vf(i)%sf(j, k, l)
+                        end do
+                        write (2, *)
+                    end do
+                    write (2, *)
+                end do
+                close (2)
+            end do
+
+            if (present(beta)) then
+                write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/beta.', i, '.', proc_rank, '.', t_step, '.dat'
+                open (2, FILE=trim(file_path))
+                do j = 0, m
+                    do k = 0, n
+                        do l = 0, p
+                            write (2, FMT) x_cb(j), y_cb(k), z_cb(l), beta%sf(j, k, l)
+                        end do
+                        write (2, *)
+                    end do
+                    write (2, *)
+                end do
+                close (2)
+            end if
+
+            if (qbmm .and. .not. polytropic) then
+                do i = 1, nb
+                    do r = 1, nnode
+                        write (file_path, '(A,I0,A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/pres.', i, '.', r, '.', proc_rank, '.', t_step, '.dat'
+
+                        open (2, FILE=trim(file_path))
+                        do j = 0, m
+                            do k = 0, n
+                                do l = 0, p
+                                    write (2, FMT) x_cb(j), y_cb(k), z_cb(l), pb_ts(1)%sf(j, k, l, r, i)
+                                end do
+                            end do
+                        end do
+                        close (2)
+                    end do
+                end do
+                do i = 1, nb
+                    do r = 1, nnode
+                        write (file_path, '(A,I0,A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/mv.', i, '.', r, '.', proc_rank, '.', t_step, '.dat'
+
+                        open (2, FILE=trim(file_path))
+                        do j = 0, m
+                            do k = 0, n
+                                do l = 0, p
+                                    write (2, FMT) x_cb(j), y_cb(k), z_cb(l), mv_ts(1)%sf(j, k, l, r, i)
+                                end do
+                            end do
+                        end do
+                        close (2)
+                    end do
+                end do
+            end if
+
+            if (prim_vars_wrt .and. (.not. igr)) then
+                do i = 1, sys_size
+                    write (file_path, '(A,I0,A,I2.2,A,I6.6,A)') trim(t_step_dir)//'/prim.', i, '.', proc_rank, '.', t_step, '.dat'
+
+                    open (2, FILE=trim(file_path))
+
+                    do j = 0, m
+                        do k = 0, n
+                            do l = 0, p
+                                if (((i >= cont_idx%beg) .and. (i <= cont_idx%end)) &
+                                    .or. &
+                                    ((i >= adv_idx%beg) .and. (i <= adv_idx%end)) &
+                                    .or. &
+                                    ((i >= chemxb) .and. (i <= chemxe)) &
+                                    ) then
+                                    write (2, FMT) x_cb(j), y_cb(k), z_cb(l), q_cons_vf(i)%sf(j, k, l)
+                                else
+                                    write (2, FMT) x_cb(j), y_cb(k), z_cb(l), q_prim_vf(i)%sf(j, k, l)
+                                end if
+                            end do
+                            write (2, *)
+                        end do
+                        write (2, *)
+                    end do
+                    close (2)
+                end do
+            end if
+        end if
+
+    end subroutine s_write_serial_data_files
+
+    !>  The goal of this subroutine is to output the grid and
+        !!      conservative variables data files for given time-step.
+        !!  @param q_cons_vf Cell-average conservative variables
+        !!  @param t_step Current time-step
+        !!  @param bc_type Boundary condition type
+        !!  @param beta Eulerian void fraction from lagrangian bubbles
+    impure subroutine s_write_parallel_data_files(q_cons_vf, t_step, bc_type, beta)
+
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        integer, intent(in) :: t_step
+        type(scalar_field), intent(inout), optional :: beta
+        type(integer_field), &
+            dimension(1:num_dims, -1:1), &
+            intent(in) :: bc_type
+
+#ifdef MFC_MPI
+
+        integer :: ifile, ierr, data_size
+        integer, dimension(MPI_STATUS_SIZE) :: status
+        integer(kind=MPI_OFFSET_kind) :: disp
+        integer(kind=MPI_OFFSET_kind) :: m_MOK, n_MOK, p_MOK
+        integer(kind=MPI_OFFSET_kind) :: WP_MOK, var_MOK, str_MOK
+        integer(kind=MPI_OFFSET_kind) :: NVARS_MOK
+        integer(kind=MPI_OFFSET_kind) :: MOK
+
+        character(LEN=path_len + 2*name_len) :: file_loc
+        logical :: file_exist, dir_check
+        character(len=10) :: t_step_string
+
+        integer :: i !< Generic loop iterator
+
+        integer :: alt_sys !< Altered system size for the lagrangian subgrid bubble model
+
+        ! Down sampling variables
+        integer :: m_ds, n_ds, p_ds
+        integer :: m_glb_ds, n_glb_ds, p_glb_ds
+        integer :: m_glb_save, n_glb_save, p_glb_save ! Global save size
+
+        if (down_sample) then
+            call s_downsample_data(q_cons_vf, q_cons_temp_ds, &
+                                   m_ds, n_ds, p_ds, m_glb_ds, n_glb_ds, p_glb_ds)
+        end if
+
+        if (present(beta)) then
+            alt_sys = sys_size + 1
+        else
+            alt_sys = sys_size
+        end if
+
+        if (file_per_process) then
+
+            call s_int_to_str(t_step, t_step_string)
+
+            ! Initialize MPI data I/O
+            if (down_sample) then
+                call s_initialize_mpi_data_ds(q_cons_temp_ds)
+            else
+                if (ib) then
+                    call s_initialize_mpi_data(q_cons_vf, ib_markers)
+                else
+                    call s_initialize_mpi_data(q_cons_vf)
+                end if
+            end if
+
+            if (proc_rank == 0) then
+                file_loc = trim(case_dir)//'/restart_data/lustre_'//trim(t_step_string)
+                call my_inquire(file_loc, dir_check)
+                if (dir_check .neqv. .true.) then
+                    call s_create_directory(trim(file_loc))
+                end if
+                call s_create_directory(trim(file_loc))
+            end if
+            call s_mpi_barrier()
+            call DelayFileAccess(proc_rank)
+
+            ! Initialize MPI data I/O
+            call s_initialize_mpi_data(q_cons_vf)
+
+            ! Open the file to write all flow variables
+            write (file_loc, '(I0,A,i7.7,A)') t_step, '_', proc_rank, '.dat'
+            file_loc = trim(case_dir)//'/restart_data/lustre_'//trim(t_step_string)//trim(mpiiofs)//trim(file_loc)
+            inquire (FILE=trim(file_loc), EXIST=file_exist)
+            if (file_exist .and. proc_rank == 0) then
+                call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr)
+            end if
+            call MPI_FILE_OPEN(MPI_COMM_SELF, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), &
+                               mpi_info_int, ifile, ierr)
+
+            if (down_sample) then
+                ! Size of local arrays
+                data_size = (m_ds + 3)*(n_ds + 3)*(p_ds + 3)
+                m_glb_save = m_glb_ds + 1
+                n_glb_save = n_glb_ds + 1
+                p_glb_save = p_glb_ds + 1
+            else
+                ! Size of local arrays
+                data_size = (m + 1)*(n + 1)*(p + 1)
+                m_glb_save = m_glb + 1
+                n_glb_save = n_glb + 1
+                p_glb_save = p_glb + 1
+            end if
+
+            ! Resize some integers so MPI can write even the biggest files
+            m_MOK = int(m_glb_save + 1, MPI_OFFSET_KIND)
+            n_MOK = int(n_glb_save + 1, MPI_OFFSET_KIND)
+            p_MOK = int(p_glb_save + 1, MPI_OFFSET_KIND)
+            WP_MOK = int(8._wp, MPI_OFFSET_KIND)
+            MOK = int(1._wp, MPI_OFFSET_KIND)
+            str_MOK = int(name_len, MPI_OFFSET_KIND)
+            NVARS_MOK = int(sys_size, MPI_OFFSET_KIND)
+
+            if (bubbles_euler) then
+                ! Write the data for each variable
+                do i = 1, sys_size
+                    var_MOK = int(i, MPI_OFFSET_KIND)
+
+                    call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, &
+                                            mpi_io_p, status, ierr)
+                end do
+                !Write pb and mv for non-polytropic qbmm
+                if (qbmm .and. .not. polytropic) then
+                    do i = sys_size + 1, sys_size + 2*nb*nnode
+                        var_MOK = int(i, MPI_OFFSET_KIND)
+
+                        call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, &
+                                                mpi_io_p, status, ierr)
+                    end do
+                end if
+            else
+                if (down_sample) then
+                    do i = 1, sys_size !TODO: check if correct (sys_size
+                        var_MOK = int(i, MPI_OFFSET_KIND)
+
+                        call MPI_FILE_WRITE_ALL(ifile, q_cons_temp_ds(i)%sf, data_size*mpi_io_type, &
+                                                mpi_io_p, status, ierr)
+                    end do
+                else
+                    do i = 1, sys_size !TODO: check if correct (sys_size
+                        var_MOK = int(i, MPI_OFFSET_KIND)
+
+                        call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, &
+                                                mpi_io_p, status, ierr)
+                    end do
+                end if
+            end if
+
+            call MPI_FILE_CLOSE(ifile, ierr)
+        else
+            ! Initialize MPI data I/O
+
+            if (ib) then
+                call s_initialize_mpi_data(q_cons_vf, ib_markers)
+            elseif (present(beta)) then
+                call s_initialize_mpi_data(q_cons_vf, beta=beta)
+            else
+                call s_initialize_mpi_data(q_cons_vf)
+            end if
+
+            write (file_loc, '(I0,A)') t_step, '.dat'
+            file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+            inquire (FILE=trim(file_loc), EXIST=file_exist)
+            if (file_exist .and. proc_rank == 0) then
+                call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr)
+            end if
+            call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), &
+                               mpi_info_int, ifile, ierr)
+
+            ! Size of local arrays
+            data_size = (m + 1)*(n + 1)*(p + 1)
+
+            ! Resize some integers so MPI can write even the biggest files
+            m_MOK = int(m_glb + 1, MPI_OFFSET_KIND)
+            n_MOK = int(n_glb + 1, MPI_OFFSET_KIND)
+            p_MOK = int(p_glb + 1, MPI_OFFSET_KIND)
+            WP_MOK = int(8._wp, MPI_OFFSET_KIND)
+            MOK = int(1._wp, MPI_OFFSET_KIND)
+            str_MOK = int(name_len, MPI_OFFSET_KIND)
+            NVARS_MOK = int(alt_sys, MPI_OFFSET_KIND)
+
+            if (bubbles_euler) then
+                ! Write the data for each variable
+                do i = 1, sys_size
+                    var_MOK = int(i, MPI_OFFSET_KIND)
+
+                    ! Initial displacement to skip at beginning of file
+                    disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
+
+                    call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), &
+                                           'native', mpi_info_int, ierr)
+                    call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, &
+                                            mpi_io_p, status, ierr)
+                end do
+                !Write pb and mv for non-polytropic qbmm
+                if (qbmm .and. .not. polytropic) then
+                    do i = sys_size + 1, sys_size + 2*nb*nnode
+                        var_MOK = int(i, MPI_OFFSET_KIND)
+
+                        ! Initial displacement to skip at beginning of file
+                        disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
+
+                        call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), &
+                                               'native', mpi_info_int, ierr)
+                        call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, &
+                                                mpi_io_p, status, ierr)
+                    end do
+                end if
+            else
+                do i = 1, sys_size !TODO: check if correct (sys_size
+                    var_MOK = int(i, MPI_OFFSET_KIND)
+
+                    ! Initial displacement to skip at beginning of file
+                    disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
+
+                    call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), &
+                                           'native', mpi_info_int, ierr)
+                    call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size*mpi_io_type, &
+                                            mpi_io_p, status, ierr)
+                end do
+            end if
+
+            ! Correction for the lagrangian subgrid bubble model
+            if (present(beta)) then
+                var_MOK = int(sys_size + 1, MPI_OFFSET_KIND)
+
+                ! Initial displacement to skip at beginning of file
+                disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
+
+                call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(sys_size + 1), &
+                                       'native', mpi_info_int, ierr)
+                call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(sys_size + 1)%sf, data_size*mpi_io_type, &
+                                        mpi_io_p, status, ierr)
+            end if
+
+            call MPI_FILE_CLOSE(ifile, ierr)
+
+            !Write ib data
+            if (ib) then
+                call s_write_parallel_ib_data(t_step)
+                ! write (file_loc, '(A)') 'ib.dat'
+                ! file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+                ! call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), &
+                !                    mpi_info_int, ifile, ierr)
+
+                ! var_MOK = int(sys_size + 1, MPI_OFFSET_KIND)
+                ! disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1 + int(t_step/t_step_save))
+
+                ! call MPI_FILE_SET_VIEW(ifile, disp, MPI_INTEGER, MPI_IO_IB_DATA%view, &
+                !                        'native', mpi_info_int, ierr)
+                ! call MPI_FILE_WRITE_ALL(ifile, MPI_IO_IB_DATA%var%sf, data_size, &
+                !                         MPI_INTEGER, status, ierr)
+                ! call MPI_FILE_CLOSE(ifile, ierr)
+            end if
+
+        end if
+#endif
+
+    end subroutine s_write_parallel_data_files
+
+    !> @brief Writes immersed boundary marker data to a serial (per-processor) unformatted file.
+    subroutine s_write_serial_ib_data(time_step)
+
+        integer, intent(in) :: time_step
+        character(LEN=path_len + 2*name_len) :: file_path
+        character(LEN=path_len + 2*name_len) :: t_step_dir
+
+        ! Creating or overwriting the time-step root directory
+        write (t_step_dir, '(A,I0,A,I0)') trim(case_dir)//'/p_all'
+        write (t_step_dir, '(a,i0,a,i0)') trim(case_dir)//'/p_all/p', &
+            proc_rank, '/', time_step
+        write (file_path, '(A,I0,A)') trim(t_step_dir)//'/ib.dat'
+
+        open (2, FILE=trim(file_path), &
+              FORM='unformatted', &
+              STATUS='new')
+
+        $:GPU_UPDATE(host='[ib_markers%sf]')
+        write (2) ib_markers%sf(0:m, 0:n, 0:p); close (2)
+
+    end subroutine
+
+    !> @brief Writes immersed boundary marker data in parallel using MPI I/O.
+    subroutine s_write_parallel_ib_data(time_step)
+
+        integer, intent(in) :: time_step
+
+#ifdef MFC_MPI
+
+        character(LEN=path_len + 2*name_len) :: file_loc
+        integer(kind=MPI_OFFSET_kind) :: disp
+        integer(kind=MPI_OFFSET_kind) :: m_MOK, n_MOK, p_MOK
+        integer(kind=MPI_OFFSET_kind) :: WP_MOK, var_MOK, MOK
+        integer :: ifile, ierr, data_size
+        integer, dimension(MPI_STATUS_SIZE) :: status
+
+        $:GPU_UPDATE(host='[ib_markers%sf]')
+
+        ! Size of local arrays
+        data_size = (m + 1)*(n + 1)*(p + 1)
+        m_MOK = int(m_glb + 1, MPI_OFFSET_KIND)
+        n_MOK = int(n_glb + 1, MPI_OFFSET_KIND)
+        p_MOK = int(p_glb + 1, MPI_OFFSET_KIND)
+        WP_MOK = int(8._wp, MPI_OFFSET_KIND)
+        MOK = int(1._wp, MPI_OFFSET_KIND)
+
+        write (file_loc, '(A)') 'ib.dat'
+        file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+        call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), &
+                           mpi_info_int, ifile, ierr)
+
+        var_MOK = int(sys_size + 1, MPI_OFFSET_KIND)
+        disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1 + int(time_step/t_step_save))
+        if (time_step == 0) disp = 0
+
+        call MPI_FILE_SET_VIEW(ifile, disp, MPI_INTEGER, MPI_IO_IB_DATA%view, &
+                               'native', mpi_info_int, ierr)
+        call MPI_FILE_WRITE_ALL(ifile, MPI_IO_IB_DATA%var%sf, data_size, &
+                                MPI_INTEGER, status, ierr)
+        call MPI_FILE_CLOSE(ifile, ierr)
+
+#endif
+
+    end subroutine s_write_parallel_ib_data
+
+    !> @brief Dispatches immersed boundary data output to the serial or parallel writer.
+    subroutine s_write_ib_data_file(time_step)
+
+        integer, intent(in) :: time_step
+
+        if (parallel_io) then
+            call s_write_parallel_ib_data(time_step)
+        else
+            call s_write_serial_ib_data(time_step)
+        end if
+
+    end subroutine
+
+    !>  This writes a formatted data file where the root processor
+    !!      can write out the CoM information
+    !!  @param t_step Current time-step
+    !!  @param c_mass_in Center of mass information
+    impure subroutine s_write_com_files(t_step, c_mass_in)
+
+        integer, intent(in) :: t_step
+        real(wp), dimension(num_fluids, 5), intent(in) :: c_mass_in
+        integer :: i !< Generic loop iterator
+        real(wp) :: nondim_time !< Non-dimensional time
+
+        ! Non-dimensional time calculation
+        if (t_step_old /= dflt_int) then
+            nondim_time = real(t_step + t_step_old, wp)*dt
+        else
+            nondim_time = real(t_step, wp)*dt
+        end if
+
+        if (proc_rank == 0) then
+            if (n == 0) then ! 1D simulation
+                do i = 1, num_fluids ! Loop through fluids
+                    write (i + 120, '(6X,4F24.12)') &
+                        nondim_time, &
+                        c_mass_in(i, 1), &
+                        c_mass_in(i, 2), &
+                        c_mass_in(i, 5)
+                end do
+            elseif (p == 0) then ! 2D simulation
+                do i = 1, num_fluids ! Loop through fluids
+                    write (i + 120, '(6X,5F24.12)') &
+                        nondim_time, &
+                        c_mass_in(i, 1), &
+                        c_mass_in(i, 2), &
+                        c_mass_in(i, 3), &
+                        c_mass_in(i, 5)
+                end do
+            else ! 3D simulation
+                do i = 1, num_fluids ! Loop through fluids
+                    write (i + 120, '(6X,6F24.12)') &
+                        nondim_time, &
+                        c_mass_in(i, 1), &
+                        c_mass_in(i, 2), &
+                        c_mass_in(i, 3), &
+                        c_mass_in(i, 4), &
+                        c_mass_in(i, 5)
+                end do
+            end if
+        end if
+
+    end subroutine s_write_com_files
+
+    !>  This writes a formatted data file for the flow probe information
+        !!  @param t_step Current time-step
+        !!  @param q_cons_vf Conservative variables
+        !!  @param accel_mag Acceleration magnitude information
+    impure subroutine s_write_probe_files(t_step, q_cons_vf, accel_mag)
+
+        integer, intent(in) :: t_step
+        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
+        real(wp), dimension(0:m, 0:n, 0:p), intent(in) :: accel_mag
+
+        real(wp), dimension(-1:m) :: distx
+        real(wp), dimension(-1:n) :: disty
+        real(wp), dimension(-1:p) :: distz
+
+        ! The cell-averaged partial densities, density, velocity, pressure,
+        ! volume fractions, specific heat ratio function, liquid stiffness
+        ! function, and sound speed.
+        real(wp) :: lit_gamma, nbub
+        real(wp) :: rho
+        real(wp), dimension(num_vels) :: vel
+        real(wp) :: pres
+        real(wp) :: ptilde
+        real(wp) :: ptot
+        real(wp) :: alf
+        real(wp) :: alfgr
+        real(wp), dimension(num_fluids) :: alpha
+        real(wp) :: gamma
+        real(wp) :: pi_inf
+        real(wp) :: qv
+        real(wp) :: c
+        real(wp) :: M00, M10, M01, M20, M11, M02
+        real(wp) :: varR, varV
+        real(wp), dimension(Nb) :: nR, R, nRdot, Rdot
+        real(wp) :: nR3
+        real(wp) :: accel
+        real(wp) :: int_pres
+        real(wp) :: max_pres
+        real(wp), dimension(2) :: Re
+        real(wp), dimension(6) :: tau_e
+        real(wp) :: G_local
+        real(wp) :: dyn_p, T
+        real(wp) :: damage_state
+
+        integer :: i, j, k, l, s, d !< Generic loop iterator
+
+        real(wp) :: nondim_time !< Non-dimensional time
+
+        real(wp) :: tmp !<
+            !! Temporary variable to store quantity for mpi_allreduce
+
+        integer :: npts !< Number of included integral points
+        real(wp) :: rad, thickness !< For integral quantities
+        logical :: trigger !< For integral quantities
+
+        real(wp) :: rhoYks(1:num_species)
+
+        T = dflt_T_guess
+
+        ! Non-dimensional time calculation
+        if (time_stepper == 23) then
+            nondim_time = mytime
+        else
+            if (t_step_old /= dflt_int) then
+                nondim_time = real(t_step + t_step_old, wp)*dt
+            else
+                nondim_time = real(t_step, wp)*dt
+            end if
+        end if
+
+        do i = 1, num_probes
+            ! Zeroing out flow variables for all processors
+            rho = 0._wp
+            do s = 1, num_vels
+                vel(s) = 0._wp
+            end do
+            pres = 0._wp
+            gamma = 0._wp
+            pi_inf = 0._wp
+            qv = 0._wp
+            c = 0._wp
+            accel = 0._wp
+            nR = 0._wp; R = 0._wp
+            nRdot = 0._wp; Rdot = 0._wp
+            nbub = 0._wp
+            M00 = 0._wp
+            M10 = 0._wp
+            M01 = 0._wp
+            M20 = 0._wp
+            M11 = 0._wp
+            M02 = 0._wp
+            varR = 0._wp; varV = 0._wp
+            alf = 0._wp
+            do s = 1, (num_dims*(num_dims + 1))/2
+                tau_e(s) = 0._wp
+            end do
+            damage_state = 0._wp
+
+            ! Find probe location in terms of indices on a
+            ! specific processor
+            if (n == 0) then ! 1D simulation
+                if ((probe(i)%x >= x_cb(-1)) .and. (probe(i)%x <= x_cb(m))) then
+                    do s = -1, m
+                        distx(s) = x_cb(s) - probe(i)%x
+                        if (distx(s) < 0._wp) distx(s) = 1000._wp
+                    end do
+                    j = minloc(distx, 1)
+                    if (j == 1) j = 2 ! Pick first point if probe is at edge
+                    k = 0
+                    l = 0
+
+                    if (chemistry) then
+                        do d = 1, num_species
+                            rhoYks(d) = q_cons_vf(chemxb + d - 1)%sf(j - 2, k, l)
+                        end do
+                    end if
+
+                    ! Computing/Sharing necessary state variables
+                    if (elasticity) then
+                        call s_convert_to_mixture_variables(q_cons_vf, j - 2, k, l, &
+                                                            rho, gamma, pi_inf, qv, &
+                                                            Re, G_local, fluid_pp(:)%G)
+                    else
+                        call s_convert_to_mixture_variables(q_cons_vf, j - 2, k, l, &
+                                                            rho, gamma, pi_inf, qv)
+                    end if
+                    do s = 1, num_vels
+                        vel(s) = q_cons_vf(cont_idx%end + s)%sf(j - 2, k, l)/rho
+                    end do
+
+                    dyn_p = 0.5_wp*rho*dot_product(vel, vel)
+
+                    if (elasticity) then
+                        if (cont_damage) then
+                            damage_state = q_cons_vf(damage_idx)%sf(j - 2, k, l)
+                            G_local = G_local*max((1._wp - damage_state), 0._wp)
+                        end if
+
+                        call s_compute_pressure( &
+                            q_cons_vf(1)%sf(j - 2, k, l), &
+                            q_cons_vf(alf_idx)%sf(j - 2, k, l), &
+                            dyn_p, pi_inf, gamma, rho, qv, rhoYks(:), pres, T, &
+                            q_cons_vf(stress_idx%beg)%sf(j - 2, k, l), &
+                            q_cons_vf(mom_idx%beg)%sf(j - 2, k, l), G_local)
+                    else
+                        call s_compute_pressure( &
+                            q_cons_vf(E_idx)%sf(j - 2, k, l), &
+                            q_cons_vf(alf_idx)%sf(j - 2, k, l), &
+                            dyn_p, pi_inf, gamma, rho, qv, rhoYks, pres, T)
+                    end if
+
+                    if (model_eqns == 4) then
+                        lit_gamma = gammas(1)
+                    else if (elasticity) then
+                        tau_e(1) = q_cons_vf(stress_idx%end)%sf(j - 2, k, l)/rho
+                    end if
+
+                    if (bubbles_euler) then
+                        alf = q_cons_vf(alf_idx)%sf(j - 2, k, l)
+                        if (num_fluids == 3) then
+                            alfgr = q_cons_vf(alf_idx - 1)%sf(j - 2, k, l)
+                        end if
+                        do s = 1, nb
+                            nR(s) = q_cons_vf(bub_idx%rs(s))%sf(j - 2, k, l)
+                            nRdot(s) = q_cons_vf(bub_idx%vs(s))%sf(j - 2, k, l)
+                        end do
+
+                        if (adv_n) then
+                            nbub = q_cons_vf(n_idx)%sf(j - 2, k, l)
+                        else
+                            nR3 = 0._wp
+                            do s = 1, nb
+                                nR3 = nR3 + weight(s)*(nR(s)**3._wp)
+                            end do
+
+                            nbub = sqrt((4._wp*pi/3._wp)*nR3/alf)
+                        end if
+#ifdef DEBUG
+                        print *, 'In probe, nbub: ', nbub
+#endif
+                        if (qbmm) then
+                            M00 = q_cons_vf(bub_idx%moms(1, 1))%sf(j - 2, k, l)/nbub
+                            M10 = q_cons_vf(bub_idx%moms(1, 2))%sf(j - 2, k, l)/nbub
+                            M01 = q_cons_vf(bub_idx%moms(1, 3))%sf(j - 2, k, l)/nbub
+                            M20 = q_cons_vf(bub_idx%moms(1, 4))%sf(j - 2, k, l)/nbub
+                            M11 = q_cons_vf(bub_idx%moms(1, 5))%sf(j - 2, k, l)/nbub
+                            M02 = q_cons_vf(bub_idx%moms(1, 6))%sf(j - 2, k, l)/nbub
+
+                            M10 = M10/M00
+                            M01 = M01/M00
+                            M20 = M20/M00
+                            M11 = M11/M00
+                            M02 = M02/M00
+
+                            varR = M20 - M10**2._wp
+                            varV = M02 - M01**2._wp
+                        end if
+                        R(:) = nR(:)/nbub
+                        Rdot(:) = nRdot(:)/nbub
+
+                        ptilde = ptil(j - 2, k, l)
+                        ptot = pres - ptilde
+                    end if
+
+                    ! Compute mixture sound Speed
+                    call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, &
+                                                  ((gamma + 1._wp)*pres + pi_inf)/rho, alpha, 0._wp, 0._wp, c, qv)
+
+                    accel = accel_mag(j - 2, k, l)
+                end if
+            elseif (p == 0) then ! 2D simulation
+                if (chemistry) then
+                    do d = 1, num_species
+                        rhoYks(d) = q_cons_vf(chemxb + d - 1)%sf(j - 2, k - 2, l)
+                    end do
+                end if
+
+                if ((probe(i)%x >= x_cb(-1)) .and. (probe(i)%x <= x_cb(m))) then
+                    if ((probe(i)%y >= y_cb(-1)) .and. (probe(i)%y <= y_cb(n))) then
+                        do s = -1, m
+                            distx(s) = x_cb(s) - probe(i)%x
+                            if (distx(s) < 0._wp) distx(s) = 1000._wp
+                        end do
+                        do s = -1, n
+                            disty(s) = y_cb(s) - probe(i)%y
+                            if (disty(s) < 0._wp) disty(s) = 1000._wp
+                        end do
+                        j = minloc(distx, 1)
+                        k = minloc(disty, 1)
+                        if (j == 1) j = 2 ! Pick first point if probe is at edge
+                        if (k == 1) k = 2 ! Pick first point if probe is at edge
+                        l = 0
+
+                        ! Computing/Sharing necessary state variables
+                        call s_convert_to_mixture_variables(q_cons_vf, j - 2, k - 2, l, &
+                                                            rho, gamma, pi_inf, qv, &
+                                                            Re, G_local, fluid_pp(:)%G)
+                        do s = 1, num_vels
+                            vel(s) = q_cons_vf(cont_idx%end + s)%sf(j - 2, k - 2, l)/rho
+                        end do
+
+                        dyn_p = 0.5_wp*rho*dot_product(vel, vel)
+
+                        if (elasticity) then
+                            if (cont_damage) then
+                                damage_state = q_cons_vf(damage_idx)%sf(j - 2, k - 2, l)
+                                G_local = G_local*max((1._wp - damage_state), 0._wp)
+                            end if
+
+                            call s_compute_pressure( &
+                                q_cons_vf(1)%sf(j - 2, k - 2, l), &
+                                q_cons_vf(alf_idx)%sf(j - 2, k - 2, l), &
+                                dyn_p, pi_inf, gamma, rho, qv, &
+                                rhoYks, &
+                                pres, &
+                                T, &
+                                q_cons_vf(stress_idx%beg)%sf(j - 2, k - 2, l), &
+                                q_cons_vf(mom_idx%beg)%sf(j - 2, k - 2, l), G_local)
+                        else
+                            call s_compute_pressure(q_cons_vf(E_idx)%sf(j - 2, k - 2, l), &
+                                                    q_cons_vf(alf_idx)%sf(j - 2, k - 2, l), &
+                                                    dyn_p, pi_inf, gamma, rho, qv, &
+                                                    rhoYks, pres, T)
+                        end if
+
+                        if (model_eqns == 4) then
+                            lit_gamma = gs_min(1)
+                        else if (elasticity) then
+                            do s = 1, 3
+                                tau_e(s) = q_cons_vf(s)%sf(j - 2, k - 2, l)/rho
+                            end do
+                        end if
+
+                        if (bubbles_euler) then
+                            alf = q_cons_vf(alf_idx)%sf(j - 2, k - 2, l)
+                            do s = 1, nb
+                                nR(s) = q_cons_vf(bub_idx%rs(s))%sf(j - 2, k - 2, l)
+                                nRdot(s) = q_cons_vf(bub_idx%vs(s))%sf(j - 2, k - 2, l)
+                            end do
+
+                            if (adv_n) then
+                                nbub = q_cons_vf(n_idx)%sf(j - 2, k - 2, l)
+                            else
+                                nR3 = 0._wp
+                                do s = 1, nb
+                                    nR3 = nR3 + weight(s)*(nR(s)**3._wp)
+                                end do
+
+                                nbub = sqrt((4._wp*pi/3._wp)*nR3/alf)
+                            end if
+
+                            R(:) = nR(:)/nbub
+                            Rdot(:) = nRdot(:)/nbub
+                        end if
+                        ! Compute mixture sound speed
+                        call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, &
+                                                      ((gamma + 1._wp)*pres + pi_inf)/rho, alpha, 0._wp, 0._wp, c, qv)
+
+                    end if
+                end if
+            else ! 3D
+                if ((probe(i)%x >= x_cb(-1)) .and. (probe(i)%x <= x_cb(m))) then
+                    if ((probe(i)%y >= y_cb(-1)) .and. (probe(i)%y <= y_cb(n))) then
+                        if ((probe(i)%z >= z_cb(-1)) .and. (probe(i)%z <= z_cb(p))) then
+                            do s = -1, m
+                                distx(s) = x_cb(s) - probe(i)%x
+                                if (distx(s) < 0._wp) distx(s) = 1000._wp
+                            end do
+                            do s = -1, n
+                                disty(s) = y_cb(s) - probe(i)%y
+                                if (disty(s) < 0._wp) disty(s) = 1000._wp
+                            end do
+                            do s = -1, p
+                                distz(s) = z_cb(s) - probe(i)%z
+                                if (distz(s) < 0._wp) distz(s) = 1000._wp
+                            end do
+                            j = minloc(distx, 1)
+                            k = minloc(disty, 1)
+                            l = minloc(distz, 1)
+                            if (j == 1) j = 2 ! Pick first point if probe is at edge
+                            if (k == 1) k = 2 ! Pick first point if probe is at edge
+                            if (l == 1) l = 2 ! Pick first point if probe is at edge
+
+                            ! Computing/Sharing necessary state variables
+                            call s_convert_to_mixture_variables(q_cons_vf, j - 2, k - 2, l - 2, &
+                                                                rho, gamma, pi_inf, qv, &
+                                                                Re, G_local, fluid_pp(:)%G)
+                            do s = 1, num_vels
+                                vel(s) = q_cons_vf(cont_idx%end + s)%sf(j - 2, k - 2, l - 2)/rho
+                            end do
+
+                            dyn_p = 0.5_wp*rho*dot_product(vel, vel)
+
+                            if (chemistry) then
+                                do d = 1, num_species
+                                    rhoYks(d) = q_cons_vf(chemxb + d - 1)%sf(j - 2, k - 2, l - 2)
+                                end do
+                            end if
+
+                            if (elasticity) then
+                                if (cont_damage) then
+                                    damage_state = q_cons_vf(damage_idx)%sf(j - 2, k - 2, l - 2)
+                                    G_local = G_local*max((1._wp - damage_state), 0._wp)
+                                end if
+
+                                call s_compute_pressure( &
+                                    q_cons_vf(1)%sf(j - 2, k - 2, l - 2), &
+                                    q_cons_vf(alf_idx)%sf(j - 2, k - 2, l - 2), &
+                                    dyn_p, pi_inf, gamma, rho, qv, &
+                                    rhoYks, pres, T, &
+                                    q_cons_vf(stress_idx%beg)%sf(j - 2, k - 2, l - 2), &
+                                    q_cons_vf(mom_idx%beg)%sf(j - 2, k - 2, l - 2), G_local)
+                            else
+                                call s_compute_pressure(q_cons_vf(E_idx)%sf(j - 2, k - 2, l - 2), &
+                                                        q_cons_vf(alf_idx)%sf(j - 2, k - 2, l - 2), &
+                                                        dyn_p, pi_inf, gamma, rho, qv, &
+                                                        rhoYks, pres, T)
+                            end if
+
+                            ! Compute mixture sound speed
+                            call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, &
+                                                          ((gamma + 1._wp)*pres + pi_inf)/rho, alpha, 0._wp, 0._wp, c, qv)
+
+                            accel = accel_mag(j - 2, k - 2, l - 2)
+                        end if
+                    end if
+                end if
+            end if
+            if (num_procs > 1) then
+                #:for VAR in ['rho','pres','gamma','pi_inf','qv','c','accel']
+                    tmp = ${VAR}$
+                    call s_mpi_allreduce_sum(tmp, ${VAR}$)
+                #:endfor
+
+                do s = 1, num_vels
+                    tmp = vel(s)
+                    call s_mpi_allreduce_sum(tmp, vel(s))
+                end do
+
+                if (bubbles_euler) then
+                    #:for VAR in ['alf','alfgr','nbub','nR(1)','nRdot(1)','M00','R(1)','Rdot(1)','ptilde','ptot']
+                        tmp = ${VAR}$
+                        call s_mpi_allreduce_sum(tmp, ${VAR}$)
+                    #:endfor
+
+                    if (qbmm) then
+                        #:for VAR in ['varR','varV','M10','M01','M20','M02']
+                            tmp = ${VAR}$
+                            call s_mpi_allreduce_sum(tmp, ${VAR}$)
+                        #:endfor
+                    end if
+                end if
+
+                if (elasticity) then
+                    do s = 1, (num_dims*(num_dims + 1))/2
+                        tmp = tau_e(s)
+                        call s_mpi_allreduce_sum(tmp, tau_e(s))
+                    end do
+                end if
+
+                if (cont_damage) then
+                    tmp = damage_state
+                    call s_mpi_allreduce_sum(tmp, damage_state)
+                end if
+            end if
+            if (proc_rank == 0) then
+                if (n == 0) then
+                    if (bubbles_euler .and. (num_fluids <= 2)) then
+                        if (qbmm) then
+                            write (i + 30, '(6x,f12.6,14f28.16)') &
+                                nondim_time, &
+                                rho, &
+                                vel(1), &
+                                pres, &
+                                alf, &
+                                R(1), &
+                                Rdot(1), &
+                                nR(1), &
+                                nRdot(1), &
+                                varR, &
+                                varV, &
+                                M10, &
+                                M01, &
+                                M20, &
+                                M02
+                        else
+                            write (i + 30, '(6x,f12.6,8f24.8)') &
+                                nondim_time, &
+                                rho, &
+                                vel(1), &
+                                pres, &
+                                alf, &
+                                R(1), &
+                                Rdot(1), &
+                                nR(1), &
+                                nRdot(1)
+                            ! ptilde, &
+                            ! ptot
+                        end if
+                    else if (bubbles_euler .and. (num_fluids == 3)) then
+                        write (i + 30, '(6x,f12.6,f24.8,f24.8,f24.8,f24.8,f24.8,'// &
+                               'f24.8,f24.8,f24.8,f24.8,f24.8, f24.8)') &
+                            nondim_time, &
+                            rho, &
+                            vel(1), &
+                            pres, &
+                            alf, &
+                            alfgr, &
+                            nR(1), &
+                            nRdot(1), &
+                            R(1), &
+                            Rdot(1), &
+                            ptilde, &
+                            ptot
+                    else if (bubbles_euler .and. num_fluids == 4) then
+                        write (i + 30, '(6x,f12.6,f24.8,f24.8,f24.8,f24.8,'// &
+                               'f24.8,f24.8,f24.8,f24.8,f24.8,f24.8,f24.8,f24.8,f24.8)') &
+                            nondim_time, &
+                            q_cons_vf(1)%sf(j - 2, 0, 0), &
+                            q_cons_vf(2)%sf(j - 2, 0, 0), &
+                            q_cons_vf(3)%sf(j - 2, 0, 0), &
+                            q_cons_vf(4)%sf(j - 2, 0, 0), &
+                            q_cons_vf(5)%sf(j - 2, 0, 0), &
+                            q_cons_vf(6)%sf(j - 2, 0, 0), &
+                            q_cons_vf(7)%sf(j - 2, 0, 0), &
+                            q_cons_vf(8)%sf(j - 2, 0, 0), &
+                            q_cons_vf(9)%sf(j - 2, 0, 0), &
+                            q_cons_vf(10)%sf(j - 2, 0, 0), &
+                            nbub, &
+                            R(1), &
+                            Rdot(1)
+                    else
+                        write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8)') &
+                            nondim_time, &
+                            rho, &
+                            vel(1), &
+                            pres
+                    end if
+                elseif (p == 0) then
+                    if (bubbles_euler) then
+                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                            write (i + 30, '(6X,10F24.8)') &
+                                nondim_time, &
+                                rho, &
+                                vel(1), &
+                                vel(2), &
+                                pres, &
+                                alf, &
+                                nR(1), &
+                                nRdot(1), &
+                                R(1), &
+                                Rdot(1)
+                        #:endif
+                    else if (elasticity) then
+                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                            write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8,F24.8,'// &
+                                   'F24.8,F24.8,F24.8)') &
+                                nondim_time, &
+                                rho, &
+                                vel(1), &
+                                vel(2), &
+                                pres, &
+                                tau_e(1), &
+                                tau_e(2), &
+                                tau_e(3)
+                        #:endif
+                    else
+                        write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8)') &
+                            nondim_time, &
+                            rho, &
+                            vel(1), &
+                            pres
+                        print *, 'time =', nondim_time, 'rho =', rho, 'pres =', pres
+                    end if
+                else
+                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                        write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8,F24.8,'// &
+                               'F24.8,F24.8,F24.8,F24.8,F24.8,'// &
+                               'F24.8)') &
+                            nondim_time, &
+                            rho, &
+                            vel(1), &
+                            vel(2), &
+                            vel(3), &
+                            pres, &
+                            gamma, &
+                            pi_inf, &
+                            qv, &
+                            c, &
+                            accel
+                    #:endif
+                end if
+            end if
+        end do
+
+        if (integral_wrt .and. bubbles_euler) then
+            if (n == 0) then ! 1D simulation
+                do i = 1, num_integrals
+                    int_pres = 0._wp
+                    max_pres = 0._wp
+                    k = 0; l = 0
+                    npts = 0
+                    do j = 1, m
+                        pres = 0._wp
+                        do s = 1, num_vels
+                            vel(s) = 0._wp
+                        end do
+                        rho = 0._wp
+                        pres = 0._wp
+                        gamma = 0._wp
+                        pi_inf = 0._wp
+                        qv = 0._wp
+
+                        if ((integral(i)%xmin <= x_cb(j)) .and. (integral(i)%xmax >= x_cb(j))) then
+                            npts = npts + 1
+                            call s_convert_to_mixture_variables(q_cons_vf, j, k, l, &
+                                                                rho, gamma, pi_inf, qv, Re)
+                            do s = 1, num_vels
+                                vel(s) = q_cons_vf(cont_idx%end + s)%sf(j, k, l)/rho
+                            end do
+
+                            pres = ( &
+                                   (q_cons_vf(E_idx)%sf(j, k, l) - &
+                                    0.5_wp*(q_cons_vf(mom_idx%beg)%sf(j, k, l)**2._wp)/rho)/ &
+                                   (1._wp - q_cons_vf(alf_idx)%sf(j, k, l)) - &
+                                   pi_inf - qv &
+                                   )/gamma
+                            int_pres = int_pres + (pres - 1._wp)**2._wp
+                        end if
+                    end do
+                    int_pres = sqrt(int_pres/(1._wp*npts))
+
+                    if (num_procs > 1) then
+                        tmp = int_pres
+                        call s_mpi_allreduce_sum(tmp, int_pres)
+                    end if
+
+                    if (proc_rank == 0) then
+                        if (bubbles_euler .and. (num_fluids <= 2)) then
+                            write (i + 70, '(6x,f12.6,f24.8)') &
+                                nondim_time, int_pres
+                        end if
+                    end if
+                end do
+            elseif (p == 0) then
+                if (num_integrals /= 3) then
+                    call s_mpi_abort('Incorrect number of integrals')
+                end if
+
+                rad = integral(1)%xmax
+                thickness = integral(1)%xmin
+
+                do i = 1, num_integrals
+                    int_pres = 0._wp
+                    max_pres = 0._wp
+                    l = 0
+                    npts = 0
+                    do j = 1, m
+                        do k = 1, n
+                            trigger = .false.
+                            if (i == 1) then
+                                !inner portion
+                                if (sqrt(x_cb(j)**2._wp + y_cb(k)**2._wp) < (rad - 0.5_wp*thickness)) &
+                                    trigger = .true.
+                            elseif (i == 2) then
+                                !net region
+                                if (sqrt(x_cb(j)**2._wp + y_cb(k)**2._wp) > (rad - 0.5_wp*thickness) .and. &
+                                    sqrt(x_cb(j)**2._wp + y_cb(k)**2._wp) < (rad + 0.5_wp*thickness)) &
+                                    trigger = .true.
+                            elseif (i == 3) then
+                                !everything else
+                                if (sqrt(x_cb(j)**2._wp + y_cb(k)**2._wp) > (rad + 0.5_wp*thickness)) &
+                                    trigger = .true.
+                            end if
+
+                            pres = 0._wp
+                            do s = 1, num_vels
+                                vel(s) = 0._wp
+                            end do
+                            rho = 0._wp
+                            pres = 0._wp
+                            gamma = 0._wp
+                            pi_inf = 0._wp
+                            qv = 0._wp
+
+                            if (trigger) then
+                                npts = npts + 1
+                                call s_convert_to_mixture_variables(q_cons_vf, j, k, l, &
+                                                                    rho, gamma, pi_inf, qv, Re)
+                                do s = 1, num_vels
+                                    vel(s) = q_cons_vf(cont_idx%end + s)%sf(j, k, l)/rho
+                                end do
+
+                                pres = ( &
+                                       (q_cons_vf(E_idx)%sf(j, k, l) - &
+                                        0.5_wp*(q_cons_vf(mom_idx%beg)%sf(j, k, l)**2._wp)/rho)/ &
+                                       (1._wp - q_cons_vf(alf_idx)%sf(j, k, l)) - &
+                                       pi_inf - qv &
+                                       )/gamma
+                                int_pres = int_pres + abs(pres - 1._wp)
+                                max_pres = max(max_pres, abs(pres - 1._wp))
+                            end if
+
+                        end do
+                    end do
+
+                    if (npts > 0) then
+                        int_pres = int_pres/(1._wp*npts)
+                    else
+                        int_pres = 0._wp
+                    end if
+
+                    if (num_procs > 1) then
+                        tmp = int_pres
+                        call s_mpi_allreduce_sum(tmp, int_pres)
+
+                        tmp = max_pres
+                        call s_mpi_allreduce_max(tmp, max_pres)
+                    end if
+
+                    if (proc_rank == 0) then
+                        if (bubbles_euler .and. (num_fluids <= 2)) then
+                            write (i + 70, '(6x,f12.6,f24.8,f24.8)') &
+                                nondim_time, int_pres, max_pres
+                        end if
+                    end if
+                end do
+            end if
+        end if
+
+    end subroutine s_write_probe_files
+
+    !>  The goal of this subroutine is to write to the run-time
+        !!      information file basic footer information applicable to
+        !!      the current computation and to close the file when done.
+        !!      The footer contains the stability criteria extrema over
+        !!      all of the time-steps and the simulation run-time.
+    impure subroutine s_close_run_time_information_file
+
+        real(wp) :: run_time !< Run-time of the simulation
+
+        ! Writing the footer of and closing the run-time information file
+        write (3, '(A)') '    '
+        write (3, '(A)') ''
+
+        write (3, '(A,F9.6)') 'ICFL Max: ', icfl_max
+        if (viscous) write (3, '(A,F9.6)') 'VCFL Max: ', vcfl_max
+        if (viscous) write (3, '(A,F10.6)') 'Rc Min: ', Rc_min
+
+        call cpu_time(run_time)
+
+        write (3, '(A)') ''
+        write (3, '(A,I0,A)') 'Run-time: ', int(anint(run_time)), 's'
+        write (3, '(A)') '    '
+        close (3)
+
+    end subroutine s_close_run_time_information_file
+
+    !> Closes communication files
+    impure subroutine s_close_com_files()
+
+        integer :: i !< Generic loop iterator
+        do i = 1, num_fluids
+            close (i + 120)
+        end do
+
+    end subroutine s_close_com_files
+
+    !> Closes probe files
+    impure subroutine s_close_probe_files
+
+        integer :: i !< Generic loop iterator
+
+        do i = 1, num_probes
+            close (i + 30)
+        end do
+
+    end subroutine s_close_probe_files
+
+    !>  The computation of parameters, the allocation of memory,
+        !!      the association of pointers and/or the execution of any
+        !!      other procedures that are necessary to setup the module.
+    impure subroutine s_initialize_data_output_module
+
+        integer :: i, m_ds, n_ds, p_ds
+
+        ! Allocating/initializing ICFL, VCFL, CCFL and Rc stability criteria
+        if (run_time_info) then
+            @:ALLOCATE(icfl_sf(0:m, 0:n, 0:p))
+            icfl_max = 0._wp
+
+            if (viscous) then
+                @:ALLOCATE(vcfl_sf(0:m, 0:n, 0:p))
+                @:ALLOCATE(Rc_sf  (0:m, 0:n, 0:p))
+
+                vcfl_max = 0._wp
+                Rc_min = 1.e3_wp
+            end if
+        end if
+
+        if (probe_wrt) then
+            @:ALLOCATE(c_mass(num_fluids,5))
+        end if
+
+        if (down_sample) then
+            m_ds = int((m + 1)/3) - 1
+            n_ds = int((n + 1)/3) - 1
+            p_ds = int((p + 1)/3) - 1
+
+            allocate (q_cons_temp_ds(1:sys_size))
+            do i = 1, sys_size
+                allocate (q_cons_temp_ds(i)%sf(-1:m_ds + 1, -1:n_ds + 1, -1:p_ds + 1))
+            end do
+        end if
+
+    end subroutine s_initialize_data_output_module
+
+    !> Module deallocation and/or disassociation procedures
+    impure subroutine s_finalize_data_output_module
+
+        integer :: i
+
+        if (probe_wrt) then
+            @:DEALLOCATE(c_mass)
+        end if
+
+        if (run_time_info) then
+            ! Deallocating the ICFL, VCFL, CCFL, and Rc stability criteria
+            @:DEALLOCATE(icfl_sf)
+            if (viscous) then
+                @:DEALLOCATE(vcfl_sf, Rc_sf)
+            end if
+        end if
+
+        if (down_sample) then
+            do i = 1, sys_size
+                deallocate (q_cons_temp_ds(i)%sf)
+            end do
+            deallocate (q_cons_temp_ds)
+        end if
+
+    end subroutine s_finalize_data_output_module
+
+end module m_data_output
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index c1be537ecd..1e129754f2 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -184,6 +184,7 @@ module m_global_parameters
     logical :: bulk_stress   !< Bulk stresses
     logical :: cont_damage   !< Continuum damage modeling
     logical :: hyper_cleaning !< Hyperbolic cleaning for MHD for divB=0
+    logical :: any_non_newtonian !< True if any fluid is non-Newtonian
     integer :: num_igr_iters !< number of iterations for elliptic solve
     integer :: num_igr_warm_start_iters !< number of warm start iterations for elliptic solve
     real(wp) :: alf_factor  !< alpha factor for IGR
@@ -213,7 +214,7 @@ module m_global_parameters
     $:GPU_DECLARE(create='[mpp_lim,model_eqns,mixture_err,alt_soundspeed]')
     $:GPU_DECLARE(create='[avg_state,mp_weno,weno_eps,teno_CT,hypoelasticity]')
     $:GPU_DECLARE(create='[hyperelasticity,hyper_model,elasticity,low_Mach]')
-    $:GPU_DECLARE(create='[shear_stress,bulk_stress,cont_damage,hyper_cleaning]')
+    $:GPU_DECLARE(create='[shear_stress,bulk_stress,cont_damage,hyper_cleaning,any_non_newtonian]')
 
     logical :: relax          !< activate phase change
     integer :: relax_model    !< Relaxation model
@@ -636,6 +637,7 @@ contains
         bulk_stress = .false.
         cont_damage = .false.
         hyper_cleaning = .false.
+        any_non_newtonian = .false.
         num_igr_iters = dflt_num_igr_iters
         num_igr_warm_start_iters = dflt_num_igr_warm_start_iters
         alf_factor = dflt_alf_factor
@@ -684,6 +686,14 @@ contains
             fluid_pp(i)%qvp = 0._wp
             fluid_pp(i)%Re(:) = dflt_real
             fluid_pp(i)%G = 0._wp
+            fluid_pp(i)%non_newtonian = .false.
+            fluid_pp(i)%tau0 = 0._wp
+            fluid_pp(i)%K = 0._wp
+            fluid_pp(i)%nn = 1._wp
+            fluid_pp(i)%mu_max = dflt_real
+            fluid_pp(i)%mu_min = 0._wp
+            fluid_pp(i)%mu_bulk = 0._wp
+            fluid_pp(i)%hb_m = 1000._wp
         end do
 
         ! Subgrid bubble parameters
@@ -1120,7 +1130,15 @@ contains
 
             Re_size_max = maxval(Re_size)
 
-            $:GPU_UPDATE(device='[Re_size,Re_size_max,shear_stress,bulk_stress]')
+            ! Detect non-Newtonian fluids
+            any_non_newtonian = .false.
+            do i = 1, num_fluids
+                if (fluid_pp(i)%non_newtonian) then
+                    any_non_newtonian = .true.
+                end if
+            end do
+
+            $:GPU_UPDATE(device='[Re_size,Re_size_max,shear_stress,bulk_stress,any_non_newtonian]')
 
             ! Bookkeeping the indexes of any viscous fluids and any pairs of
             ! fluids whose interface will support effects of surface tension
diff --git a/src/simulation/m_hb_function.fpp b/src/simulation/m_hb_function.fpp
new file mode 100644
index 0000000000..6f56bc5717
--- /dev/null
+++ b/src/simulation/m_hb_function.fpp
@@ -0,0 +1,77 @@
+!>
+!! @file m_hb_function.f90
+!! @brief Contains module m_hb_function
+
+#:include 'macros.fpp'
+
+!> @brief The module contains functions to compute Herschel-Bulkley
+!!        non-Newtonian viscosity with Papanastasiou regularization.
+!!        mu = (tau0/gdot)*(1 - exp(-m*gdot)) + K*gdot^(nn-1)
+module m_hb_function
+
+    use m_derived_types        !< Definitions of the derived types
+
+    use m_global_parameters    !< Definitions of the global parameters
+
+    implicit none
+
+    private; public :: f_compute_hb_viscosity, &
+ f_compute_shear_rate_from_components
+
+contains
+
+    !> Computes Herschel-Bulkley viscosity with Papanastasiou regularization
+    !! @param tau0 Yield stress
+    !! @param K_val Consistency index
+    !! @param nn_val Flow behavior index
+    !! @param mu_min_val Minimum viscosity limit
+    !! @param mu_max_val Maximum viscosity limit
+    !! @param shear_rate Shear rate magnitude
+    !! @param hb_m_val Papanastasiou regularization parameter
+    !! @return Viscosity
+    pure function f_compute_hb_viscosity(tau0, K_val, nn_val, &
+                                         mu_min_val, mu_max_val, shear_rate, hb_m_val) result(mu)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        real(wp), intent(in) :: tau0, K_val, nn_val
+        real(wp), intent(in) :: mu_min_val, mu_max_val
+        real(wp), intent(in) :: shear_rate, hb_m_val
+        real(wp) :: mu
+        real(wp) :: yield_term, power_law_term, exp_term
+
+        exp_term = exp(-hb_m_val*shear_rate)
+        yield_term = tau0*(1._wp - exp_term)/shear_rate
+        power_law_term = K_val*(shear_rate**(nn_val - 1._wp))
+
+        mu = yield_term + power_law_term
+        mu = min(max(mu, mu_min_val), mu_max_val)
+
+    end function f_compute_hb_viscosity
+
+    !> Computes shear rate from strain rate tensor components.
+    !!      gdot = sqrt(2*D_ij*D_ij) where D_ij is the strain rate tensor.
+    !!      Set D_zz, D_xz, D_yz to 0 for 2D/1D cases.
+    !! @param D_xx Normal strain rate du/dx
+    !! @param D_yy Normal strain rate dv/dy
+    !! @param D_zz Normal strain rate dw/dz
+    !! @param D_xy Shear strain rate 0.5*(du/dy + dv/dx)
+    !! @param D_xz Shear strain rate 0.5*(du/dz + dw/dx)
+    !! @param D_yz Shear strain rate 0.5*(dv/dz + dw/dy)
+    !! @return Shear rate magnitude
+    pure function f_compute_shear_rate_from_components( &
+                                                        D_xx, D_yy, D_zz, D_xy, D_xz, D_yz) result(shear_rate)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        real(wp), intent(in) :: D_xx, D_yy, D_zz, D_xy, D_xz, D_yz
+        real(wp) :: shear_rate
+
+        ! 2*D_ij*D_ij = 2*(D_xx^2+D_yy^2+D_zz^2+2*(D_xy^2+D_xz^2+D_yz^2))
+        shear_rate = sqrt(2._wp*(D_xx*D_xx + D_yy*D_yy + D_zz*D_zz + &
+                                 2._wp*(D_xy*D_xy + D_xz*D_xz + D_yz*D_yz)))
+
+        ! Clamp for numerical safety
+        shear_rate = min(max(shear_rate, 1.0e-2_wp), 1.0e5_wp)
+
+    end function f_compute_shear_rate_from_components
+
+end module m_hb_function
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index b87f5a1b19..c5b44d7c1f 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -27,6 +27,8 @@ module m_ibm
 
     use m_viscous
 
+    use m_hb_function
+
     use m_model
 
     implicit none
@@ -1013,7 +1015,14 @@ contains
 
         if (viscous) then
             do fluid_idx = 1, num_fluids
-                if (fluid_pp(fluid_idx)%Re(1) /= 0._wp) then
+                if (fluid_pp(fluid_idx)%non_newtonian) then
+                    ! Non-Newtonian: compute reference viscosity at gdot = 1
+                    dynamic_viscosities(fluid_idx) = f_compute_hb_viscosity( &
+                        fluid_pp(fluid_idx)%tau0, fluid_pp(fluid_idx)%K, &
+                        fluid_pp(fluid_idx)%nn, fluid_pp(fluid_idx)%mu_min, &
+                        fluid_pp(fluid_idx)%mu_max, 1._wp, &
+                        fluid_pp(fluid_idx)%hb_m)
+                else if (fluid_pp(fluid_idx)%Re(1) /= 0._wp) then
                     dynamic_viscosities(fluid_idx) = 1._wp/fluid_pp(fluid_idx)%Re(1)
                 else
                     dynamic_viscosities(fluid_idx) = 0._wp
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index a684c79ec4..57ed18a3d3 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -190,6 +190,10 @@ contains
                 call MPI_BCAST(fluid_pp(i)%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
             #:endfor
             call MPI_BCAST(fluid_pp(i)%Re(1), 2, mpi_p, 0, MPI_COMM_WORLD, ierr)
+            call MPI_BCAST(fluid_pp(i)%non_newtonian, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
+            #:for VAR in ['tau0', 'K', 'nn', 'mu_max', 'mu_min', 'mu_bulk', 'hb_m']
+                call MPI_BCAST(fluid_pp(i)%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
+            #:endfor
         end do
 
         if (bubbles_euler .or. bubbles_lagrange) then
diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp
index 69689de06c..92a39724d0 100644
--- a/src/simulation/m_pressure_relaxation.fpp
+++ b/src/simulation/m_pressure_relaxation.fpp
@@ -1,308 +1,279 @@
-!>
-!! @file
-!! @brief Contains module m_pressure_relaxation
-
-#:include 'case.fpp'
-#:include 'macros.fpp'
-
-!> @brief Pressure relaxation for the six-equation multi-component model via Newton--Raphson equilibration and volume-fraction correction
-module m_pressure_relaxation
-
-    use m_derived_types        !< Definitions of the derived types
-    use m_global_parameters    !< Definitions of the global parameters
-
-    implicit none
-
-    private; public :: s_pressure_relaxation_procedure, &
- s_initialize_pressure_relaxation_module, &
- s_finalize_pressure_relaxation_module
-
-    real(wp), allocatable, dimension(:, :) :: Res_pr
-    $:GPU_DECLARE(create='[Res_pr]')
-
-contains
-
-    !> Initialize the pressure relaxation module
-    impure subroutine s_initialize_pressure_relaxation_module
-
-        integer :: i, j
-
-        if (viscous) then
-            @:ALLOCATE(Res_pr(1:2, 1:Re_size_max))
-            do i = 1, 2
-                do j = 1, Re_size(i)
-                    Res_pr(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
-                end do
-            end do
-            $:GPU_UPDATE(device='[Res_pr, Re_idx, Re_size]')
-        end if
-
-    end subroutine s_initialize_pressure_relaxation_module
-
-    !> Finalize the pressure relaxation module
-    impure subroutine s_finalize_pressure_relaxation_module
-
-        if (viscous) then
-            @:DEALLOCATE(Res_pr)
-        end if
-
-    end subroutine s_finalize_pressure_relaxation_module
-
-    !> The main pressure relaxation procedure
-    !! @param q_cons_vf Cell-average conservative variables
-    subroutine s_pressure_relaxation_procedure(q_cons_vf)
-
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer :: j, k, l
-
-        $:GPU_PARALLEL_LOOP(private='[j,k,l]', collapse=3)
-        do l = 0, p
-            do k = 0, n
-                do j = 0, m
-                    call s_relax_cell_pressure(q_cons_vf, j, k, l)
-                end do
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-    end subroutine s_pressure_relaxation_procedure
-
-    !> Process pressure relaxation for a single cell
-    subroutine s_relax_cell_pressure(q_cons_vf, j, k, l)
-        $:GPU_ROUTINE(parallelism='[seq]')
-
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in) :: j, k, l
-
-        ! Volume fraction correction
-        if (mpp_lim) call s_correct_volume_fractions(q_cons_vf, j, k, l)
-
-        ! Pressure equilibration
-        if (s_needs_pressure_relaxation(q_cons_vf, j, k, l)) then
-            call s_equilibrate_pressure(q_cons_vf, j, k, l)
-        end if
-
-        ! Internal energy correction
-        call s_correct_internal_energies(q_cons_vf, j, k, l)
-
-    end subroutine s_relax_cell_pressure
-
-    !> Check if pressure relaxation is needed for this cell
-    logical function s_needs_pressure_relaxation(q_cons_vf, j, k, l)
-        $:GPU_ROUTINE(parallelism='[seq]')
-
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-        integer, intent(in) :: j, k, l
-        integer :: i
-
-        s_needs_pressure_relaxation = .true.
-        $:GPU_LOOP(parallelism='[seq]')
-        do i = 1, num_fluids
-            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > (1._wp - sgm_eps)) then
-                s_needs_pressure_relaxation = .false.
-            end if
-        end do
-
-    end function s_needs_pressure_relaxation
-
-    !> Correct volume fractions to physical bounds
-    subroutine s_correct_volume_fractions(q_cons_vf, j, k, l)
-        $:GPU_ROUTINE(parallelism='[seq]')
-
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in) :: j, k, l
-        real(wp) :: sum_alpha
-        integer :: i
-
-        sum_alpha = 0._wp
-        $:GPU_LOOP(parallelism='[seq]')
-        do i = 1, num_fluids
-            if ((q_cons_vf(i + contxb - 1)%sf(j, k, l) < 0._wp) .or. &
-                (q_cons_vf(i + advxb - 1)%sf(j, k, l) < 0._wp)) then
-                q_cons_vf(i + contxb - 1)%sf(j, k, l) = 0._wp
-                q_cons_vf(i + advxb - 1)%sf(j, k, l) = 0._wp
-                q_cons_vf(i + intxb - 1)%sf(j, k, l) = 0._wp
-            end if
-            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > 1._wp) &
-                q_cons_vf(i + advxb - 1)%sf(j, k, l) = 1._wp
-            sum_alpha = sum_alpha + q_cons_vf(i + advxb - 1)%sf(j, k, l)
-        end do
-
-        $:GPU_LOOP(parallelism='[seq]')
-        do i = 1, num_fluids
-            q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + advxb - 1)%sf(j, k, l)/sum_alpha
-        end do
-
-    end subroutine s_correct_volume_fractions
-
-    !> Main pressure equilibration using Newton-Raphson
-    subroutine s_equilibrate_pressure(q_cons_vf, j, k, l)
-        $:GPU_ROUTINE(parallelism='[seq]')
-
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in) :: j, k, l
-
-        real(wp) :: pres_relax, f_pres, df_pres
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3) :: pres_K_init, rho_K_s
-        #:else
-            real(wp), dimension(num_fluids) :: pres_K_init, rho_K_s
-        #:endif
-        integer, parameter :: MAX_ITER = 50
-        real(wp), parameter :: TOLERANCE = 1.e-10_wp
-        integer :: iter, i
-
-        ! Initialize pressures
-        pres_relax = 0._wp
-        $:GPU_LOOP(parallelism='[seq]')
-        do i = 1, num_fluids
-            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
-                pres_K_init(i) = (q_cons_vf(i + intxb - 1)%sf(j, k, l)/ &
-                                  q_cons_vf(i + advxb - 1)%sf(j, k, l) - pi_infs(i))/gammas(i)
-                if (pres_K_init(i) <= -(1._wp - 1.e-8_wp)*ps_inf(i) + 1.e-8_wp) &
-                    pres_K_init(i) = -(1._wp - 1.e-8_wp)*ps_inf(i) + 1.e-8_wp
-            else
-                pres_K_init(i) = 0._wp
-            end if
-            pres_relax = pres_relax + q_cons_vf(i + advxb - 1)%sf(j, k, l)*pres_K_init(i)
-        end do
-
-        ! Newton-Raphson iteration
-        f_pres = 1.e-9_wp
-        df_pres = 1.e9_wp
-        $:GPU_LOOP(parallelism='[seq]')
-        do iter = 0, MAX_ITER - 1
-            if (abs(f_pres) > TOLERANCE) then
-                pres_relax = pres_relax - f_pres/df_pres
-
-                ! Enforce pressure bounds
-                do i = 1, num_fluids
-                    if (pres_relax <= -(1._wp - 1.e-8_wp)*ps_inf(i) + 1.e-8_wp) &
-                        pres_relax = -(1._wp - 1.e-8_wp)*ps_inf(i) + 1.e-8_wp
-                end do
-
-                ! Newton-Raphson step
-                f_pres = -1._wp
-                df_pres = 0._wp
-                $:GPU_LOOP(parallelism='[seq]')
-                do i = 1, num_fluids
-                    if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
-                        rho_K_s(i) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/ &
-                                     max(q_cons_vf(i + advxb - 1)%sf(j, k, l), sgm_eps) &
-                                     *((pres_relax + ps_inf(i))/(pres_K_init(i) + &
-                                                                 ps_inf(i)))**(1._wp/gs_min(i))
-                        f_pres = f_pres + q_cons_vf(i + contxb - 1)%sf(j, k, l)/rho_K_s(i)
-                        df_pres = df_pres - q_cons_vf(i + contxb - 1)%sf(j, k, l) &
-                                  /(gs_min(i)*rho_K_s(i)*(pres_relax + ps_inf(i)))
-                    end if
-                end do
-            end if
-        end do
-
-        ! Update volume fractions
-        $:GPU_LOOP(parallelism='[seq]')
-        do i = 1, num_fluids
-            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) &
-                q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/rho_K_s(i)
-        end do
-
-    end subroutine s_equilibrate_pressure
-
-    !> Correct internal energies using equilibrated pressure
-    subroutine s_correct_internal_energies(q_cons_vf, j, k, l)
-        $:GPU_ROUTINE(parallelism='[seq]')
-
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer, intent(in) :: j, k, l
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(2) :: alpha_rho, alpha
-        #:else
-            real(wp), dimension(num_fluids) :: alpha_rho, alpha
-        #:endif
-        real(wp) :: rho, dyn_pres, gamma, pi_inf, pres_relax, sum_alpha
-        real(wp), dimension(2) :: Re
-        integer :: i, q
-
-        $:GPU_LOOP(parallelism='[seq]')
-        do i = 1, num_fluids
-            alpha_rho(i) = q_cons_vf(i)%sf(j, k, l)
-            alpha(i) = q_cons_vf(E_idx + i)%sf(j, k, l)
-        end do
-
-        ! Compute mixture properties (combined bubble and standard logic)
-        rho = 0._wp
-        gamma = 0._wp
-        pi_inf = 0._wp
-
-        if (bubbles_euler) then
-            if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                $:GPU_LOOP(parallelism='[seq]')
-                do i = 1, num_fluids
-                    rho = rho + alpha_rho(i)
-                    gamma = gamma + alpha(i)*gammas(i)
-                    pi_inf = pi_inf + alpha(i)*pi_infs(i)
-                end do
-            else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                $:GPU_LOOP(parallelism='[seq]')
-                do i = 1, num_fluids - 1
-                    rho = rho + alpha_rho(i)
-                    gamma = gamma + alpha(i)*gammas(i)
-                    pi_inf = pi_inf + alpha(i)*pi_infs(i)
-                end do
-            else
-                rho = alpha_rho(1)
-                gamma = gammas(1)
-                pi_inf = pi_infs(1)
-            end if
-        else
-            sum_alpha = 0._wp
-            if (mpp_lim) then
-                $:GPU_LOOP(parallelism='[seq]')
-                do i = 1, num_fluids
-                    alpha_rho(i) = max(0._wp, alpha_rho(i))
-                    alpha(i) = min(max(0._wp, alpha(i)), 1._wp)
-                    sum_alpha = sum_alpha + alpha(i)
-                end do
-                alpha = alpha/max(sum_alpha, sgm_eps)
-            end if
-
-            $:GPU_LOOP(parallelism='[seq]')
-            do i = 1, num_fluids
-                rho = rho + alpha_rho(i)
-                gamma = gamma + alpha(i)*gammas(i)
-                pi_inf = pi_inf + alpha(i)*pi_infs(i)
-            end do
-
-            if (viscous) then
-                $:GPU_LOOP(parallelism='[seq]')
-                do i = 1, 2
-                    Re(i) = dflt_real
-                    if (Re_size(i) > 0) Re(i) = 0._wp
-                    $:GPU_LOOP(parallelism='[seq]')
-                    do q = 1, Re_size(i)
-                        Re(i) = alpha(Re_idx(i, q))/Res_pr(i, q) + Re(i)
-                    end do
-                    Re(i) = 1._wp/max(Re(i), sgm_eps)
-                end do
-            end if
-        end if
-
-        ! Compute dynamic pressure and update internal energies
-        dyn_pres = 0._wp
-        $:GPU_LOOP(parallelism='[seq]')
-        do i = momxb, momxe
-            dyn_pres = dyn_pres + 5.e-1_wp*q_cons_vf(i)%sf(j, k, l)* &
-                       q_cons_vf(i)%sf(j, k, l)/max(rho, sgm_eps)
-        end do
-
-        pres_relax = (q_cons_vf(E_idx)%sf(j, k, l) - dyn_pres - pi_inf)/gamma
-
-        $:GPU_LOOP(parallelism='[seq]')
-        do i = 1, num_fluids
-            q_cons_vf(i + intxb - 1)%sf(j, k, l) = &
-                q_cons_vf(i + advxb - 1)%sf(j, k, l)*(gammas(i)*pres_relax + pi_infs(i))
-        end do
-
-    end subroutine s_correct_internal_energies
-
-end module m_pressure_relaxation
+!>
+!! @file
+!! @brief Contains module m_pressure_relaxation
+
+#:include 'case.fpp'
+#:include 'macros.fpp'
+
+!> @brief Pressure relaxation for the six-equation multi-component model via Newton--Raphson equilibration and volume-fraction correction
+module m_pressure_relaxation
+
+    use m_derived_types        !< Definitions of the derived types
+    use m_global_parameters    !< Definitions of the global parameters
+
+    implicit none
+
+    private; public :: s_pressure_relaxation_procedure, &
+ s_initialize_pressure_relaxation_module, &
+ s_finalize_pressure_relaxation_module
+
+contains
+
+    !> Initialize the pressure relaxation module
+    impure subroutine s_initialize_pressure_relaxation_module
+
+        ! Nothing to initialize - Re is computed dynamically via m_re_visc
+
+    end subroutine s_initialize_pressure_relaxation_module
+
+    !> Finalize the pressure relaxation module
+    impure subroutine s_finalize_pressure_relaxation_module
+
+        ! Nothing to finalize
+
+    end subroutine s_finalize_pressure_relaxation_module
+
+    !> The main pressure relaxation procedure
+    !! @param q_cons_vf Cell-average conservative variables
+    subroutine s_pressure_relaxation_procedure(q_cons_vf)
+
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        integer :: j, k, l
+
+        $:GPU_PARALLEL_LOOP(private='[j,k,l]', collapse=3)
+        do l = 0, p
+            do k = 0, n
+                do j = 0, m
+                    call s_relax_cell_pressure(q_cons_vf, j, k, l)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+    end subroutine s_pressure_relaxation_procedure
+
+    !> Process pressure relaxation for a single cell
+    subroutine s_relax_cell_pressure(q_cons_vf, j, k, l)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        integer, intent(in) :: j, k, l
+
+        ! Volume fraction correction
+        if (mpp_lim) call s_correct_volume_fractions(q_cons_vf, j, k, l)
+
+        ! Pressure equilibration
+        if (s_needs_pressure_relaxation(q_cons_vf, j, k, l)) then
+            call s_equilibrate_pressure(q_cons_vf, j, k, l)
+        end if
+
+        ! Internal energy correction
+        call s_correct_internal_energies(q_cons_vf, j, k, l)
+
+    end subroutine s_relax_cell_pressure
+
+    !> Check if pressure relaxation is needed for this cell
+    logical function s_needs_pressure_relaxation(q_cons_vf, j, k, l)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
+        integer, intent(in) :: j, k, l
+        integer :: i
+
+        s_needs_pressure_relaxation = .true.
+        $:GPU_LOOP(parallelism='[seq]')
+        do i = 1, num_fluids
+            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > (1._wp - sgm_eps)) then
+                s_needs_pressure_relaxation = .false.
+            end if
+        end do
+
+    end function s_needs_pressure_relaxation
+
+    !> Correct volume fractions to physical bounds
+    subroutine s_correct_volume_fractions(q_cons_vf, j, k, l)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        integer, intent(in) :: j, k, l
+        real(wp) :: sum_alpha
+        integer :: i
+
+        sum_alpha = 0._wp
+        $:GPU_LOOP(parallelism='[seq]')
+        do i = 1, num_fluids
+            if ((q_cons_vf(i + contxb - 1)%sf(j, k, l) < 0._wp) .or. &
+                (q_cons_vf(i + advxb - 1)%sf(j, k, l) < 0._wp)) then
+                q_cons_vf(i + contxb - 1)%sf(j, k, l) = 0._wp
+                q_cons_vf(i + advxb - 1)%sf(j, k, l) = 0._wp
+                q_cons_vf(i + intxb - 1)%sf(j, k, l) = 0._wp
+            end if
+            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > 1._wp) &
+                q_cons_vf(i + advxb - 1)%sf(j, k, l) = 1._wp
+            sum_alpha = sum_alpha + q_cons_vf(i + advxb - 1)%sf(j, k, l)
+        end do
+
+        $:GPU_LOOP(parallelism='[seq]')
+        do i = 1, num_fluids
+            q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + advxb - 1)%sf(j, k, l)/sum_alpha
+        end do
+
+    end subroutine s_correct_volume_fractions
+
+    !> Main pressure equilibration using Newton-Raphson
+    subroutine s_equilibrate_pressure(q_cons_vf, j, k, l)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        integer, intent(in) :: j, k, l
+
+        real(wp) :: pres_relax, f_pres, df_pres
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: pres_K_init, rho_K_s
+        #:else
+            real(wp), dimension(num_fluids) :: pres_K_init, rho_K_s
+        #:endif
+        integer, parameter :: MAX_ITER = 50
+        real(wp), parameter :: TOLERANCE = 1.e-10_wp
+        integer :: iter, i
+
+        ! Initialize pressures
+        pres_relax = 0._wp
+        $:GPU_LOOP(parallelism='[seq]')
+        do i = 1, num_fluids
+            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
+                pres_K_init(i) = (q_cons_vf(i + intxb - 1)%sf(j, k, l)/ &
+                                  q_cons_vf(i + advxb - 1)%sf(j, k, l) - pi_infs(i))/gammas(i)
+                if (pres_K_init(i) <= -(1._wp - 1.e-8_wp)*ps_inf(i) + 1.e-8_wp) &
+                    pres_K_init(i) = -(1._wp - 1.e-8_wp)*ps_inf(i) + 1.e-8_wp
+            else
+                pres_K_init(i) = 0._wp
+            end if
+            pres_relax = pres_relax + q_cons_vf(i + advxb - 1)%sf(j, k, l)*pres_K_init(i)
+        end do
+
+        ! Newton-Raphson iteration
+        f_pres = 1.e-9_wp
+        df_pres = 1.e9_wp
+        $:GPU_LOOP(parallelism='[seq]')
+        do iter = 0, MAX_ITER - 1
+            if (abs(f_pres) > TOLERANCE) then
+                pres_relax = pres_relax - f_pres/df_pres
+
+                ! Enforce pressure bounds
+                do i = 1, num_fluids
+                    if (pres_relax <= -(1._wp - 1.e-8_wp)*ps_inf(i) + 1.e-8_wp) &
+                        pres_relax = -(1._wp - 1.e-8_wp)*ps_inf(i) + 1.e-8_wp
+                end do
+
+                ! Newton-Raphson step
+                f_pres = -1._wp
+                df_pres = 0._wp
+                $:GPU_LOOP(parallelism='[seq]')
+                do i = 1, num_fluids
+                    if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then
+                        rho_K_s(i) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/ &
+                                     max(q_cons_vf(i + advxb - 1)%sf(j, k, l), sgm_eps) &
+                                     *((pres_relax + ps_inf(i))/(pres_K_init(i) + &
+                                                                 ps_inf(i)))**(1._wp/gs_min(i))
+                        f_pres = f_pres + q_cons_vf(i + contxb - 1)%sf(j, k, l)/rho_K_s(i)
+                        df_pres = df_pres - q_cons_vf(i + contxb - 1)%sf(j, k, l) &
+                                  /(gs_min(i)*rho_K_s(i)*(pres_relax + ps_inf(i)))
+                    end if
+                end do
+            end if
+        end do
+
+        ! Update volume fractions
+        $:GPU_LOOP(parallelism='[seq]')
+        do i = 1, num_fluids
+            if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) &
+                q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/rho_K_s(i)
+        end do
+
+    end subroutine s_equilibrate_pressure
+
+    !> Correct internal energies using equilibrated pressure
+    subroutine s_correct_internal_energies(q_cons_vf, j, k, l)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        integer, intent(in) :: j, k, l
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(2) :: alpha_rho, alpha
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_rho, alpha
+        #:endif
+        real(wp) :: rho, dyn_pres, gamma, pi_inf, pres_relax, sum_alpha
+        integer :: i
+
+        $:GPU_LOOP(parallelism='[seq]')
+        do i = 1, num_fluids
+            alpha_rho(i) = q_cons_vf(i)%sf(j, k, l)
+            alpha(i) = q_cons_vf(E_idx + i)%sf(j, k, l)
+        end do
+
+        ! Compute mixture properties (combined bubble and standard logic)
+        rho = 0._wp
+        gamma = 0._wp
+        pi_inf = 0._wp
+
+        if (bubbles_euler) then
+            if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
+                $:GPU_LOOP(parallelism='[seq]')
+                do i = 1, num_fluids
+                    rho = rho + alpha_rho(i)
+                    gamma = gamma + alpha(i)*gammas(i)
+                    pi_inf = pi_inf + alpha(i)*pi_infs(i)
+                end do
+            else if ((model_eqns == 2) .and. (num_fluids > 2)) then
+                $:GPU_LOOP(parallelism='[seq]')
+                do i = 1, num_fluids - 1
+                    rho = rho + alpha_rho(i)
+                    gamma = gamma + alpha(i)*gammas(i)
+                    pi_inf = pi_inf + alpha(i)*pi_infs(i)
+                end do
+            else
+                rho = alpha_rho(1)
+                gamma = gammas(1)
+                pi_inf = pi_infs(1)
+            end if
+        else
+            sum_alpha = 0._wp
+            if (mpp_lim) then
+                $:GPU_LOOP(parallelism='[seq]')
+                do i = 1, num_fluids
+                    alpha_rho(i) = max(0._wp, alpha_rho(i))
+                    alpha(i) = min(max(0._wp, alpha(i)), 1._wp)
+                    sum_alpha = sum_alpha + alpha(i)
+                end do
+                alpha = alpha/max(sum_alpha, sgm_eps)
+            end if
+
+            $:GPU_LOOP(parallelism='[seq]')
+            do i = 1, num_fluids
+                rho = rho + alpha_rho(i)
+                gamma = gamma + alpha(i)*gammas(i)
+                pi_inf = pi_inf + alpha(i)*pi_infs(i)
+            end do
+        end if
+
+        ! Compute dynamic pressure and update internal energies
+        dyn_pres = 0._wp
+        $:GPU_LOOP(parallelism='[seq]')
+        do i = momxb, momxe
+            dyn_pres = dyn_pres + 5.e-1_wp*q_cons_vf(i)%sf(j, k, l)* &
+                       q_cons_vf(i)%sf(j, k, l)/max(rho, sgm_eps)
+        end do
+
+        pres_relax = (q_cons_vf(E_idx)%sf(j, k, l) - dyn_pres - pi_inf)/gamma
+
+        $:GPU_LOOP(parallelism='[seq]')
+        do i = 1, num_fluids
+            q_cons_vf(i + intxb - 1)%sf(j, k, l) = &
+                q_cons_vf(i + advxb - 1)%sf(j, k, l)*(gammas(i)*pres_relax + pi_infs(i))
+        end do
+
+    end subroutine s_correct_internal_energies
+
+end module m_pressure_relaxation
diff --git a/src/simulation/m_re_visc.fpp b/src/simulation/m_re_visc.fpp
new file mode 100644
index 0000000000..e741b563ec
--- /dev/null
+++ b/src/simulation/m_re_visc.fpp
@@ -0,0 +1,346 @@
+!>
+!! @file m_re_visc.f90
+!! @brief Contains module m_re_visc
+
+#:include 'macros.fpp'
+
+!> @brief The module contains routines that compute viscosity-related
+!!        quantities for both Newtonian and non-Newtonian fluids.
+!!        s_compute_re_visc returns Re_visc = 1/mu per phase.
+!!        s_compute_mixture_re computes mixture Re from per-phase values.
+module m_re_visc
+
+    use m_derived_types        !< Definitions of the derived types
+
+    use m_global_parameters    !< Definitions of the global parameters
+
+    use m_hb_function          !< Herschel-Bulkley viscosity model
+
+    implicit none
+
+    private; public :: s_compute_re_visc, &
+ s_compute_mixture_re
+
+contains
+
+    !> Computes velocity gradients at a single cell using finite differences.
+    !!      Uses 2nd order central differences in interior, 1st order at boundaries.
+    !! @param q_prim_vf Primitive variables
+    !! @param j x index
+    !! @param k y index
+    !! @param l z index
+    !! @param D_xx Output: du/dx
+    !! @param D_yy Output: dv/dy
+    !! @param D_zz Output: dw/dz
+    !! @param D_xy Output: 0.5*(du/dy + dv/dx)
+    !! @param D_xz Output: 0.5*(du/dz + dw/dx)
+    !! @param D_yz Output: 0.5*(dv/dz + dw/dy)
+    pure subroutine s_compute_velocity_gradients_at_cell( &
+                                                          q_prim_vf, j, k, l, D_xx, D_yy, D_zz, D_xy, D_xz, D_yz)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
+        integer, intent(in) :: j, k, l
+        real(wp), intent(out) :: D_xx, D_yy, D_zz, D_xy, D_xz, D_yz
+
+        integer :: j_lo, j_hi, k_lo, k_hi, l_lo, l_hi
+
+        j_lo = idwbuff(1)%beg
+        j_hi = idwbuff(1)%end
+        k_lo = idwbuff(2)%beg
+        k_hi = idwbuff(2)%end
+        l_lo = idwbuff(3)%beg
+        l_hi = idwbuff(3)%end
+
+        ! Check bounds
+        if (.not. ((j >= j_lo) .and. (j <= j_hi) .and. &
+                   (k >= k_lo) .and. (k <= k_hi) .and. &
+                   (l >= l_lo) .and. (l <= l_hi))) then
+            D_xx = 0._wp; D_yy = 0._wp; D_zz = 0._wp
+            D_xy = 0._wp; D_xz = 0._wp; D_yz = 0._wp
+            return
+        end if
+
+        ! D_xx = du/dx
+        if (j - 1 >= j_lo .and. j + 1 <= j_hi) then
+            D_xx = (q_prim_vf(momxb)%sf(j + 1, k, l) - &
+                    q_prim_vf(momxb)%sf(j - 1, k, l))/ &
+                   (x_cc(j + 1) - x_cc(j - 1))
+        else if (j + 1 <= j_hi) then
+            D_xx = (q_prim_vf(momxb)%sf(j + 1, k, l) - &
+                    q_prim_vf(momxb)%sf(j, k, l))/ &
+                   (x_cc(j + 1) - x_cc(j))
+        else if (j - 1 >= j_lo) then
+            D_xx = (q_prim_vf(momxb)%sf(j, k, l) - &
+                    q_prim_vf(momxb)%sf(j - 1, k, l))/ &
+                   (x_cc(j) - x_cc(j - 1))
+        else
+            D_xx = 0._wp
+        end if
+
+        ! D_yy = dv/dy (2D and 3D only)
+        if (n > 0) then
+            if (k - 1 >= k_lo .and. k + 1 <= k_hi) then
+                D_yy = (q_prim_vf(momxb + 1)%sf(j, k + 1, l) - &
+                        q_prim_vf(momxb + 1)%sf(j, k - 1, l))/ &
+                       (y_cc(k + 1) - y_cc(k - 1))
+            else if (k + 1 <= k_hi) then
+                D_yy = (q_prim_vf(momxb + 1)%sf(j, k + 1, l) - &
+                        q_prim_vf(momxb + 1)%sf(j, k, l))/ &
+                       (y_cc(k + 1) - y_cc(k))
+            else if (k - 1 >= k_lo) then
+                D_yy = (q_prim_vf(momxb + 1)%sf(j, k, l) - &
+                        q_prim_vf(momxb + 1)%sf(j, k - 1, l))/ &
+                       (y_cc(k) - y_cc(k - 1))
+            else
+                D_yy = 0._wp
+            end if
+        else
+            D_yy = 0._wp
+        end if
+
+        ! D_zz = dw/dz (3D only)
+        if (p > 0) then
+            if (l - 1 >= l_lo .and. l + 1 <= l_hi) then
+                D_zz = (q_prim_vf(momxb + 2)%sf(j, k, l + 1) - &
+                        q_prim_vf(momxb + 2)%sf(j, k, l - 1))/ &
+                       (z_cc(l + 1) - z_cc(l - 1))
+            else if (l + 1 <= l_hi) then
+                D_zz = (q_prim_vf(momxb + 2)%sf(j, k, l + 1) - &
+                        q_prim_vf(momxb + 2)%sf(j, k, l))/ &
+                       (z_cc(l + 1) - z_cc(l))
+            else if (l - 1 >= l_lo) then
+                D_zz = (q_prim_vf(momxb + 2)%sf(j, k, l) - &
+                        q_prim_vf(momxb + 2)%sf(j, k, l - 1))/ &
+                       (z_cc(l) - z_cc(l - 1))
+            else
+                D_zz = 0._wp
+            end if
+        else
+            D_zz = 0._wp
+        end if
+
+        ! D_xy = 0.5*(du/dy + dv/dx) (2D and 3D only)
+        if (n > 0) then
+            if (j - 1 >= j_lo .and. j + 1 <= j_hi .and. &
+                k - 1 >= k_lo .and. k + 1 <= k_hi) then
+                D_xy = 0.5_wp*( &
+                    (q_prim_vf(momxb)%sf(j, k + 1, l) - &
+                     q_prim_vf(momxb)%sf(j, k - 1, l))/ &
+                    (y_cc(k + 1) - y_cc(k - 1)) + &
+                    (q_prim_vf(momxb + 1)%sf(j + 1, k, l) - &
+                     q_prim_vf(momxb + 1)%sf(j - 1, k, l))/ &
+                    (x_cc(j + 1) - x_cc(j - 1)))
+            else
+                D_xy = 0._wp
+                if (k - 1 >= k_lo .and. k + 1 <= k_hi) then
+                    D_xy = 0.5_wp*(q_prim_vf(momxb)%sf(j, k + 1, l) - &
+                                    q_prim_vf(momxb)%sf(j, k - 1, l))/ &
+                                   (y_cc(k + 1) - y_cc(k - 1))
+                end if
+                if (j - 1 >= j_lo .and. j + 1 <= j_hi) then
+                    D_xy = D_xy + 0.5_wp* &
+                        (q_prim_vf(momxb + 1)%sf(j + 1, k, l) - &
+                         q_prim_vf(momxb + 1)%sf(j - 1, k, l))/ &
+                        (x_cc(j + 1) - x_cc(j - 1))
+                end if
+            end if
+        else
+            D_xy = 0._wp
+        end if
+
+        ! D_xz = 0.5*(du/dz + dw/dx) (3D only)
+        if (p > 0) then
+            if (j - 1 >= j_lo .and. j + 1 <= j_hi .and. &
+                l - 1 >= l_lo .and. l + 1 <= l_hi) then
+                D_xz = 0.5_wp*( &
+                    (q_prim_vf(momxb)%sf(j, k, l + 1) - &
+                     q_prim_vf(momxb)%sf(j, k, l - 1))/ &
+                    (z_cc(l + 1) - z_cc(l - 1)) + &
+                    (q_prim_vf(momxb + 2)%sf(j + 1, k, l) - &
+                     q_prim_vf(momxb + 2)%sf(j - 1, k, l))/ &
+                    (x_cc(j + 1) - x_cc(j - 1)))
+            else
+                D_xz = 0._wp
+            end if
+        else
+            D_xz = 0._wp
+        end if
+
+        ! D_yz = 0.5*(dv/dz + dw/dy) (3D only)
+        if (p > 0 .and. n > 0) then
+            if (k - 1 >= k_lo .and. k + 1 <= k_hi .and. &
+                l - 1 >= l_lo .and. l + 1 <= l_hi) then
+                D_yz = 0.5_wp*( &
+                    (q_prim_vf(momxb + 1)%sf(j, k, l + 1) - &
+                     q_prim_vf(momxb + 1)%sf(j, k, l - 1))/ &
+                    (z_cc(l + 1) - z_cc(l - 1)) + &
+                    (q_prim_vf(momxb + 2)%sf(j, k + 1, l) - &
+                     q_prim_vf(momxb + 2)%sf(j, k - 1, l))/ &
+                    (y_cc(k + 1) - y_cc(k - 1)))
+            else
+                D_yz = 0._wp
+            end if
+        else
+            D_yz = 0._wp
+        end if
+
+    end subroutine s_compute_velocity_gradients_at_cell
+
+    !> Computes Re_visc per-phase for both Newtonian and non-Newtonian fluids.
+    !!      Re_visc = 1/mu for each phase in each direction (shear and bulk).
+    !! @param q_prim_vf Primitive variables
+    !! @param alpha_visc Volume fractions
+    !! @param j x index
+    !! @param k y index
+    !! @param l z index
+    !! @param Re_visc_per_phase Output: 1/mu per fluid per direction
+    !! @param grad_x_vf Optional pre-computed x-direction gradients
+    !! @param grad_y_vf Optional pre-computed y-direction gradients
+    !! @param grad_z_vf Optional pre-computed z-direction gradients
+    pure subroutine s_compute_re_visc(q_prim_vf, alpha_visc, j, k, l, &
+                                      Re_visc_per_phase, grad_x_vf, grad_y_vf, grad_z_vf)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: alpha_visc
+        #:else
+            real(wp), dimension(num_fluids), intent(in) :: alpha_visc
+        #:endif
+        integer, intent(in) :: j, k, l
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3, 2), intent(out) :: Re_visc_per_phase
+        #:else
+            real(wp), dimension(num_fluids, 2), intent(out) :: Re_visc_per_phase
+        #:endif
+        type(scalar_field), dimension(:), intent(in), optional :: &
+            grad_x_vf, grad_y_vf, grad_z_vf
+
+        real(wp) :: D_xx, D_yy, D_zz, D_xy, D_xz, D_yz
+        real(wp) :: shear_rate, mu_fluid
+        integer :: i, q, r
+
+        ! Initialize all to default (inviscid)
+        $:GPU_LOOP(parallelism='[seq]')
+        do i = 1, 2
+            $:GPU_LOOP(parallelism='[seq]')
+            do q = 1, num_fluids
+                Re_visc_per_phase(q, i) = dflt_real
+            end do
+        end do
+
+        if (any_non_newtonian) then
+            ! Non-Newtonian path: compute velocity gradients and shear rate
+            if (present(grad_x_vf) .and. present(grad_y_vf) .and. &
+                present(grad_z_vf)) then
+                ! Use provided gradients
+                D_xx = grad_x_vf(1)%sf(j, k, l)
+                if (n > 0) then
+                    D_yy = grad_y_vf(2)%sf(j, k, l)
+                    D_xy = 0.5_wp*(grad_y_vf(1)%sf(j, k, l) + &
+                                    grad_x_vf(2)%sf(j, k, l))
+                else
+                    D_yy = 0._wp; D_xy = 0._wp
+                end if
+                if (p > 0) then
+                    D_zz = grad_z_vf(3)%sf(j, k, l)
+                    D_xz = 0.5_wp*(grad_z_vf(1)%sf(j, k, l) + &
+                                    grad_x_vf(3)%sf(j, k, l))
+                    D_yz = 0.5_wp*(grad_z_vf(2)%sf(j, k, l) + &
+                                    grad_y_vf(3)%sf(j, k, l))
+                else
+                    D_zz = 0._wp; D_xz = 0._wp; D_yz = 0._wp
+                end if
+            else
+                ! Compute gradients from primitive variables
+                call s_compute_velocity_gradients_at_cell( &
+                    q_prim_vf, j, k, l, D_xx, D_yy, D_zz, D_xy, D_xz, D_yz)
+            end if
+
+            ! Compute shear rate
+            shear_rate = f_compute_shear_rate_from_components( &
+                D_xx, D_yy, D_zz, D_xy, D_xz, D_yz)
+
+            ! For each phase, compute Re_visc
+            $:GPU_LOOP(parallelism='[seq]')
+            do q = 1, num_fluids
+                if (fluid_pp(q)%non_newtonian) then
+                    ! Non-Newtonian: compute shear mu from HB model
+                    mu_fluid = f_compute_hb_viscosity( &
+                        fluid_pp(q)%tau0, fluid_pp(q)%K, &
+                        fluid_pp(q)%nn, fluid_pp(q)%mu_min, &
+                        fluid_pp(q)%mu_max, shear_rate, &
+                        fluid_pp(q)%hb_m)
+                    Re_visc_per_phase(q, 1) = 1._wp/mu_fluid
+                    ! Bulk viscosity
+                    if (fluid_pp(q)%mu_bulk > 0._wp) then
+                        Re_visc_per_phase(q, 2) = 1._wp/fluid_pp(q)%mu_bulk
+                    else
+                        Re_visc_per_phase(q, 2) = dflt_real
+                    end if
+                else
+                    ! Newtonian: return Re input values
+                    $:GPU_LOOP(parallelism='[seq]')
+                    do i = 1, 2
+                        if (Re_size(i) > 0) then
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do r = 1, Re_size(i)
+                                if (Re_idx(i, r) == q) then
+                                    Re_visc_per_phase(q, i) = fluid_pp(q)%Re(i)
+                                    exit
+                                end if
+                            end do
+                        end if
+                    end do
+                end if
+            end do
+        else
+            ! All Newtonian: return Re input values
+            $:GPU_LOOP(parallelism='[seq]')
+            do i = 1, 2
+                $:GPU_LOOP(parallelism='[seq]')
+                do q = 1, Re_size(i)
+                    Re_visc_per_phase(Re_idx(i, q), i) = &
+                        fluid_pp(Re_idx(i, q))%Re(i)
+                end do
+            end do
+        end if
+
+    end subroutine s_compute_re_visc
+
+    !> Computes mixture Reynolds number from per-phase values and volume fractions.
+    !!      Re_mix(i) = 1 / sum_q(alpha(q) / Re_per_phase(q, i))
+    !! @param alpha Volume fractions
+    !! @param Re_per_phase Per-phase Re_visc = 1/mu
+    !! @param Re_mix Output: mixture Re (shear and bulk)
+    pure subroutine s_compute_mixture_re(alpha, Re_per_phase, Re_mix)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: alpha
+            real(wp), dimension(3, 2), intent(in) :: Re_per_phase
+        #:else
+            real(wp), dimension(num_fluids), intent(in) :: alpha
+            real(wp), dimension(num_fluids, 2), intent(in) :: Re_per_phase
+        #:endif
+        real(wp), dimension(2), intent(out) :: Re_mix
+
+        integer :: i, q
+
+        $:GPU_LOOP(parallelism='[seq]')
+        do i = 1, 2
+            Re_mix(i) = 0._wp
+            $:GPU_LOOP(parallelism='[seq]')
+            do q = 1, num_fluids
+                if (Re_per_phase(q, i) /= dflt_real &
+                    .and. Re_per_phase(q, i) > sgm_eps) then
+                    Re_mix(i) = Re_mix(i) + alpha(q)/Re_per_phase(q, i)
+                end if
+            end do
+            Re_mix(i) = 1._wp/max(Re_mix(i), sgm_eps)
+        end do
+
+    end subroutine s_compute_mixture_re
+
+end module m_re_visc
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 5a8d3a118a..93eade8a09 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -1,5257 +1,5272 @@
-!>
-!! @file
-!! @brief Contains module m_riemann_solvers
-
-!> @brief Approximate and exact Riemann solvers (HLL, HLLC, HLLD, exact) for the multicomponent Navier--Stokes equations
-
-#:include 'case.fpp'
-#:include 'macros.fpp'
-#:include 'inline_riemann.fpp'
-
-module m_riemann_solvers
-
-    use m_derived_types        !< Definitions of the derived types
-
-    use m_global_parameters    !< Definitions of the global parameters
-
-    use m_mpi_proxy            !< Message passing interface (MPI) module proxy
-
-    use m_variables_conversion !< State variables type conversion procedures
-
-    use m_bubbles              !< To get the bubble wall pressure function
-
-    use m_bubbles_EE
-
-    use m_surface_tension      !< To get the capillary fluxes
-
-    use m_helper_basic         !< Functions to compare floating point numbers
-
-    use m_chemistry
-
-    use m_thermochem, only: &
-        gas_constant, get_mixture_molecular_weight, &
-        get_mixture_specific_heat_cv_mass, get_mixture_energy_mass, &
-        get_species_specific_heats_r, get_species_enthalpies_rt, &
-        get_mixture_specific_heat_cp_mass
-
-    #:if USING_AMD
-        use m_chemistry, only: molecular_weights_nonparameter
-    #:endif
-
-    implicit none
-
-    private; public :: s_initialize_riemann_solvers_module, &
- s_riemann_solver, &
- s_hll_riemann_solver, &
- s_hllc_riemann_solver, &
- s_hlld_riemann_solver, &
- s_lf_riemann_solver, &
- s_finalize_riemann_solvers_module
-
-    !> The cell-boundary values of the fluxes (src - source) that are computed
-    !! through the chosen Riemann problem solver, and the direct evaluation of
-    !! source terms, by using the left and right states given in qK_prim_rs_vf,
-    !! dqK_prim_ds_vf where ds = dx, dy or dz.
-    !> @{
-
-    real(wp), allocatable, dimension(:, :, :, :) :: flux_rsx_vf, flux_src_rsx_vf
-    real(wp), allocatable, dimension(:, :, :, :) :: flux_rsy_vf, flux_src_rsy_vf
-    real(wp), allocatable, dimension(:, :, :, :) :: flux_rsz_vf, flux_src_rsz_vf
-    $:GPU_DECLARE(create='[flux_rsx_vf,flux_src_rsx_vf,flux_rsy_vf,flux_src_rsy_vf,flux_rsz_vf,flux_src_rsz_vf]')
-    !> @}
-
-    !> The cell-boundary values of the geometrical source flux that are computed
-    !! through the chosen Riemann problem solver by using the left and right
-    !! states given in qK_prim_rs_vf. Currently 2D axisymmetric for inviscid only.
-    !> @{
-
-    real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsx_vf !<
-    real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsy_vf !<
-    real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsz_vf !<
-    $:GPU_DECLARE(create='[flux_gsrc_rsx_vf,flux_gsrc_rsy_vf,flux_gsrc_rsz_vf]')
-    !> @}
-
-    ! The cell-boundary values of the velocity. vel_src_rs_vf is determined as
-    ! part of Riemann problem solution and is used to evaluate the source flux.
-
-    real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsx_vf
-    real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsy_vf
-    real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsz_vf
-    $:GPU_DECLARE(create='[vel_src_rsx_vf,vel_src_rsy_vf,vel_src_rsz_vf]')
-
-    real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsx_vf
-    real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsy_vf
-    real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsz_vf
-    $:GPU_DECLARE(create='[mom_sp_rsx_vf,mom_sp_rsy_vf,mom_sp_rsz_vf]')
-
-    real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsx_vf
-    real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsy_vf
-    real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsz_vf
-    $:GPU_DECLARE(create='[Re_avg_rsx_vf,Re_avg_rsy_vf,Re_avg_rsz_vf]')
-
-    !> @name Indical bounds in the s1-, s2- and s3-directions
-    !> @{
-    type(int_bounds_info) :: is1, is2, is3
-    type(int_bounds_info) :: isx, isy, isz
-    !> @}
-
-    $:GPU_DECLARE(create='[is1,is2,is3,isx,isy,isz]')
-
-    real(wp), allocatable, dimension(:) :: Gs_rs
-    $:GPU_DECLARE(create='[Gs_rs]')
-
-    real(wp), allocatable, dimension(:, :) :: Res_gs
-    $:GPU_DECLARE(create='[Res_gs]')
-
-contains
-
-    !> Dispatch to the subroutines that are utilized to compute the
-        !! Riemann problem solution. For additional information please reference:
-        !!                        1) s_hll_riemann_solver
-        !!                        2) s_hllc_riemann_solver
-        !!                        3) s_exact_riemann_solver
-        !!                        4) s_hlld_riemann_solver
-        !!  @param qL_prim_rsx_vf Left WENO-reconstructed cell-boundary values (x-dir)
-        !!  @param qL_prim_rsy_vf Left WENO-reconstructed cell-boundary values (y-dir)
-        !!  @param qL_prim_rsz_vf Left WENO-reconstructed cell-boundary values (z-dir)
-        !!  @param dqL_prim_dx_vf The  left WENO-reconstructed cell-boundary values of the
-        !!      first-order x-dir spatial derivatives
-        !!  @param dqL_prim_dy_vf The  left WENO-reconstructed cell-boundary values of the
-        !!      first-order y-dir spatial derivatives
-        !!  @param dqL_prim_dz_vf The  left WENO-reconstructed cell-boundary values of the
-        !!      first-order z-dir spatial derivatives
-        !!  @param qL_prim_vf The  left WENO-reconstructed cell-boundary values of the
-        !!      cell-average primitive variables
-        !!  @param qR_prim_rsx_vf Right WENO-reconstructed cell-boundary values (x-dir)
-        !!  @param qR_prim_rsy_vf Right WENO-reconstructed cell-boundary values (y-dir)
-        !!  @param qR_prim_rsz_vf Right WENO-reconstructed cell-boundary values (z-dir)
-        !!  @param dqR_prim_dx_vf The right WENO-reconstructed cell-boundary values of the
-        !!      first-order x-dir spatial derivatives
-        !!  @param dqR_prim_dy_vf The right WENO-reconstructed cell-boundary values of the
-        !!      first-order y-dir spatial derivatives
-        !!  @param dqR_prim_dz_vf The right WENO-reconstructed cell-boundary values of the
-        !!      first-order z-dir spatial derivatives
-        !!  @param qR_prim_vf The right WENO-reconstructed cell-boundary values of the
-        !!      cell-average primitive variables
-        !!  @param q_prim_vf Cell-averaged primitive variables
-        !!  @param flux_vf Intra-cell fluxes
-        !!  @param flux_src_vf Intra-cell fluxes sources
-        !!  @param flux_gsrc_vf Intra-cell geometric fluxes sources
-        !!  @param norm_dir Dir. splitting direction
-        !!  @param ix Index bounds in the x-dir
-        !!  @param iy Index bounds in the y-dir
-        !!  @param iz Index bounds in the z-dir
-    subroutine s_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
-                                dqL_prim_dy_vf, &
-                                dqL_prim_dz_vf, &
-                                qL_prim_vf, &
-                                qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
-                                dqR_prim_dy_vf, &
-                                dqR_prim_dz_vf, &
-                                qR_prim_vf, &
-                                q_prim_vf, &
-                                flux_vf, flux_src_vf, &
-                                flux_gsrc_vf, &
-                                norm_dir, ix, iy, iz)
-
-        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(INOUT) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
-        type(scalar_field), dimension(sys_size), intent(IN) :: q_prim_vf
-
-        type(scalar_field), allocatable, dimension(:), intent(INOUT) :: qL_prim_vf, qR_prim_vf
-
-        type(scalar_field), &
-            allocatable, dimension(:), &
-            intent(INOUT) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
-                             dqL_prim_dy_vf, dqR_prim_dy_vf, &
-                             dqL_prim_dz_vf, dqR_prim_dz_vf
-
-        type(scalar_field), &
-            dimension(sys_size), &
-            intent(INOUT) :: flux_vf, flux_src_vf, flux_gsrc_vf
-
-        integer, intent(IN) :: norm_dir
-
-        type(int_bounds_info), intent(IN) :: ix, iy, iz
-
-        #:for NAME, NUM in [('hll', 1), ('hllc', 2), ('hlld', 4), ('lf', 5)]
-            if (riemann_solver == ${NUM}$) then
-                call s_${NAME}$_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
-                                               dqL_prim_dy_vf, &
-                                               dqL_prim_dz_vf, &
-                                               qL_prim_vf, &
-                                               qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
-                                               dqR_prim_dy_vf, &
-                                               dqR_prim_dz_vf, &
-                                               qR_prim_vf, &
-                                               q_prim_vf, &
-                                               flux_vf, flux_src_vf, &
-                                               flux_gsrc_vf, &
-                                               norm_dir, ix, iy, iz)
-            end if
-        #:endfor
-
-    end subroutine s_riemann_solver
-
-    !> Dispatch to the subroutines that are utilized to compute
-        !! the viscous source fluxes for either Cartesian or cylindrical geometries.
-        !! For more information please refer to:
-        !!      1) s_compute_cartesian_viscous_source_flux
-        !!      2) s_compute_cylindrical_viscous_source_flux
-    subroutine s_compute_viscous_source_flux(velL_vf, &
-                                             dvelL_dx_vf, &
-                                             dvelL_dy_vf, &
-                                             dvelL_dz_vf, &
-                                             velR_vf, &
-                                             dvelR_dx_vf, &
-                                             dvelR_dy_vf, &
-                                             dvelR_dz_vf, &
-                                             flux_src_vf, &
-                                             norm_dir, &
-                                             ix, iy, iz)
-
-        type(scalar_field), &
-            dimension(num_vels), &
-            intent(IN) :: velL_vf, velR_vf, &
-                          dvelL_dx_vf, dvelR_dx_vf, &
-                          dvelL_dy_vf, dvelR_dy_vf, &
-                          dvelL_dz_vf, dvelR_dz_vf
-
-        type(scalar_field), &
-            dimension(sys_size), &
-            intent(INOUT) :: flux_src_vf
-
-        integer, intent(IN) :: norm_dir
-
-        type(int_bounds_info), intent(IN) :: ix, iy, iz
-
-        if (grid_geometry == 3) then
-            call s_compute_cylindrical_viscous_source_flux(velL_vf, &
-                                                           dvelL_dx_vf, &
-                                                           dvelL_dy_vf, &
-                                                           dvelL_dz_vf, &
-                                                           velR_vf, &
-                                                           dvelR_dx_vf, &
-                                                           dvelR_dy_vf, &
-                                                           dvelR_dz_vf, &
-                                                           flux_src_vf, &
-                                                           norm_dir, &
-                                                           ix, iy, iz)
-        else
-            call s_compute_cartesian_viscous_source_flux(dvelL_dx_vf, &
-                                                         dvelL_dy_vf, &
-                                                         dvelL_dz_vf, &
-                                                         dvelR_dx_vf, &
-                                                         dvelR_dy_vf, &
-                                                         dvelR_dz_vf, &
-                                                         flux_src_vf, &
-                                                         norm_dir)
-        end if
-    end subroutine s_compute_viscous_source_flux
-
-    !> @brief Computes intercell fluxes using the Harten-Lax-van Leer (HLL) approximate Riemann solver.
-    subroutine s_hll_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
-                                    dqL_prim_dy_vf, &
-                                    dqL_prim_dz_vf, &
-                                    qL_prim_vf, &
-                                    qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
-                                    dqR_prim_dy_vf, &
-                                    dqR_prim_dz_vf, &
-                                    qR_prim_vf, &
-                                    q_prim_vf, &
-                                    flux_vf, flux_src_vf, &
-                                    flux_gsrc_vf, &
-                                    norm_dir, ix, iy, iz)
-
-        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
-
-        type(scalar_field), allocatable, dimension(:), intent(inout) :: qL_prim_vf, qR_prim_vf
-
-        type(scalar_field), &
-            allocatable, dimension(:), &
-            intent(inout) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
-                             dqL_prim_dy_vf, dqR_prim_dy_vf, &
-                             dqL_prim_dz_vf, dqR_prim_dz_vf
-
-        ! Intercell fluxes
-        type(scalar_field), &
-            dimension(sys_size), &
-            intent(inout) :: flux_vf, flux_src_vf, flux_gsrc_vf
-        real(wp) :: flux_tau_L, flux_tau_R
-
-        integer, intent(in) :: norm_dir
-        type(int_bounds_info), intent(in) :: ix, iy, iz
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
-            real(wp), dimension(3) :: vel_L, vel_R
-            real(wp), dimension(3) :: alpha_L, alpha_R
-            real(wp), dimension(10) :: Ys_L, Ys_R
-            real(wp), dimension(10) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
-            real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
-        #:else
-            real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
-            real(wp), dimension(num_vels) :: vel_L, vel_R
-            real(wp), dimension(num_fluids) :: alpha_L, alpha_R
-            real(wp), dimension(num_species) :: Ys_L, Ys_R
-            real(wp), dimension(num_species) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
-            real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
-        #:endif
-        real(wp) :: rho_L, rho_R
-        real(wp) :: pres_L, pres_R
-        real(wp) :: E_L, E_R
-        real(wp) :: H_L, H_R
-        real(wp) :: Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi
-        real(wp) :: T_L, T_R
-        real(wp) :: Y_L, Y_R
-        real(wp) :: MW_L, MW_R
-        real(wp) :: R_gas_L, R_gas_R
-        real(wp) :: Cp_L, Cp_R
-        real(wp) :: Cv_L, Cv_R
-        real(wp) :: Gamm_L, Gamm_R
-        real(wp) :: gamma_L, gamma_R
-        real(wp) :: pi_inf_L, pi_inf_R
-        real(wp) :: qv_L, qv_R
-        real(wp) :: c_L, c_R
-        real(wp), dimension(6) :: tau_e_L, tau_e_R
-        real(wp) :: G_L, G_R
-        real(wp), dimension(2) :: Re_L, Re_R
-        real(wp), dimension(3) :: xi_field_L, xi_field_R
-
-        real(wp) :: rho_avg
-        real(wp) :: H_avg
-        real(wp) :: qv_avg
-        real(wp) :: gamma_avg
-        real(wp) :: c_avg
-
-        real(wp) :: s_L, s_R, s_M, s_P, s_S
-        real(wp) :: xi_M, xi_P
-
-        real(wp) :: ptilde_L, ptilde_R
-        real(wp) :: vel_L_rms, vel_R_rms, vel_avg_rms
-        real(wp) :: vel_L_tmp, vel_R_tmp
-        real(wp) :: Ms_L, Ms_R, pres_SL, pres_SR
-        real(wp) :: alpha_L_sum, alpha_R_sum
-        real(wp) :: zcoef, pcorr !< low Mach number correction
-
-        type(riemann_states) :: c_fast, pres_mag
-        type(riemann_states_vec3) :: B
-
-        type(riemann_states) :: Ga ! Gamma (Lorentz factor)
-        type(riemann_states) :: vdotB, B2
-        type(riemann_states_vec3) :: b4 ! 4-magnetic field components (spatial: b4x, b4y, b4z)
-        type(riemann_states_vec3) :: cm ! Conservative momentum variables
-
-        integer :: i, j, k, l, q !< Generic loop iterators
-
-        ! Populating the buffers of the left and right Riemann problem
-        ! states variables, based on the choice of boundary conditions
-        call s_populate_riemann_states_variables_buffers( &
-            qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
-            dqL_prim_dy_vf, &
-            dqL_prim_dz_vf, &
-            qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
-            dqR_prim_dy_vf, &
-            dqR_prim_dz_vf, &
-            norm_dir, ix, iy, iz)
-
-        ! Reshaping inputted data based on dimensional splitting direction
-        call s_initialize_riemann_solver( &
-            flux_src_vf, &
-            norm_dir)
-        #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
-
-            if (norm_dir == ${NORM_DIR}$) then
-                $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,q,alpha_rho_L,alpha_rho_R,vel_L,vel_R,alpha_L,alpha_R,tau_e_L,tau_e_R,Re_L,Re_R,s_L,s_R,s_S,Ys_L,Ys_R,xi_field_L, xi_field_R, Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, c_fast, pres_mag, B, Ga, vdotB, B2, b4, cm, pcorr, zcoef, vel_L_tmp, vel_R_tmp, rho_L, rho_R, pres_L, pres_R, E_L, E_R, H_L, H_R, Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi, T_L, T_R, Y_L, Y_R, MW_L, MW_R, R_gas_L, R_gas_R, Cp_L, Cp_R, Cv_L, Cv_R, Gamm_L, Gamm_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, qv_L, qv_R, qv_avg, c_L, c_R, G_L, G_R, rho_avg, H_avg, c_avg, gamma_avg, ptilde_L, ptilde_R, vel_L_rms, vel_R_rms, vel_avg_rms, Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, alpha_R_sum, flux_tau_L, flux_tau_R]', copyin='[norm_dir]')
-                do l = is3%beg, is3%end
-                    do k = is2%beg, is2%end
-                        do j = is1%beg, is1%end
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, contxe
-                                alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-                            end do
-
-                            vel_L_rms = 0._wp; vel_R_rms = 0._wp
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_vels
-                                vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
-                                vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
-                                vel_L_rms = vel_L_rms + vel_L(i)**2._wp
-                                vel_R_rms = vel_R_rms + vel_R(i)**2._wp
-                            end do
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
-                                alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
-                            end do
-
-                            pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
-                            pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
-
-                            if (mhd) then
-                                if (n == 0) then ! 1D: constant Bx; By, Bz as variables
-                                    B%L(1) = Bx0
-                                    B%R(1) = Bx0
-                                    B%L(2) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg)
-                                    B%R(2) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg)
-                                    B%L(3) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)
-                                    B%R(3) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)
-                                else ! 2D/3D: Bx, By, Bz as variables
-                                    B%L(1) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg)
-                                    B%R(1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg)
-                                    B%L(2) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)
-                                    B%R(2) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)
-                                    B%L(3) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 2)
-                                    B%R(3) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 2)
-                                end if
-                            end if
-
-                            rho_L = 0._wp
-                            gamma_L = 0._wp
-                            pi_inf_L = 0._wp
-                            qv_L = 0._wp
-
-                            rho_R = 0._wp
-                            gamma_R = 0._wp
-                            pi_inf_R = 0._wp
-                            qv_R = 0._wp
-
-                            alpha_L_sum = 0._wp
-                            alpha_R_sum = 0._wp
-
-                            pres_mag%L = 0._wp
-                            pres_mag%R = 0._wp
-
-                            if (mpp_lim) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    alpha_rho_L(i) = max(0._wp, alpha_rho_L(i))
-                                    alpha_L(i) = min(max(0._wp, alpha_L(i)), 1._wp)
-                                    alpha_L_sum = alpha_L_sum + alpha_L(i)
-                                    alpha_rho_R(i) = max(0._wp, alpha_rho_R(i))
-                                    alpha_R(i) = min(max(0._wp, alpha_R(i)), 1._wp)
-                                    alpha_R_sum = alpha_R_sum + alpha_R(i)
-                                end do
-
-                                alpha_L = alpha_L/max(alpha_L_sum, sgm_eps)
-                                alpha_R = alpha_R/max(alpha_R_sum, sgm_eps)
-                            end if
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                rho_L = rho_L + alpha_rho_L(i)
-                                gamma_L = gamma_L + alpha_L(i)*gammas(i)
-                                pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i)
-                                qv_L = qv_L + alpha_rho_L(i)*qvs(i)
-
-                                rho_R = rho_R + alpha_rho_R(i)
-                                gamma_R = gamma_R + alpha_R(i)*gammas(i)
-                                pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i)
-                                qv_R = qv_R + alpha_rho_R(i)*qvs(i)
-                            end do
-
-                            if (viscous) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, 2
-                                    Re_L(i) = dflt_real
-                                    Re_R(i) = dflt_real
-
-                                    if (Re_size(i) > 0) Re_L(i) = 0._wp
-                                    if (Re_size(i) > 0) Re_R(i) = 0._wp
-
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do q = 1, Re_size(i)
-                                        Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) &
-                                                  + Re_L(i)
-                                        Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) &
-                                                  + Re_R(i)
-                                    end do
-
-                                    Re_L(i) = 1._wp/max(Re_L(i), sgm_eps)
-                                    Re_R(i) = 1._wp/max(Re_R(i), sgm_eps)
-                                end do
-                            end if
-
-                            if (chemistry) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = chemxb, chemxe
-                                    Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                    Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-                                end do
-
-                                call get_mixture_molecular_weight(Ys_L, MW_L)
-                                call get_mixture_molecular_weight(Ys_R, MW_R)
-                                #:if USING_AMD
-                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
-                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
-                                #:else
-                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
-                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:)
-                                #:endif
-
-                                R_gas_L = gas_constant/MW_L
-                                R_gas_R = gas_constant/MW_R
-                                T_L = pres_L/rho_L/R_gas_L
-                                T_R = pres_R/rho_R/R_gas_R
-
-                                call get_species_specific_heats_r(T_L, Cp_iL)
-                                call get_species_specific_heats_r(T_R, Cp_iR)
-
-                                if (chem_params%gamma_method == 1) then
-                                    ! gamma_method = 1: Ref. Section 2.3.1 Formulation of doi:10.7907/ZKW8-ES97.
-                                    Gamma_iL = Cp_iL/(Cp_iL - 1.0_wp)
-                                    Gamma_iR = Cp_iR/(Cp_iR - 1.0_wp)
-
-                                    gamma_L = sum(Xs_L(:)/(Gamma_iL(:) - 1.0_wp))
-                                    gamma_R = sum(Xs_R(:)/(Gamma_iR(:) - 1.0_wp))
-                                else if (chem_params%gamma_method == 2) then
-                                    ! gamma_method = 2: c_p / c_v where c_p, c_v are specific heats.
-                                    call get_mixture_specific_heat_cp_mass(T_L, Ys_L, Cp_L)
-                                    call get_mixture_specific_heat_cp_mass(T_R, Ys_R, Cp_R)
-                                    call get_mixture_specific_heat_cv_mass(T_L, Ys_L, Cv_L)
-                                    call get_mixture_specific_heat_cv_mass(T_R, Ys_R, Cv_R)
-
-                                    Gamm_L = Cp_L/Cv_L
-                                    gamma_L = 1.0_wp/(Gamm_L - 1.0_wp)
-                                    Gamm_R = Cp_R/Cv_R
-                                    gamma_R = 1.0_wp/(Gamm_R - 1.0_wp)
-                                end if
-
-                                call get_mixture_energy_mass(T_L, Ys_L, E_L)
-                                call get_mixture_energy_mass(T_R, Ys_R, E_R)
-
-                                E_L = rho_L*E_L + 5.e-1*rho_L*vel_L_rms
-                                E_R = rho_R*E_R + 5.e-1*rho_R*vel_R_rms
-                                H_L = (E_L + pres_L)/rho_L
-                                H_R = (E_R + pres_R)/rho_R
-                            elseif (mhd .and. relativity) then
-                                Ga%L = 1._wp/sqrt(1._wp - vel_L_rms)
-                                Ga%R = 1._wp/sqrt(1._wp - vel_R_rms)
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                    vdotB%L = vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3)
-                                    vdotB%R = vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3)
-
-                                    b4%L(1:3) = B%L(1:3)/Ga%L + Ga%L*vel_L(1:3)*vdotB%L
-                                    b4%R(1:3) = B%R(1:3)/Ga%R + Ga%R*vel_R(1:3)*vdotB%R
-                                    B2%L = B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp
-                                    B2%R = B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp
-                                #:endif
-
-                                pres_mag%L = 0.5_wp*(B2%L/Ga%L**2._wp + vdotB%L**2._wp)
-                                pres_mag%R = 0.5_wp*(B2%R/Ga%R**2._wp + vdotB%R**2._wp)
-
-                                ! Hard-coded EOS
-                                H_L = 1._wp + (gamma_L + 1)*pres_L/rho_L
-                                H_R = 1._wp + (gamma_R + 1)*pres_R/rho_R
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                    cm%L(1:3) = (rho_L*H_L*Ga%L**2 + B2%L)*vel_L(1:3) - vdotB%L*B%L(1:3)
-                                    cm%R(1:3) = (rho_R*H_R*Ga%R**2 + B2%R)*vel_R(1:3) - vdotB%R*B%R(1:3)
-                                #:endif
-
-                                E_L = rho_L*H_L*Ga%L**2 - pres_L + 0.5_wp*(B2%L + vel_L_rms*B2%L - vdotB%L**2._wp) - rho_L*Ga%L
-                                E_R = rho_R*H_R*Ga%R**2 - pres_R + 0.5_wp*(B2%R + vel_R_rms*B2%R - vdotB%R**2._wp) - rho_R*Ga%R
-                            elseif (mhd .and. .not. relativity) then
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                    pres_mag%L = 0.5_wp*(B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp)
-                                    pres_mag%R = 0.5_wp*(B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp)
-                                #:endif
-                                E_L = gamma_L*pres_L + pi_inf_L + 0.5_wp*rho_L*vel_L_rms + qv_L + pres_mag%L
-                                E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R + pres_mag%R ! includes magnetic energy
-                                H_L = (E_L + pres_L - pres_mag%L)/rho_L
-                                H_R = (E_R + pres_R - pres_mag%R)/rho_R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
-                            else
-                                E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L
-                                E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R
-                                H_L = (E_L + pres_L)/rho_L
-                                H_R = (E_R + pres_R)/rho_R
-                            end if
-
-                            ! elastic energy update
-                            if (hypoelasticity) then
-                                G_L = 0._wp; G_R = 0._wp
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    G_L = G_L + alpha_L(i)*Gs_rs(i)
-                                    G_R = G_R + alpha_R(i)*Gs_rs(i)
-                                end do
-
-                                if (cont_damage) then
-                                    G_L = G_L*max((1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, damage_idx)), 0._wp)
-                                    G_R = G_R*max((1._wp - qR_prim_rs${XYZ}$_vf(j, k, l, damage_idx)), 0._wp)
-                                end if
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, strxe - strxb + 1
-                                    tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
-                                    tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
-                                    ! Elastic contribution to energy if G large enough
-                                    !TODO take out if statement if stable without
-                                    if ((G_L > 1000) .and. (G_R > 1000)) then
-                                        E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
-                                        E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
-                                        ! Double for shear stresses
-                                        if (any(strxb - 1 + i == shear_indices)) then
-                                            E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
-                                            E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
-                                        end if
-                                    end if
-                                end do
-                            end if
-
-                            ! elastic energy update
-                            !if ( hyperelasticity ) then
-                            !    G_L = 0._wp
-                            !    G_R = 0._wp
-                            !
-                            !    $:GPU_LOOP(parallelism='[seq]')
-                            !    do i = 1, num_fluids
-                            !        G_L = G_L + alpha_L(i)*Gs_rs(i)
-                            !        G_R = G_R + alpha_R(i)*Gs_rs(i)
-                            !    end do
-                            !    ! Elastic contribution to energy if G large enough
-                            !    if ((G_L > 1.e-3_wp) .and. (G_R > 1.e-3_wp)) then
-                            !    E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
-                            !    E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
-                            !    $:GPU_LOOP(parallelism='[seq]')
-                            !    do i = 1, b_size-1
-                            !        tau_e_L(i) = G_L*qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
-                            !        tau_e_R(i) = G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
-                            !    end do
-                            !    $:GPU_LOOP(parallelism='[seq]')
-                            !    do i = 1, b_size-1
-                            !        tau_e_L(i) = 0._wp
-                            !        tau_e_R(i) = 0._wp
-                            !    end do
-                            !    $:GPU_LOOP(parallelism='[seq]')
-                            !    do i = 1, num_dims
-                            !        xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
-                            !        xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
-                            !    end do
-                            !    end if
-                            !end if
-
-                            @:compute_average_state()
-
-                            call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, &
-                                                          vel_L_rms, 0._wp, c_L, qv_L)
-
-                            call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, &
-                                                          vel_R_rms, 0._wp, c_R, qv_R)
-
-                            !> The computation of c_avg does not require all the variables, and therefore the non '_avg'
-                            ! variables are placeholders to call the subroutine.
-
-                            call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, &
-                                                          vel_avg_rms, c_sum_Yi_Phi, c_avg, qv_avg)
-
-                            if (mhd) then
-                                call s_compute_fast_magnetosonic_speed(rho_L, c_L, B%L, norm_dir, c_fast%L, H_L)
-                                call s_compute_fast_magnetosonic_speed(rho_R, c_R, B%R, norm_dir, c_fast%R, H_R)
-                            end if
-
-                            if (hyper_cleaning) then ! mhd
-                                c_fast%L = min(c_fast%L, -hyper_cleaning_speed)
-                                c_fast%R = max(c_fast%R, hyper_cleaning_speed)
-                            end if
-
-                            if (viscous) then
-                                if (chemistry) then
-                                    call compute_viscosity_and_inversion(T_L, Ys_L, T_R, Ys_R, Re_L(1), Re_R(1))
-                                end if
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, 2
-                                    Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
-                                end do
-                            end if
-
-                            if (wave_speeds == 1) then
-                                if (mhd) then
-                                    s_L = min(vel_L(dir_idx(1)) - c_fast%L, vel_R(dir_idx(1)) - c_fast%R)
-                                    s_R = max(vel_R(dir_idx(1)) + c_fast%R, vel_L(dir_idx(1)) + c_fast%L)
-                                elseif (hypoelasticity) then
-                                    s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + &
-                                                                       (((4._wp*G_L)/3._wp) + &
-                                                                        tau_e_L(dir_idx_tau(1)))/rho_L) &
-                                              , vel_R(dir_idx(1)) - sqrt(c_R*c_R + &
-                                                                         (((4._wp*G_R)/3._wp) + &
-                                                                          tau_e_R(dir_idx_tau(1)))/rho_R))
-                                    s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + &
-                                                                       (((4._wp*G_R)/3._wp) + &
-                                                                        tau_e_R(dir_idx_tau(1)))/rho_R) &
-                                              , vel_L(dir_idx(1)) + sqrt(c_L*c_L + &
-                                                                         (((4._wp*G_L)/3._wp) + &
-                                                                          tau_e_L(dir_idx_tau(1)))/rho_L))
-                                else if (hyperelasticity) then
-                                    s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + (4._wp*G_L/3._wp)/rho_L) &
-                                              , vel_R(dir_idx(1)) - sqrt(c_R*c_R + (4._wp*G_R/3._wp)/rho_R))
-                                    s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + (4._wp*G_R/3._wp)/rho_R) &
-                                              , vel_L(dir_idx(1)) + sqrt(c_L*c_L + (4._wp*G_L/3._wp)/rho_L))
-                                else
-                                    s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R)
-                                    s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L)
-                                end if
-
-                                s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* &
-                                       (s_L - vel_L(dir_idx(1))) - &
-                                       rho_R*vel_R(dir_idx(1))* &
-                                       (s_R - vel_R(dir_idx(1)))) &
-                                      /(rho_L*(s_L - vel_L(dir_idx(1))) - &
-                                        rho_R*(s_R - vel_R(dir_idx(1))))
-                            elseif (wave_speeds == 2) then
-                                pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* &
-                                                    (vel_L(dir_idx(1)) - &
-                                                     vel_R(dir_idx(1))))
-
-                                pres_SR = pres_SL
-
-                                Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* &
-                                                       (pres_SL/pres_L - 1._wp)*pres_L/ &
-                                                       ((pres_L + pi_inf_L/(1._wp + gamma_L)))))
-                                Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* &
-                                                       (pres_SR/pres_R - 1._wp)*pres_R/ &
-                                                       ((pres_R + pi_inf_R/(1._wp + gamma_R)))))
-
-                                s_L = vel_L(dir_idx(1)) - c_L*Ms_L
-                                s_R = vel_R(dir_idx(1)) + c_R*Ms_R
-
-                                s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + &
-                                                (pres_L - pres_R)/ &
-                                                (rho_avg*c_avg))
-                            end if
-
-                            s_M = min(0._wp, s_L); s_P = max(0._wp, s_R)
-
-                            xi_M = (5.e-1_wp + sign(5.e-1_wp, s_L)) &
-                                   + (5.e-1_wp - sign(5.e-1_wp, s_L)) &
-                                   *(5.e-1_wp + sign(5.e-1_wp, s_R))
-                            xi_P = (5.e-1_wp - sign(5.e-1_wp, s_R)) &
-                                   + (5.e-1_wp - sign(5.e-1_wp, s_L)) &
-                                   *(5.e-1_wp + sign(5.e-1_wp, s_R))
-
-                            ! Low Mach correction
-                            if (low_Mach == 1) then
-                                @:compute_low_Mach_correction()
-                            else
-                                pcorr = 0._wp
-                            end if
-
-                            ! Mass
-                            if (.not. relativity) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, contxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        (s_M*alpha_rho_R(i)*vel_R(norm_dir) &
-                                         - s_P*alpha_rho_L(i)*vel_L(norm_dir) &
-                                         + s_M*s_P*(alpha_rho_L(i) &
-                                                    - alpha_rho_R(i))) &
-                                        /(s_M - s_P)
-                                end do
-                            elseif (relativity) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, contxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        (s_M*Ga%R*alpha_rho_R(i)*vel_R(norm_dir) &
-                                         - s_P*Ga%L*alpha_rho_L(i)*vel_L(norm_dir) &
-                                         + s_M*s_P*(Ga%L*alpha_rho_L(i) &
-                                                    - Ga%R*alpha_rho_R(i))) &
-                                        /(s_M - s_P)
-                                end do
-                            end if
-
-                            ! Momentum
-                            if (mhd .and. (.not. relativity)) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, 3
-                                    ! Flux of rho*v_i in the ${XYZ}$ direction
-                                    ! = rho * v_i * v_${XYZ}$ - B_i * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + i) = &
-                                        (s_M*(rho_R*vel_R(i)*vel_R(norm_dir) &
-                                              - B%R(i)*B%R(norm_dir) &
-                                              + dir_flg(i)*(pres_R + pres_mag%R)) &
-                                         - s_P*(rho_L*vel_L(i)*vel_L(norm_dir) &
-                                                - B%L(i)*B%L(norm_dir) &
-                                                + dir_flg(i)*(pres_L + pres_mag%L)) &
-                                         + s_M*s_P*(rho_L*vel_L(i) - rho_R*vel_R(i))) &
-                                        /(s_M - s_P)
-                                end do
-                            elseif (mhd .and. relativity) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, 3
-                                    ! Flux of m_i in the ${XYZ}$ direction
-                                    ! = m_i * v_${XYZ}$ - b_i/Gamma * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + i) = &
-                                        (s_M*(cm%R(i)*vel_R(norm_dir) &
-                                              - b4%R(i)/Ga%R*B%R(norm_dir) &
-                                              + dir_flg(i)*(pres_R + pres_mag%R)) &
-                                         - s_P*(cm%L(i)*vel_L(norm_dir) &
-                                                - b4%L(i)/Ga%L*B%L(norm_dir) &
-                                                + dir_flg(i)*(pres_L + pres_mag%L)) &
-                                         + s_M*s_P*(cm%L(i) - cm%R(i))) &
-                                        /(s_M - s_P)
-                                end do
-                            elseif (bubbles_euler) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_vels
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
-                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
-                                              *vel_R(dir_idx(i)) &
-                                              + dir_flg(dir_idx(i))*(pres_R - ptilde_R)) &
-                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
-                                                *vel_L(dir_idx(i)) &
-                                                + dir_flg(dir_idx(i))*(pres_L - ptilde_L)) &
-                                         + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
-                                                    - rho_R*vel_R(dir_idx(i)))) &
-                                        /(s_M - s_P) &
-                                        + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
-                                end do
-                            else if (hypoelasticity) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_vels
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
-                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
-                                              *vel_R(dir_idx(i)) &
-                                              + dir_flg(dir_idx(i))*pres_R &
-                                              - tau_e_R(dir_idx_tau(i))) &
-                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
-                                                *vel_L(dir_idx(i)) &
-                                                + dir_flg(dir_idx(i))*pres_L &
-                                                - tau_e_L(dir_idx_tau(i))) &
-                                         + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
-                                                    - rho_R*vel_R(dir_idx(i)))) &
-                                        /(s_M - s_P)
-                                end do
-                            else
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_vels
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
-                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
-                                              *vel_R(dir_idx(i)) &
-                                              + dir_flg(dir_idx(i))*pres_R) &
-                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
-                                                *vel_L(dir_idx(i)) &
-                                                + dir_flg(dir_idx(i))*pres_L) &
-                                         + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
-                                                    - rho_R*vel_R(dir_idx(i)))) &
-                                        /(s_M - s_P) &
-                                        + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
-                                end do
-                            end if
-
-                            ! Energy
-                            if (mhd .and. (.not. relativity)) then
-                                ! energy flux = (E + p + p_mag) * v_${XYZ}$ - B_${XYZ}$ * (v_x*B_x + v_y*B_y + v_z*B_z)
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                    flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                        (s_M*(vel_R(norm_dir)*(E_R + pres_R + pres_mag%R) - B%R(norm_dir)*(vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3))) &
-                                         - s_P*(vel_L(norm_dir)*(E_L + pres_L + pres_mag%L) - B%L(norm_dir)*(vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3))) &
-                                         + s_M*s_P*(E_L - E_R)) &
-                                        /(s_M - s_P)
-                                #:endif
-                            elseif (mhd .and. relativity) then
-                                ! energy flux = m_${XYZ}$ - mass flux
-                                ! Hard-coded for single-component for now
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                    (s_M*(cm%R(norm_dir) - Ga%R*alpha_rho_R(1)*vel_R(norm_dir)) &
-                                     - s_P*(cm%L(norm_dir) - Ga%L*alpha_rho_L(1)*vel_L(norm_dir)) &
-                                     + s_M*s_P*(E_L - E_R)) &
-                                    /(s_M - s_P)
-                            else if (bubbles_euler) then
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                    (s_M*vel_R(dir_idx(1))*(E_R + pres_R - ptilde_R) &
-                                     - s_P*vel_L(dir_idx(1))*(E_L + pres_L - ptilde_L) &
-                                     + s_M*s_P*(E_L - E_R)) &
-                                    /(s_M - s_P) &
-                                    + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp
-                            else if (hypoelasticity) then
-                                flux_tau_L = 0._wp; flux_tau_R = 0._wp
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    flux_tau_L = flux_tau_L + tau_e_L(dir_idx_tau(i))*vel_L(dir_idx(i))
-                                    flux_tau_R = flux_tau_R + tau_e_R(dir_idx_tau(i))*vel_R(dir_idx(i))
-                                end do
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                    (s_M*(vel_R(dir_idx(1))*(E_R + pres_R) - flux_tau_R) &
-                                     - s_P*(vel_L(dir_idx(1))*(E_L + pres_L) - flux_tau_L) &
-                                     + s_M*s_P*(E_L - E_R))/(s_M - s_P)
-                            else
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                    (s_M*vel_R(dir_idx(1))*(E_R + pres_R) &
-                                     - s_P*vel_L(dir_idx(1))*(E_L + pres_L) &
-                                     + s_M*s_P*(E_L - E_R)) &
-                                    /(s_M - s_P) &
-                                    + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp
-                            end if
-
-                            ! Elastic Stresses
-                            if (hypoelasticity) then
-                                do i = 1, strxe - strxb + 1 !TODO: this indexing may be slow
-                                    flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
-                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
-                                              *tau_e_R(i)) &
-                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
-                                                *tau_e_L(i)) &
-                                         + s_M*s_P*(rho_L*tau_e_L(i) &
-                                                    - rho_R*tau_e_R(i))) &
-                                        /(s_M - s_P)
-                                end do
-                            end if
-
-                            ! Advection
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = advxb, advxe
-                                flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                    (qL_prim_rs${XYZ}$_vf(j, k, l, i) &
-                                     - qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)) &
-                                    *s_M*s_P/(s_M - s_P)
-                                flux_src_rs${XYZ}$_vf(j, k, l, i) = &
-                                    (s_M*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
-                                     - s_P*qL_prim_rs${XYZ}$_vf(j, k, l, i)) &
-                                    /(s_M - s_P)
-                            end do
-
-                            if (bubbles_euler) then
-                                ! From HLLC: Kills mass transport @ bubble gas density
-                                if (num_fluids > 1) then
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe) = 0._wp
-                                end if
-                            end if
-
-                            if (chemistry) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = chemxb, chemxe
-                                    Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                    Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = (s_M*Y_R*rho_R*vel_R(dir_idx(1)) &
-                                                                     - s_P*Y_L*rho_L*vel_L(dir_idx(1)) &
-                                                                     + s_M*s_P*(Y_L*rho_L - Y_R*rho_R)) &
-                                                                    /(s_M - s_P)
-                                    flux_src_rs${XYZ}$_vf(j, k, l, i) = 0._wp
-                                end do
-                            end if
-
-                            if (mhd) then
-                                if (n == 0) then ! 1D: d/dx flux only & Bx = Bx0 = const.
-                                    ! B_y flux = v_x * B_y - v_y * Bx0
-                                    ! B_z flux = v_x * B_z - v_z * Bx0
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 0, 1
-                                        flux_rsx_vf(j, k, l, B_idx%beg + i) = (s_M*(vel_R(1)*B%R(2 + i) - vel_R(2 + i)*Bx0) &
-                                                                               - s_P*(vel_L(1)*B%L(2 + i) - vel_L(2 + i)*Bx0) &
-                                                                               + s_M*s_P*(B%L(2 + i) - B%R(2 + i)))/(s_M - s_P)
-                                    end do
-                                else ! 2D/3D: Bx, By, Bz /= const. but zero flux component in the same direction
-                                    ! B_x d/d${XYZ}$ flux = (1 - delta(x,${XYZ}$)) * (v_${XYZ}$ * B_x - v_x * B_${XYZ}$)
-                                    ! B_y d/d${XYZ}$ flux = (1 - delta(y,${XYZ}$)) * (v_${XYZ}$ * B_y - v_y * B_${XYZ}$)
-                                    ! B_z d/d${XYZ}$ flux = (1 - delta(z,${XYZ}$)) * (v_${XYZ}$ * B_z - v_z * B_${XYZ}$)
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 0, 2
-                                        flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + i) = (s_M*(vel_R(dir_idx(1))*B%R(i + 1) - vel_R(i + 1)*B%R(norm_dir)) - &
-                                                                                     s_P*(vel_L(dir_idx(1))*B%L(i + 1) - vel_L(i + 1)*B%L(norm_dir)) + &
-                                                                                     s_M*s_P*(B%L(i + 1) - B%R(i + 1)))/(s_M - s_P)
-                                    end do
-
-                                    if (hyper_cleaning) then
-                                        ! propagate magnetic field divergence as a wave
-                                        flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + norm_dir - 1) = flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + norm_dir - 1) + &
-                                                                                               (s_M*qR_prim_rs${XYZ}$_vf(j + 1, k, l, psi_idx) - s_P*qL_prim_rs${XYZ}$_vf(j, k, l, psi_idx))/(s_M - s_P)
-
-                                        flux_rs${XYZ}$_vf(j, k, l, psi_idx) = (hyper_cleaning_speed**2*(s_M*B%R(norm_dir) - s_P*B%L(norm_dir)) + s_M*s_P*(qL_prim_rs${XYZ}$_vf(j, k, l, psi_idx) - qR_prim_rs${XYZ}$_vf(j + 1, k, l, psi_idx)))/(s_M - s_P)
-                                    else
-                                        flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + norm_dir - 1) = 0._wp ! Without hyperbolic cleaning, make sure flux of B_normal is identically zero
-                                    end if
-                                end if
-                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp
-                            end if
-
-                            #:if (NORM_DIR == 2)
-                                if (cyl_coord) then
-                                    !Substituting the advective flux into the inviscid geometrical source flux
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, E_idx
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
-                                    end do
-                                    ! Recalculating the radial momentum geometric source flux
-                                    flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) = &
-                                        flux_rs${XYZ}$_vf(j, k, l, contxe + 2) &
-                                        - (s_M*pres_R - s_P*pres_L)/(s_M - s_P)
-                                    ! Geometrical source of the void fraction(s) is zero
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = advxb, advxe
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
-                                    end do
-                                end if
-
-                                if (cyl_coord .and. hypoelasticity) then
-                                    ! += tau_sigmasigma using HLL
-                                    flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) = &
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) + &
-                                        (s_M*tau_e_R(4) - s_P*tau_e_L(4)) &
-                                        /(s_M - s_P)
-
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = strxb, strxe
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
-                                    end do
-                                end if
-                            #:endif
-
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-        #:endfor
-
-        if (viscous .or. dummy) then
-            if (weno_Re_flux) then
-
-                call s_compute_viscous_source_flux( &
-                    qL_prim_vf(momxb:momxe), &
-                    dqL_prim_dx_vf(momxb:momxe), &
-                    dqL_prim_dy_vf(momxb:momxe), &
-                    dqL_prim_dz_vf(momxb:momxe), &
-                    qR_prim_vf(momxb:momxe), &
-                    dqR_prim_dx_vf(momxb:momxe), &
-                    dqR_prim_dy_vf(momxb:momxe), &
-                    dqR_prim_dz_vf(momxb:momxe), &
-                    flux_src_vf, norm_dir, ix, iy, iz)
-            else
-                call s_compute_viscous_source_flux( &
-                    q_prim_vf(momxb:momxe), &
-                    dqL_prim_dx_vf(momxb:momxe), &
-                    dqL_prim_dy_vf(momxb:momxe), &
-                    dqL_prim_dz_vf(momxb:momxe), &
-                    q_prim_vf(momxb:momxe), &
-                    dqR_prim_dx_vf(momxb:momxe), &
-                    dqR_prim_dy_vf(momxb:momxe), &
-                    dqR_prim_dz_vf(momxb:momxe), &
-                    flux_src_vf, norm_dir, ix, iy, iz)
-            end if
-        end if
-
-        call s_finalize_riemann_solver(flux_vf, flux_src_vf, &
-                                       flux_gsrc_vf, &
-                                       norm_dir)
-
-    end subroutine s_hll_riemann_solver
-
-    !> @brief Computes intercell fluxes using the Lax-Friedrichs (LF) approximate Riemann solver.
-    subroutine s_lf_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
-                                   dqL_prim_dy_vf, &
-                                   dqL_prim_dz_vf, &
-                                   qL_prim_vf, &
-                                   qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
-                                   dqR_prim_dy_vf, &
-                                   dqR_prim_dz_vf, &
-                                   qR_prim_vf, &
-                                   q_prim_vf, &
-                                   flux_vf, flux_src_vf, &
-                                   flux_gsrc_vf, &
-                                   norm_dir, ix, iy, iz)
-
-        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
-
-        type(scalar_field), allocatable, dimension(:), intent(inout) :: qL_prim_vf, qR_prim_vf
-
-        type(scalar_field), &
-            allocatable, dimension(:), &
-            intent(inout) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
-                             dqL_prim_dy_vf, dqR_prim_dy_vf, &
-                             dqL_prim_dz_vf, dqR_prim_dz_vf
-
-        ! Intercell fluxes
-        type(scalar_field), &
-            dimension(sys_size), &
-            intent(inout) :: flux_vf, flux_src_vf, flux_gsrc_vf
-        real(wp) :: flux_tau_L, flux_tau_R
-
-        integer, intent(in) :: norm_dir
-        type(int_bounds_info), intent(in) :: ix, iy, iz
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
-            real(wp), dimension(3) :: vel_L, vel_R
-            real(wp), dimension(3) :: alpha_L, alpha_R
-            real(wp), dimension(10) :: Ys_L, Ys_R
-            real(wp), dimension(10) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
-            real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
-            real(wp), dimension(3, 3) :: vel_grad_L, vel_grad_R       !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
-        #:else
-            real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
-            real(wp), dimension(num_vels) :: vel_L, vel_R
-            real(wp), dimension(num_fluids) :: alpha_L, alpha_R
-            real(wp), dimension(num_species) :: Ys_L, Ys_R
-            real(wp), dimension(num_species) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
-            real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
-            real(wp), dimension(num_dims, num_dims) :: vel_grad_L, vel_grad_R       !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
-        #:endif
-        real(wp) :: rho_L, rho_R
-
-        real(wp) :: pres_L, pres_R
-        real(wp) :: E_L, E_R
-        real(wp) :: H_L, H_R
-        real(wp) :: Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi
-        real(wp) :: T_L, T_R
-        real(wp) :: Y_L, Y_R
-        real(wp) :: MW_L, MW_R
-        real(wp) :: R_gas_L, R_gas_R
-        real(wp) :: Cp_L, Cp_R
-        real(wp) :: Cv_L, Cv_R
-        real(wp) :: Gamm_L, Gamm_R
-        real(wp) :: gamma_L, gamma_R
-        real(wp) :: pi_inf_L, pi_inf_R
-        real(wp) :: qv_L, qv_R
-        real(wp) :: c_L, c_R
-        real(wp), dimension(6) :: tau_e_L, tau_e_R
-        real(wp) :: G_L, G_R
-        real(wp), dimension(2) :: Re_L, Re_R
-        real(wp), dimension(3) :: xi_field_L, xi_field_R
-
-        real(wp) :: rho_avg
-        real(wp) :: H_avg
-        real(wp) :: gamma_avg
-        real(wp) :: c_avg
-
-        real(wp) :: s_L, s_R, s_M, s_P, s_S
-        real(wp) :: xi_M, xi_P
-
-        real(wp) :: ptilde_L, ptilde_R
-        real(wp) :: vel_L_rms, vel_R_rms, vel_avg_rms
-        real(wp) :: vel_L_tmp, vel_R_tmp
-        real(wp) :: Ms_L, Ms_R, pres_SL, pres_SR
-        real(wp) :: alpha_L_sum, alpha_R_sum
-        real(wp) :: zcoef, pcorr !< low Mach number correction
-
-        type(riemann_states) :: c_fast, pres_mag
-        type(riemann_states_vec3) :: B
-
-        type(riemann_states) :: Ga ! Gamma (Lorentz factor)
-        type(riemann_states) :: vdotB, B2
-        type(riemann_states_vec3) :: b4 ! 4-magnetic field components (spatial: b4x, b4y, b4z)
-        type(riemann_states_vec3) :: cm ! Conservative momentum variables
-
-        integer :: i, j, k, l, q !< Generic loop iterators
-        integer, dimension(3) :: idx_right_phys                     !< Physical (j,k,l) indices for right state.
-
-        ! Populating the buffers of the left and right Riemann problem
-        ! states variables, based on the choice of boundary conditions
-        call s_populate_riemann_states_variables_buffers( &
-            qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
-            dqL_prim_dy_vf, &
-            dqL_prim_dz_vf, &
-            qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
-            dqR_prim_dy_vf, &
-            dqR_prim_dz_vf, &
-            norm_dir, ix, iy, iz)
-
-        ! Reshaping inputted data based on dimensional splitting direction
-        call s_initialize_riemann_solver( &
-            flux_src_vf, &
-            norm_dir)
-        #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
-
-            if (norm_dir == ${NORM_DIR}$) then
-                $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l, q, alpha_rho_L,alpha_rho_R,vel_L,vel_R,alpha_L,alpha_R,tau_e_L,tau_e_R,G_L,G_R,Re_L,Re_R,rho_avg,h_avg,gamma_avg,s_L,s_R,s_S,Ys_L,Ys_R,xi_field_L,xi_field_R,Cp_iL,Cp_iR,Xs_L,Xs_R,Gamma_iL,Gamma_iR,Yi_avg,Phi_avg,h_iL,h_iR,h_avg_2,c_fast,pres_mag,B,Ga,vdotB,B2,b4,cm,pcorr,zcoef,vel_grad_L,vel_grad_R,idx_right_phys,vel_L_rms,vel_R_rms,vel_avg_rms,vel_L_tmp,vel_R_tmp,Ms_L,Ms_R,pres_SL,pres_SR,alpha_L_sum,alpha_R_sum,c_avg,pres_L,pres_R,rho_L,rho_R,gamma_L,gamma_R,pi_inf_L,pi_inf_R,qv_L,qv_R,c_L,c_R,E_L,E_R,H_L,H_R,ptilde_L,ptilde_R,s_M,s_P,xi_M,xi_P,Cp_avg,Cv_avg,T_avg,eps,c_sum_Yi_Phi,Cp_L,Cp_R,Cv_L,Cv_R,R_gas_L,R_gas_R,MW_L,MW_R,T_L,T_R,Y_L,Y_R]')
-                do l = is3%beg, is3%end
-                    do k = is2%beg, is2%end
-                        do j = is1%beg, is1%end
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, contxe
-                                alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-                            end do
-
-                            vel_L_rms = 0._wp; vel_R_rms = 0._wp
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_vels
-                                vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
-                                vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
-                                vel_L_rms = vel_L_rms + vel_L(i)**2._wp
-                                vel_R_rms = vel_R_rms + vel_R(i)**2._wp
-                            end do
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
-                                alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
-                            end do
-
-                            pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
-                            pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
-
-                            if (mhd) then
-                                if (n == 0) then ! 1D: constant Bx; By, Bz as variables
-                                    B%L(1) = Bx0
-                                    B%R(1) = Bx0
-                                    B%L(2) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg)
-                                    B%R(2) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg)
-                                    B%L(3) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)
-                                    B%R(3) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)
-                                else ! 2D/3D: Bx, By, Bz as variables
-                                    B%L(1) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg)
-                                    B%R(1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg)
-                                    B%L(2) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)
-                                    B%R(2) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)
-                                    B%L(3) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 2)
-                                    B%R(3) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 2)
-                                end if
-                            end if
-
-                            rho_L = 0._wp
-                            gamma_L = 0._wp
-                            pi_inf_L = 0._wp
-                            qv_L = 0._wp
-
-                            rho_R = 0._wp
-                            gamma_R = 0._wp
-                            pi_inf_R = 0._wp
-                            qv_R = 0._wp
-
-                            alpha_L_sum = 0._wp
-                            alpha_R_sum = 0._wp
-
-                            pres_mag%L = 0._wp
-                            pres_mag%R = 0._wp
-
-                            if (mpp_lim) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    alpha_rho_L(i) = max(0._wp, alpha_rho_L(i))
-                                    alpha_L(i) = min(max(0._wp, alpha_L(i)), 1._wp)
-                                    alpha_L_sum = alpha_L_sum + alpha_L(i)
-                                    alpha_rho_R(i) = max(0._wp, alpha_rho_R(i))
-                                    alpha_R(i) = min(max(0._wp, alpha_R(i)), 1._wp)
-                                    alpha_R_sum = alpha_R_sum + alpha_R(i)
-                                end do
-
-                                alpha_L = alpha_L/max(alpha_L_sum, sgm_eps)
-                                alpha_R = alpha_R/max(alpha_R_sum, sgm_eps)
-                            end if
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                rho_L = rho_L + alpha_rho_L(i)
-                                gamma_L = gamma_L + alpha_L(i)*gammas(i)
-                                pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i)
-                                qv_L = qv_L + alpha_rho_L(i)*qvs(i)
-
-                                rho_R = rho_R + alpha_rho_R(i)
-                                gamma_R = gamma_R + alpha_R(i)*gammas(i)
-                                pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i)
-                                qv_R = qv_R + alpha_rho_R(i)*qvs(i)
-                            end do
-
-                            if (viscous) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, 2
-                                    Re_L(i) = dflt_real
-                                    Re_R(i) = dflt_real
-
-                                    if (Re_size(i) > 0) Re_L(i) = 0._wp
-                                    if (Re_size(i) > 0) Re_R(i) = 0._wp
-
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do q = 1, Re_size(i)
-                                        Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) &
-                                                  + Re_L(i)
-                                        Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) &
-                                                  + Re_R(i)
-                                    end do
-
-                                    Re_L(i) = 1._wp/max(Re_L(i), sgm_eps)
-                                    Re_R(i) = 1._wp/max(Re_R(i), sgm_eps)
-                                end do
-                            end if
-
-                            if (chemistry) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = chemxb, chemxe
-                                    Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                    Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-                                end do
-
-                                call get_mixture_molecular_weight(Ys_L, MW_L)
-                                call get_mixture_molecular_weight(Ys_R, MW_R)
-
-                                #:if USING_AMD
-                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
-                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
-                                #:else
-                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
-                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:)
-                                #:endif
-
-                                R_gas_L = gas_constant/MW_L
-                                R_gas_R = gas_constant/MW_R
-                                T_L = pres_L/rho_L/R_gas_L
-                                T_R = pres_R/rho_R/R_gas_R
-
-                                call get_species_specific_heats_r(T_L, Cp_iL)
-                                call get_species_specific_heats_r(T_R, Cp_iR)
-
-                                if (chem_params%gamma_method == 1) then
-                                    ! gamma_method = 1: Ref. Section 2.3.1 Formulation of doi:10.7907/ZKW8-ES97.
-                                    Gamma_iL = Cp_iL/(Cp_iL - 1.0_wp)
-                                    Gamma_iR = Cp_iR/(Cp_iR - 1.0_wp)
-
-                                    gamma_L = sum(Xs_L(:)/(Gamma_iL(:) - 1.0_wp))
-                                    gamma_R = sum(Xs_R(:)/(Gamma_iR(:) - 1.0_wp))
-                                else if (chem_params%gamma_method == 2) then
-                                    ! gamma_method = 2: c_p / c_v where c_p, c_v are specific heats.
-                                    call get_mixture_specific_heat_cp_mass(T_L, Ys_L, Cp_L)
-                                    call get_mixture_specific_heat_cp_mass(T_R, Ys_R, Cp_R)
-                                    call get_mixture_specific_heat_cv_mass(T_L, Ys_L, Cv_L)
-                                    call get_mixture_specific_heat_cv_mass(T_R, Ys_R, Cv_R)
-
-                                    Gamm_L = Cp_L/Cv_L
-                                    gamma_L = 1.0_wp/(Gamm_L - 1.0_wp)
-                                    Gamm_R = Cp_R/Cv_R
-                                    gamma_R = 1.0_wp/(Gamm_R - 1.0_wp)
-                                end if
-
-                                call get_mixture_energy_mass(T_L, Ys_L, E_L)
-                                call get_mixture_energy_mass(T_R, Ys_R, E_R)
-
-                                E_L = rho_L*E_L + 5.e-1*rho_L*vel_L_rms
-                                E_R = rho_R*E_R + 5.e-1*rho_R*vel_R_rms
-                                H_L = (E_L + pres_L)/rho_L
-                                H_R = (E_R + pres_R)/rho_R
-                            elseif (mhd .and. relativity) then
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                    Ga%L = 1._wp/sqrt(1._wp - vel_L_rms)
-                                    Ga%R = 1._wp/sqrt(1._wp - vel_R_rms)
-                                    vdotB%L = vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3)
-                                    vdotB%R = vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3)
-
-                                    b4%L(1:3) = B%L(1:3)/Ga%L + Ga%L*vel_L(1:3)*vdotB%L
-                                    b4%R(1:3) = B%R(1:3)/Ga%R + Ga%R*vel_R(1:3)*vdotB%R
-                                    B2%L = B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp
-                                    B2%R = B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp
-
-                                    pres_mag%L = 0.5_wp*(B2%L/Ga%L**2._wp + vdotB%L**2._wp)
-                                    pres_mag%R = 0.5_wp*(B2%R/Ga%R**2._wp + vdotB%R**2._wp)
-
-                                    ! Hard-coded EOS
-                                    H_L = 1._wp + (gamma_L + 1)*pres_L/rho_L
-                                    H_R = 1._wp + (gamma_R + 1)*pres_R/rho_R
-
-                                    cm%L(1:3) = (rho_L*H_L*Ga%L**2 + B2%L)*vel_L(1:3) - vdotB%L*B%L(1:3)
-                                    cm%R(1:3) = (rho_R*H_R*Ga%R**2 + B2%R)*vel_R(1:3) - vdotB%R*B%R(1:3)
-
-                                    E_L = rho_L*H_L*Ga%L**2 - pres_L + 0.5_wp*(B2%L + vel_L_rms*B2%L - vdotB%L**2._wp) - rho_L*Ga%L
-                                    E_R = rho_R*H_R*Ga%R**2 - pres_R + 0.5_wp*(B2%R + vel_R_rms*B2%R - vdotB%R**2._wp) - rho_R*Ga%R
-                                #:endif
-                            elseif (mhd .and. .not. relativity) then
-                                pres_mag%L = 0.5_wp*(B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp)
-                                pres_mag%R = 0.5_wp*(B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp)
-                                E_L = gamma_L*pres_L + pi_inf_L + 0.5_wp*rho_L*vel_L_rms + qv_L + pres_mag%L
-                                E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R + pres_mag%R ! includes magnetic energy
-                                H_L = (E_L + pres_L - pres_mag%L)/rho_L
-                                H_R = (E_R + pres_R - pres_mag%R)/rho_R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
-                            else
-                                E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L
-                                E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R
-                                H_L = (E_L + pres_L)/rho_L
-                                H_R = (E_R + pres_R)/rho_R
-                            end if
-
-                            ! elastic energy update
-                            if (hypoelasticity) then
-                                G_L = 0._wp; G_R = 0._wp
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    G_L = G_L + alpha_L(i)*Gs_rs(i)
-                                    G_R = G_R + alpha_R(i)*Gs_rs(i)
-                                end do
-
-                                if (cont_damage) then
-                                    G_L = G_L*max((1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, damage_idx)), 0._wp)
-                                    G_R = G_R*max((1._wp - qR_prim_rs${XYZ}$_vf(j, k, l, damage_idx)), 0._wp)
-                                end if
-
-                                do i = 1, strxe - strxb + 1
-                                    tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
-                                    tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
-                                    ! Elastic contribution to energy if G large enough
-                                    !TODO take out if statement if stable without
-                                    if ((G_L > 1000) .and. (G_R > 1000)) then
-                                        E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
-                                        E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
-                                        ! Double for shear stresses
-                                        if (any(strxb - 1 + i == shear_indices)) then
-                                            E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
-                                            E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
-                                        end if
-                                    end if
-                                end do
-                            end if
-
-                            call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, &
-                                                          vel_L_rms, 0._wp, c_L, qv_L)
-
-                            call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, &
-                                                          vel_R_rms, 0._wp, c_R, qv_R)
-
-                            if (mhd) then
-                                call s_compute_fast_magnetosonic_speed(rho_L, c_L, B%L, norm_dir, c_fast%L, H_L)
-                                call s_compute_fast_magnetosonic_speed(rho_R, c_R, B%R, norm_dir, c_fast%R, H_R)
-                            end if
-
-                            s_L = 0._wp; s_R = 0._wp
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_dims
-                                s_L = s_L + vel_L(i)**2._wp
-                                s_R = s_R + vel_R(i)**2._wp
-                            end do
-
-                            s_L = sqrt(s_L)
-                            s_R = sqrt(s_R)
-
-                            s_P = max(s_L, s_R) + max(c_L, c_R)
-                            s_M = -s_P
-
-                            s_L = s_M
-                            s_R = s_P
-
-                            ! Low Mach correction
-                            if (low_Mach == 1) then
-                                @:compute_low_Mach_correction()
-                            else
-                                pcorr = 0._wp
-                            end if
-
-                            ! Mass
-                            if (.not. relativity) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, contxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        (s_M*alpha_rho_R(i)*vel_R(norm_dir) &
-                                         - s_P*alpha_rho_L(i)*vel_L(norm_dir) &
-                                         + s_M*s_P*(alpha_rho_L(i) &
-                                                    - alpha_rho_R(i))) &
-                                        /(s_M - s_P)
-                                end do
-                            elseif (relativity) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, contxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        (s_M*Ga%R*alpha_rho_R(i)*vel_R(norm_dir) &
-                                         - s_P*Ga%L*alpha_rho_L(i)*vel_L(norm_dir) &
-                                         + s_M*s_P*(Ga%L*alpha_rho_L(i) &
-                                                    - Ga%R*alpha_rho_R(i))) &
-                                        /(s_M - s_P)
-                                end do
-                            end if
-
-                            ! Momentum
-                            if (mhd .and. (.not. relativity)) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, 3
-                                    ! Flux of rho*v_i in the ${XYZ}$ direction
-                                    ! = rho * v_i * v_${XYZ}$ - B_i * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + i) = &
-                                        (s_M*(rho_R*vel_R(i)*vel_R(norm_dir) &
-                                              - B%R(i)*B%R(norm_dir) &
-                                              + dir_flg(i)*(pres_R + pres_mag%R)) &
-                                         - s_P*(rho_L*vel_L(i)*vel_L(norm_dir) &
-                                                - B%L(i)*B%L(norm_dir) &
-                                                + dir_flg(i)*(pres_L + pres_mag%L)) &
-                                         + s_M*s_P*(rho_L*vel_L(i) - rho_R*vel_R(i))) &
-                                        /(s_M - s_P)
-                                end do
-                            elseif (mhd .and. relativity) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, 3
-                                    ! Flux of m_i in the ${XYZ}$ direction
-                                    ! = m_i * v_${XYZ}$ - b_i/Gamma * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + i) = &
-                                        (s_M*(cm%R(i)*vel_R(norm_dir) &
-                                              - b4%R(i)/Ga%R*B%R(norm_dir) &
-                                              + dir_flg(i)*(pres_R + pres_mag%R)) &
-                                         - s_P*(cm%L(i)*vel_L(norm_dir) &
-                                                - b4%L(i)/Ga%L*B%L(norm_dir) &
-                                                + dir_flg(i)*(pres_L + pres_mag%L)) &
-                                         + s_M*s_P*(cm%L(i) - cm%R(i))) &
-                                        /(s_M - s_P)
-                                end do
-                            elseif (bubbles_euler) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_vels
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
-                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
-                                              *vel_R(dir_idx(i)) &
-                                              + dir_flg(dir_idx(i))*(pres_R - ptilde_R)) &
-                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
-                                                *vel_L(dir_idx(i)) &
-                                                + dir_flg(dir_idx(i))*(pres_L - ptilde_L)) &
-                                         + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
-                                                    - rho_R*vel_R(dir_idx(i)))) &
-                                        /(s_M - s_P) &
-                                        + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
-                                end do
-                            else if (hypoelasticity) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_vels
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
-                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
-                                              *vel_R(dir_idx(i)) &
-                                              + dir_flg(dir_idx(i))*pres_R &
-                                              - tau_e_R(dir_idx_tau(i))) &
-                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
-                                                *vel_L(dir_idx(i)) &
-                                                + dir_flg(dir_idx(i))*pres_L &
-                                                - tau_e_L(dir_idx_tau(i))) &
-                                         + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
-                                                    - rho_R*vel_R(dir_idx(i)))) &
-                                        /(s_M - s_P)
-                                end do
-                            else
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_vels
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
-                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
-                                              *vel_R(dir_idx(i)) &
-                                              + dir_flg(dir_idx(i))*pres_R) &
-                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
-                                                *vel_L(dir_idx(i)) &
-                                                + dir_flg(dir_idx(i))*pres_L) &
-                                         + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
-                                                    - rho_R*vel_R(dir_idx(i)))) &
-                                        /(s_M - s_P) &
-                                        + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
-                                end do
-                            end if
-
-                            ! Energy
-                            if (mhd .and. (.not. relativity)) then
-                                ! energy flux = (E + p + p_mag) * v_${XYZ}$ - B_${XYZ}$ * (v_x*B_x + v_y*B_y + v_z*B_z)
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                    flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                        (s_M*(vel_R(norm_dir)*(E_R + pres_R + pres_mag%R) - B%R(norm_dir)*(vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3))) &
-                                         - s_P*(vel_L(norm_dir)*(E_L + pres_L + pres_mag%L) - B%L(norm_dir)*(vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3))) &
-                                         + s_M*s_P*(E_L - E_R)) &
-                                        /(s_M - s_P)
-                                #:endif
-                            elseif (mhd .and. relativity) then
-                                ! energy flux = m_${XYZ}$ - mass flux
-                                ! Hard-coded for single-component for now
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                    (s_M*(cm%R(norm_dir) - Ga%R*alpha_rho_R(1)*vel_R(norm_dir)) &
-                                     - s_P*(cm%L(norm_dir) - Ga%L*alpha_rho_L(1)*vel_L(norm_dir)) &
-                                     + s_M*s_P*(E_L - E_R)) &
-                                    /(s_M - s_P)
-                            else if (bubbles_euler) then
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                    (s_M*vel_R(dir_idx(1))*(E_R + pres_R - ptilde_R) &
-                                     - s_P*vel_L(dir_idx(1))*(E_L + pres_L - ptilde_L) &
-                                     + s_M*s_P*(E_L - E_R)) &
-                                    /(s_M - s_P) &
-                                    + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp
-                            else if (hypoelasticity) then
-                                flux_tau_L = 0._wp; flux_tau_R = 0._wp
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    flux_tau_L = flux_tau_L + tau_e_L(dir_idx_tau(i))*vel_L(dir_idx(i))
-                                    flux_tau_R = flux_tau_R + tau_e_R(dir_idx_tau(i))*vel_R(dir_idx(i))
-                                end do
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                    (s_M*(vel_R(dir_idx(1))*(E_R + pres_R) - flux_tau_R) &
-                                     - s_P*(vel_L(dir_idx(1))*(E_L + pres_L) - flux_tau_L) &
-                                     + s_M*s_P*(E_L - E_R))/(s_M - s_P)
-                            else
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                    (s_M*vel_R(dir_idx(1))*(E_R + pres_R) &
-                                     - s_P*vel_L(dir_idx(1))*(E_L + pres_L) &
-                                     + s_M*s_P*(E_L - E_R)) &
-                                    /(s_M - s_P) &
-                                    + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp
-                            end if
-
-                            ! Elastic Stresses
-                            if (hypoelasticity) then
-                                do i = 1, strxe - strxb + 1 !TODO: this indexing may be slow
-                                    flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
-                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
-                                              *tau_e_R(i)) &
-                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
-                                                *tau_e_L(i)) &
-                                         + s_M*s_P*(rho_L*tau_e_L(i) &
-                                                    - rho_R*tau_e_R(i))) &
-                                        /(s_M - s_P)
-                                end do
-                            end if
-
-                            ! Advection
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = advxb, advxe
-                                flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                    (qL_prim_rs${XYZ}$_vf(j, k, l, i) &
-                                     - qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)) &
-                                    *s_M*s_P/(s_M - s_P)
-                                flux_src_rs${XYZ}$_vf(j, k, l, i) = &
-                                    (s_M*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
-                                     - s_P*qL_prim_rs${XYZ}$_vf(j, k, l, i)) &
-                                    /(s_M - s_P)
-                            end do
-
-                            if (bubbles_euler) then
-                                ! From HLLC: Kills mass transport @ bubble gas density
-                                if (num_fluids > 1) then
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe) = 0._wp
-                                end if
-                            end if
-
-                            if (chemistry) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = chemxb, chemxe
-                                    Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                    Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = (s_M*Y_R*rho_R*vel_R(dir_idx(1)) &
-                                                                     - s_P*Y_L*rho_L*vel_L(dir_idx(1)) &
-                                                                     + s_M*s_P*(Y_L*rho_L - Y_R*rho_R)) &
-                                                                    /(s_M - s_P)
-                                    flux_src_rs${XYZ}$_vf(j, k, l, i) = 0._wp
-                                end do
-                            end if
-
-                            if (mhd) then
-                                if (n == 0) then ! 1D: d/dx flux only & Bx = Bx0 = const.
-                                    ! B_y flux = v_x * B_y - v_y * Bx0
-                                    ! B_z flux = v_x * B_z - v_z * Bx0
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 0, 1
-                                        flux_rsx_vf(j, k, l, B_idx%beg + i) = (s_M*(vel_R(1)*B%R(2 + i) - vel_R(2 + i)*Bx0) &
-                                                                               - s_P*(vel_L(1)*B%L(2 + i) - vel_L(2 + i)*Bx0) &
-                                                                               + s_M*s_P*(B%L(2 + i) - B%R(2 + i)))/(s_M - s_P)
-                                    end do
-                                else ! 2D/3D: Bx, By, Bz /= const. but zero flux component in the same direction
-                                    ! B_x d/d${XYZ}$ flux = (1 - delta(x,${XYZ}$)) * (v_${XYZ}$ * B_x - v_x * B_${XYZ}$)
-                                    ! B_y d/d${XYZ}$ flux = (1 - delta(y,${XYZ}$)) * (v_${XYZ}$ * B_y - v_y * B_${XYZ}$)
-                                    ! B_z d/d${XYZ}$ flux = (1 - delta(z,${XYZ}$)) * (v_${XYZ}$ * B_z - v_z * B_${XYZ}$)
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 0, 2
-                                        flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + i) = (1 - dir_flg(i + 1))*( &
-                                                                                    s_M*(vel_R(dir_idx(1))*B%R(i + 1) - vel_R(i + 1)*B%R(norm_dir)) - &
-                                                                                    s_P*(vel_L(dir_idx(1))*B%L(i + 1) - vel_L(i + 1)*B%L(norm_dir)) + &
-                                                                                    s_M*s_P*(B%L(i + 1) - B%R(i + 1)))/(s_M - s_P)
-                                    end do
-                                end if
-                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp
-                            end if
-
-                            #:if (NORM_DIR == 2)
-                                if (cyl_coord) then
-                                    !Substituting the advective flux into the inviscid geometrical source flux
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, E_idx
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
-                                    end do
-                                    ! Recalculating the radial momentum geometric source flux
-                                    flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) = &
-                                        flux_rs${XYZ}$_vf(j, k, l, contxe + 2) &
-                                        - (s_M*pres_R - s_P*pres_L)/(s_M - s_P)
-                                    ! Geometrical source of the void fraction(s) is zero
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = advxb, advxe
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
-                                    end do
-                                end if
-
-                                if (cyl_coord .and. hypoelasticity) then
-                                    ! += tau_sigmasigma using HLL
-                                    flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) = &
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) + &
-                                        (s_M*tau_e_R(4) - s_P*tau_e_L(4)) &
-                                        /(s_M - s_P)
-
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = strxb, strxe
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
-                                    end do
-                                end if
-                            #:endif
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-        #:endfor
-
-        if (viscous .or. dummy) then
-            $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l, idx_right_phys, vel_grad_L, vel_grad_R, alpha_L, alpha_R, vel_L, vel_R, Re_L, Re_R]', copyin='[norm_dir]')
-            do l = isz%beg, isz%end
-                do k = isy%beg, isy%end
-                    do j = isx%beg, isx%end
-                        idx_right_phys(1) = j
-                        idx_right_phys(2) = k
-                        idx_right_phys(3) = l
-                        idx_right_phys(norm_dir) = idx_right_phys(norm_dir) + 1
-
-                        if (norm_dir == 1) then
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                alpha_L(i) = qL_prim_rsx_vf(j, k, l, E_idx + i)
-                                alpha_R(i) = qR_prim_rsx_vf(j + 1, k, l, E_idx + i)
-                            end do
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_dims
-                                vel_L(i) = qL_prim_rsx_vf(j, k, l, momxb + i - 1)
-                                vel_R(i) = qR_prim_rsx_vf(j + 1, k, l, momxb + i - 1)
-                            end do
-                        else if (norm_dir == 2) then
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                alpha_L(i) = qL_prim_rsy_vf(k, j, l, E_idx + i)
-                                alpha_R(i) = qR_prim_rsy_vf(k + 1, j, l, E_idx + i)
-                            end do
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_dims
-                                vel_L(i) = qL_prim_rsy_vf(k, j, l, momxb + i - 1)
-                                vel_R(i) = qR_prim_rsy_vf(k + 1, j, l, momxb + i - 1)
-                            end do
-                        else
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                alpha_L(i) = qL_prim_rsz_vf(l, k, j, E_idx + i)
-                                alpha_R(i) = qR_prim_rsz_vf(l + 1, k, j, E_idx + i)
-                            end do
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_dims
-                                vel_L(i) = qL_prim_rsz_vf(l, k, j, momxb + i - 1)
-                                vel_R(i) = qR_prim_rsz_vf(l + 1, k, j, momxb + i - 1)
-                            end do
-                        end if
-
-                        $:GPU_LOOP(parallelism='[seq]')
-                        do i = 1, 2
-                            Re_L(i) = dflt_real
-                            Re_R(i) = dflt_real
-
-                            if (Re_size(i) > 0) Re_L(i) = 0._wp
-                            if (Re_size(i) > 0) Re_R(i) = 0._wp
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do q = 1, Re_size(i)
-                                Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) &
-                                          + Re_L(i)
-                                Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) &
-                                          + Re_R(i)
-                            end do
-
-                            Re_L(i) = 1._wp/max(Re_L(i), sgm_eps)
-                            Re_R(i) = 1._wp/max(Re_R(i), sgm_eps)
-                        end do
-
-                        if (shear_stress) then
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_dims
-                                vel_grad_L(i, 1) = (dqL_prim_dx_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
-                                vel_grad_R(i, 1) = (dqR_prim_dx_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                    if (num_dims > 1) then
-                                        vel_grad_L(i, 2) = (dqL_prim_dy_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
-                                        vel_grad_R(i, 2) = (dqR_prim_dy_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
-                                    end if
-                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                        if (num_dims > 2) then
-                                            vel_grad_L(i, 3) = (dqL_prim_dz_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
-                                            vel_grad_R(i, 3) = (dqR_prim_dz_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
-                                        end if
-                                    #:endif
-                                #:endif
-                            end do
-
-                            if (norm_dir == 1) then
-                                flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(1) + vel_grad_R(1, 1)*vel_R(1))
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                    if (num_dims > 1) then
-                                        flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
-                                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(1) + vel_grad_R(2, 2)*vel_R(1))
-
-                                        flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2) + vel_grad_R(1, 2)) - 0.5_wp*(vel_grad_L(2, 1) + vel_grad_R(2, 1))
-                                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2)*vel_L(2) + vel_grad_R(1, 2)*vel_R(2)) - 0.5_wp*(vel_grad_L(2, 1)*vel_L(2) + vel_grad_R(2, 1)*vel_R(2))
-                                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                            if (num_dims > 2) then
-                                                flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
-                                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(1) + vel_grad_R(3, 3)*vel_R(1))
-
-                                                flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3) + vel_grad_R(1, 3)) - 0.5_wp*(vel_grad_L(3, 1) + vel_grad_R(3, 1))
-                                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3)*vel_L(3) + vel_grad_R(1, 3)*vel_R(3)) - 0.5_wp*(vel_grad_L(3, 1)*vel_L(3) + vel_grad_R(3, 1)*vel_R(3))
-                                            end if
-                                        #:endif
-                                    end if
-                                #:endif
-
-                            else if (norm_dir == 2) then
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(2) + vel_grad_R(1, 1)*vel_R(2))
-
-                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(2) + vel_grad_R(2, 2)*vel_R(2))
-
-                                    flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2) + vel_grad_R(1, 2)) - 0.5_wp*(vel_grad_L(2, 1) + vel_grad_R(2, 1))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2)*vel_L(1) + vel_grad_R(1, 2)*vel_R(1)) - 0.5_wp*(vel_grad_L(2, 1)*vel_L(1) + vel_grad_R(2, 1)*vel_R(1))
-                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                        if (num_dims > 2) then
-                                            flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
-                                            flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(2) + vel_grad_R(3, 3)*vel_R(2))
-
-                                            flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3) + vel_grad_R(2, 3)) - 0.5_wp*(vel_grad_L(3, 2) + vel_grad_R(3, 2))
-                                            flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3)*vel_L(3) + vel_grad_R(2, 3)*vel_R(3)) - 0.5_wp*(vel_grad_L(3, 2)*vel_L(3) + vel_grad_R(3, 2)*vel_R(3))
-                                        end if
-                                    #:endif
-                                #:endif
-                            else
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(3) + vel_grad_R(1, 1)*vel_R(3))
-
-                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(3) + vel_grad_R(2, 2)*vel_R(3))
-
-                                    flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3) + vel_grad_R(1, 3)) - 0.5_wp*(vel_grad_L(3, 1) + vel_grad_R(3, 1))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3)*vel_L(1) + vel_grad_R(1, 3)*vel_R(1)) - 0.5_wp*(vel_grad_L(3, 1)*vel_L(1) + vel_grad_R(3, 1)*vel_R(1))
-
-                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(3) + vel_grad_R(3, 3)*vel_R(3))
-
-                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3) + vel_grad_R(2, 3)) - 0.5_wp*(vel_grad_L(3, 2) + vel_grad_R(3, 2))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3)*vel_L(2) + vel_grad_R(2, 3)*vel_R(2)) - 0.5_wp*(vel_grad_L(3, 2)*vel_L(2) + vel_grad_R(3, 2)*vel_R(2))
-                                #:endif
-                            end if
-                        end if
-
-                        if (bulk_stress) then
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_dims
-                                vel_grad_L(i, 1) = (dqL_prim_dx_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
-                                vel_grad_R(i, 1) = (dqR_prim_dx_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                    if (num_dims > 1) then
-                                        vel_grad_L(i, 2) = (dqL_prim_dy_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
-                                        vel_grad_R(i, 2) = (dqR_prim_dy_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
-                                    end if
-                                #:endif
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                    if (num_dims > 2) then
-                                        vel_grad_L(i, 3) = (dqL_prim_dz_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
-                                        vel_grad_R(i, 3) = (dqR_prim_dz_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
-                                    end if
-                                #:endif
-                            end do
-
-                            if (norm_dir == 1) then
-                                flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(1) + vel_grad_R(1, 1)*vel_R(1))
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                    if (num_dims > 1) then
-                                        flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
-                                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(1) + vel_grad_R(2, 2)*vel_R(1))
-
-                                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                            if (num_dims > 2) then
-                                                flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
-                                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(1) + vel_grad_R(3, 3)*vel_R(1))
-                                            end if
-                                        #:endif
-                                    end if
-                                #:endif
-
-                            else if (norm_dir == 2) then
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(2) + vel_grad_R(1, 1)*vel_R(2))
-
-                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(2) + vel_grad_R(2, 2)*vel_R(2))
-
-                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                        if (num_dims > 2) then
-                                            flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
-                                            flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(2) + vel_grad_R(3, 3)*vel_R(2))
-                                        end if
-                                    #:endif
-                                #:endif
-                            else
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(3) + vel_grad_R(1, 1)*vel_R(3))
-
-                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(3) + vel_grad_R(2, 2)*vel_R(3))
-
-                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(3) + vel_grad_R(3, 3)*vel_R(3))
-                                #:endif
-                            end if
-
-                        end if
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-        end if
-
-        call s_finalize_riemann_solver(flux_vf, flux_src_vf, &
-                                       flux_gsrc_vf, &
-                                       norm_dir)
-
-    end subroutine s_lf_riemann_solver
-
-    !> This procedure is the implementation of the Harten, Lax,
-        !!      van Leer, and contact (HLLC) approximate Riemann solver,
-        !!      see Toro (1999) and Johnsen (2007). The viscous and the
-        !!      surface tension effects have been included by modifying
-        !!      the exact Riemann solver of Perigaud and Saurel (2005).
-        !!  @param qL_prim_rsx_vf Left WENO-reconstructed cell-boundary values (x-dir)
-        !!  @param qL_prim_rsy_vf Left WENO-reconstructed cell-boundary values (y-dir)
-        !!  @param qL_prim_rsz_vf Left WENO-reconstructed cell-boundary values (z-dir)
-        !!  @param dqL_prim_dx_vf The left WENO-reconstructed cell-boundary values of the
-        !!      first-order x-dir spatial derivatives
-        !!  @param dqL_prim_dy_vf The left WENO-reconstructed cell-boundary values of the
-        !!      first-order y-dir spatial derivatives
-        !!  @param dqL_prim_dz_vf The left WENO-reconstructed cell-boundary values of the
-        !!      first-order z-dir spatial derivatives
-        !!  @param qL_prim_vf The left WENO-reconstructed cell-boundary values of the
-        !!      cell-average primitive variables
-        !!  @param qR_prim_rsx_vf Right WENO-reconstructed cell-boundary values (x-dir)
-        !!  @param qR_prim_rsy_vf Right WENO-reconstructed cell-boundary values (y-dir)
-        !!  @param qR_prim_rsz_vf Right WENO-reconstructed cell-boundary values (z-dir)
-        !!  @param dqR_prim_dx_vf The right WENO-reconstructed cell-boundary values of the
-        !!      first-order x-dir spatial derivatives
-        !!  @param dqR_prim_dy_vf The right WENO-reconstructed cell-boundary values of the
-        !!      first-order y-dir spatial derivatives
-        !!  @param dqR_prim_dz_vf The right WENO-reconstructed cell-boundary values of the
-        !!      first-order z-dir spatial derivatives
-        !!  @param qR_prim_vf The right WENO-reconstructed cell-boundary values of the
-        !!      cell-average primitive variables
-        !!  @param q_prim_vf Cell-averaged primitive variables
-        !!  @param flux_vf Intra-cell fluxes
-        !!  @param flux_src_vf Intra-cell fluxes sources
-        !!  @param flux_gsrc_vf Intra-cell geometric fluxes sources
-        !!  @param norm_dir Dir. splitting direction
-        !!  @param ix Index bounds in the x-dir
-        !!  @param iy Index bounds in the y-dir
-        !!  @param iz Index bounds in the z-dir
-    subroutine s_hllc_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
-                                     dqL_prim_dy_vf, &
-                                     dqL_prim_dz_vf, &
-                                     qL_prim_vf, &
-                                     qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
-                                     dqR_prim_dy_vf, &
-                                     dqR_prim_dz_vf, &
-                                     qR_prim_vf, &
-                                     q_prim_vf, &
-                                     flux_vf, flux_src_vf, &
-                                     flux_gsrc_vf, &
-                                     norm_dir, ix, iy, iz)
-
-        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
-        type(scalar_field), allocatable, dimension(:), intent(inout) :: qL_prim_vf, qR_prim_vf
-
-        type(scalar_field), &
-            allocatable, dimension(:), &
-            intent(inout) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
-                             dqL_prim_dy_vf, dqR_prim_dy_vf, &
-                             dqL_prim_dz_vf, dqR_prim_dz_vf
-
-        ! Intercell fluxes
-        type(scalar_field), &
-            dimension(sys_size), &
-            intent(inout) :: flux_vf, flux_src_vf, flux_gsrc_vf
-
-        integer, intent(in) :: norm_dir
-        type(int_bounds_info), intent(in) :: ix, iy, iz
-
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
-            real(wp), dimension(3) :: alpha_L, alpha_R
-            real(wp), dimension(3) :: vel_L, vel_R
-        #:else
-            real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
-            real(wp), dimension(num_fluids) :: alpha_L, alpha_R
-            real(wp), dimension(num_dims) :: vel_L, vel_R
-        #:endif
-
-        real(wp) :: rho_L, rho_R
-        real(wp) :: pres_L, pres_R
-        real(wp) :: E_L, E_R
-        real(wp) :: H_L, H_R
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(10) :: Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR
-            real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
-        #:else
-            real(wp), dimension(num_species) :: Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR
-            real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
-        #:endif
-        real(wp) :: Cp_avg, Cv_avg, T_avg, c_sum_Yi_Phi, eps
-        real(wp) :: T_L, T_R
-        real(wp) :: MW_L, MW_R
-        real(wp) :: R_gas_L, R_gas_R
-        real(wp) :: Cp_L, Cp_R
-        real(wp) :: Cv_L, Cv_R
-        real(wp) :: Gamm_L, Gamm_R
-        real(wp) :: Y_L, Y_R
-        real(wp) :: gamma_L, gamma_R
-        real(wp) :: pi_inf_L, pi_inf_R
-        real(wp) :: qv_L, qv_R
-        real(wp) :: c_L, c_R
-        real(wp), dimension(2) :: Re_L, Re_R
-
-        real(wp) :: rho_avg
-        real(wp) :: H_avg
-        real(wp) :: gamma_avg
-        real(wp) :: qv_avg
-        real(wp) :: c_avg
-
-        real(wp) :: s_L, s_R, s_M, s_P, s_S
-        real(wp) :: xi_L, xi_R !< Left and right wave speeds functions
-        real(wp) :: xi_M, xi_P
-        real(wp) :: xi_MP, xi_PP
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3) :: R0_L, R0_R
-            real(wp), dimension(3) :: V0_L, V0_R
-            real(wp), dimension(3) :: P0_L, P0_R
-            real(wp), dimension(3) :: pbw_L, pbw_R
-        #:else
-            real(wp), dimension(nb) :: R0_L, R0_R
-            real(wp), dimension(nb) :: V0_L, V0_R
-            real(wp), dimension(nb) :: P0_L, P0_R
-            real(wp), dimension(nb) :: pbw_L, pbw_R
-        #:endif
-
-        real(wp) :: alpha_L_sum, alpha_R_sum, nbub_L, nbub_R
-        real(wp) :: ptilde_L, ptilde_R
-
-        real(wp) :: PbwR3Lbar, PbwR3Rbar
-        real(wp) :: R3Lbar, R3Rbar
-        real(wp) :: R3V2Lbar, R3V2Rbar
-
-        real(wp), dimension(6) :: tau_e_L, tau_e_R
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3) :: xi_field_L, xi_field_R
-        #:else
-            real(wp), dimension(num_dims) :: xi_field_L, xi_field_R
-        #:endif
-        real(wp) :: G_L, G_R
-
-        real(wp) :: vel_L_rms, vel_R_rms, vel_avg_rms
-        real(wp) :: vel_L_tmp, vel_R_tmp
-        real(wp) :: rho_Star, E_Star, p_Star, p_K_Star, vel_K_star
-        real(wp) :: pres_SL, pres_SR, Ms_L, Ms_R
-        real(wp) :: flux_ene_e
-        real(wp) :: zcoef, pcorr !< low Mach number correction
-
-        integer :: Re_max, i, j, k, l, q !< Generic loop iterators
-
-        ! Populating the buffers of the left and right Riemann problem
-        ! states variables, based on the choice of boundary conditions
-
-        call s_populate_riemann_states_variables_buffers( &
-            qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
-            dqL_prim_dy_vf, &
-            dqL_prim_dz_vf, &
-            qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
-            dqR_prim_dy_vf, &
-            dqR_prim_dz_vf, &
-            norm_dir, ix, iy, iz)
-
-        ! Reshaping inputted data based on dimensional splitting direction
-
-        call s_initialize_riemann_solver( &
-            flux_src_vf, &
-            norm_dir)
-
-        #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
-
-            if (norm_dir == ${NORM_DIR}$) then
-
-                ! 6-EQUATION MODEL WITH HLLC
-                if (model_eqns == 3) then
-                    !ME3
-                    $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l, q, vel_L, vel_R, Re_L, Re_R, alpha_L, alpha_R, Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, tau_e_L, tau_e_R, flux_ene_e, xi_field_L, xi_field_R, pcorr, zcoef, rho_L, rho_R, pres_L, pres_R, E_L, E_R, H_L, H_R, Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi, T_L, T_R, Y_L, Y_R, MW_L, MW_R, R_gas_L, R_gas_R, Cp_L, Cp_R, Cv_L, Cv_R, Gamm_L, Gamm_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, qv_L, qv_R, qv_avg, c_L, c_R, G_L, G_R, rho_avg, H_avg, c_avg, gamma_avg, ptilde_L, ptilde_R, vel_L_rms, vel_R_rms, vel_avg_rms, vel_L_tmp, vel_R_tmp, Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, alpha_R_sum, rho_Star, E_Star, p_Star, p_K_Star, vel_K_star, s_L, s_R, s_M, s_P, s_S, xi_M, xi_P, xi_L, xi_R, xi_MP, xi_PP]')
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            do j = is1%beg, is1%end
-
-                                vel_L_rms = 0._wp; vel_R_rms = 0._wp
-                                rho_L = 0._wp; rho_R = 0._wp
-                                gamma_L = 0._wp; gamma_R = 0._wp
-                                pi_inf_L = 0._wp; pi_inf_R = 0._wp
-                                qv_L = 0._wp; qv_R = 0._wp
-                                alpha_L_sum = 0._wp; alpha_R_sum = 0._wp
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
-                                    vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
-                                    vel_L_rms = vel_L_rms + vel_L(i)**2._wp
-                                    vel_R_rms = vel_R_rms + vel_R(i)**2._wp
-                                end do
-
-                                pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
-                                pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
-
-                                rho_L = 0._wp
-                                gamma_L = 0._wp
-                                pi_inf_L = 0._wp
-                                qv_L = 0._wp
-
-                                rho_R = 0._wp
-                                gamma_R = 0._wp
-                                pi_inf_R = 0._wp
-                                qv_R = 0._wp
-
-                                alpha_L_sum = 0._wp
-                                alpha_R_sum = 0._wp
-
-                                if (mpp_lim) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i))
-                                        qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp)
-                                        alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
-                                    end do
-
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i))
-                                        qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp)
-                                        alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
-                                    end do
-
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps)
-                                        qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps)
-                                    end do
-                                end if
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                    gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
-                                    pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i)
-                                    qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i)
-
-                                    rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-                                    gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
-                                    pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i)
-                                    qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i)
-
-                                    alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, advxb + i - 1)
-                                    alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, advxb + i - 1)
-                                end do
-
-                                if (viscous) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, 2
-                                        Re_L(i) = dflt_real
-                                        Re_R(i) = dflt_real
-                                        if (Re_size(i) > 0) Re_L(i) = 0._wp
-                                        if (Re_size(i) > 0) Re_R(i) = 0._wp
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do q = 1, Re_size(i)
-                                            Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res_gs(i, q) &
-                                                      + Re_L(i)
-                                            Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res_gs(i, q) &
-                                                      + Re_R(i)
-                                        end do
-                                        Re_L(i) = 1._wp/max(Re_L(i), sgm_eps)
-                                        Re_R(i) = 1._wp/max(Re_R(i), sgm_eps)
-                                    end do
-                                end if
-
-                                E_L = gamma_L*pres_L + pi_inf_L + 5.e-1_wp*rho_L*vel_L_rms + qv_L
-                                E_R = gamma_R*pres_R + pi_inf_R + 5.e-1_wp*rho_R*vel_R_rms + qv_R
-
-                                ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY
-                                if (hypoelasticity) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, strxe - strxb + 1
-                                        tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
-                                        tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
-                                    end do
-                                    G_L = 0._wp; G_R = 0._wp
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        G_L = G_L + alpha_L(i)*Gs_rs(i)
-                                        G_R = G_R + alpha_R(i)*Gs_rs(i)
-                                    end do
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, strxe - strxb + 1
-                                        ! Elastic contribution to energy if G large enough
-                                        if ((G_L > verysmall) .and. (G_R > verysmall)) then
-                                            E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
-                                            E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
-                                            ! Additional terms in 2D and 3D
-                                            if ((i == 2) .or. (i == 4) .or. (i == 5)) then
-                                                E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
-                                                E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
-                                            end if
-                                        end if
-                                    end do
-                                end if
-
-                                ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY
-                                if (hyperelasticity) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_dims
-                                        xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
-                                        xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
-                                    end do
-                                    G_L = 0._wp; G_R = 0._wp; 
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        ! Mixture left and right shear modulus
-                                        G_L = G_L + alpha_L(i)*Gs_rs(i)
-                                        G_R = G_R + alpha_R(i)*Gs_rs(i)
-                                    end do
-                                    ! Elastic contribution to energy if G large enough
-                                    if (G_L > verysmall .and. G_R > verysmall) then
-                                        E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
-                                        E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
-                                    end if
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, b_size - 1
-                                        tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
-                                        tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
-                                    end do
-                                end if
-
-                                H_L = (E_L + pres_L)/rho_L
-                                H_R = (E_R + pres_R)/rho_R
-
-                                @:compute_average_state()
-
-                                call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, &
-                                                              vel_L_rms, 0._wp, c_L, qv_L)
-
-                                call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, &
-                                                              vel_R_rms, 0._wp, c_R, qv_R)
-
-                                !> The computation of c_avg does not require all the variables, and therefore the non '_avg'
-                                ! variables are placeholders to call the subroutine.
-                                call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, &
-                                                              vel_avg_rms, 0._wp, c_avg, qv_avg)
-
-                                if (viscous) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, 2
-                                        Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
-                                    end do
-                                end if
-
-                                ! Low Mach correction
-                                if (low_Mach == 2) then
-                                    @:compute_low_Mach_correction()
-                                end if
-
-                                ! COMPUTING THE DIRECT WAVE SPEEDS
-                                if (wave_speeds == 1) then
-                                    if (elasticity) then
-                                        s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + &
-                                                                           (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L), vel_R(dir_idx(1)) - sqrt(c_R*c_R + &
-                                                                                                                                                            (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R))
-                                        s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + &
-                                                                           (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R), vel_L(dir_idx(1)) + sqrt(c_L*c_L + &
-                                                                                                                                                            (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L))
-                                        s_S = (pres_R - tau_e_R(dir_idx_tau(1)) - pres_L + &
-                                               tau_e_L(dir_idx_tau(1)) + rho_L*vel_L(dir_idx(1))*(s_L - vel_L(dir_idx(1))) - &
-                                               rho_R*vel_R(dir_idx(1))*(s_R - vel_R(dir_idx(1))))/(rho_L*(s_L - vel_L(dir_idx(1))) - &
-                                                                                                   rho_R*(s_R - vel_R(dir_idx(1))))
-                                    else
-                                        s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R)
-                                        s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L)
-                                        s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* &
-                                               (s_L - vel_L(dir_idx(1))) - rho_R*vel_R(dir_idx(1))*(s_R - vel_R(dir_idx(1)))) &
-                                              /(rho_L*(s_L - vel_L(dir_idx(1))) - rho_R*(s_R - vel_R(dir_idx(1))))
-
-                                    end if
-                                elseif (wave_speeds == 2) then
-                                    pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* &
-                                                        (vel_L(dir_idx(1)) - &
-                                                         vel_R(dir_idx(1))))
-
-                                    pres_SR = pres_SL
-
-                                    Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* &
-                                                           (pres_SL/pres_L - 1._wp)*pres_L/ &
-                                                           ((pres_L + pi_inf_L/(1._wp + gamma_L)))))
-                                    Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* &
-                                                           (pres_SR/pres_R - 1._wp)*pres_R/ &
-                                                           ((pres_R + pi_inf_R/(1._wp + gamma_R)))))
-
-                                    s_L = vel_L(dir_idx(1)) - c_L*Ms_L
-                                    s_R = vel_R(dir_idx(1)) + c_R*Ms_R
-
-                                    s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + &
-                                                    (pres_L - pres_R)/ &
-                                                    (rho_avg*c_avg))
-                                end if
-
-                                ! follows Einfeldt et al.
-                                ! s_M/P = min/max(0.,s_L/R)
-                                s_M = min(0._wp, s_L); s_P = max(0._wp, s_R)
-
-                                ! goes with q_star_L/R = xi_L/R * (variable)
-                                ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) )
-                                xi_L = (s_L - vel_L(dir_idx(1)))/(s_L - s_S)
-                                xi_R = (s_R - vel_R(dir_idx(1)))/(s_R - s_S)
-
-                                ! goes with numerical star velocity in x/y/z directions
-                                ! xi_P/M = 0.5 +/m sgn(0.5,s_star)
-                                xi_M = (5.e-1_wp + sign(0.5_wp, s_S))
-                                xi_P = (5.e-1_wp - sign(0.5_wp, s_S))
-
-                                ! goes with the numerical velocity in x/y/z directions
-                                ! xi_P/M (pressure) = min/max(0. sgn(1,sL/sR))
-                                xi_MP = -min(0._wp, sign(1._wp, s_L))
-                                xi_PP = max(0._wp, sign(1._wp, s_R))
-
-                                E_star = xi_M*(E_L + xi_MP*(xi_L*(E_L + (s_S - vel_L(dir_idx(1)))* &
-                                                                  (rho_L*s_S + pres_L/(s_L - vel_L(dir_idx(1))))) - E_L)) + &
-                                         xi_P*(E_R + xi_PP*(xi_R*(E_R + (s_S - vel_R(dir_idx(1)))* &
-                                                                  (rho_R*s_S + pres_R/(s_R - vel_R(dir_idx(1))))) - E_R))
-                                p_Star = xi_M*(pres_L + xi_MP*(rho_L*(s_L - vel_L(dir_idx(1)))*(s_S - vel_L(dir_idx(1))))) + &
-                                         xi_P*(pres_R + xi_PP*(rho_R*(s_R - vel_R(dir_idx(1)))*(s_S - vel_R(dir_idx(1)))))
-
-                                rho_Star = xi_M*(rho_L*(xi_MP*xi_L + 1._wp - xi_MP)) + &
-                                           xi_P*(rho_R*(xi_PP*xi_R + 1._wp - xi_PP))
-
-                                vel_K_Star = vel_L(dir_idx(1))*(1._wp - xi_MP) + xi_MP*vel_R(dir_idx(1)) + &
-                                             xi_MP*xi_PP*(s_S - vel_R(dir_idx(1)))
-
-                                ! Low Mach correction
-                                if (low_Mach == 1) then
-                                    @:compute_low_Mach_correction()
-                                else
-                                    pcorr = 0._wp
-                                end if
-
-                                ! COMPUTING FLUXES
-                                ! MASS FLUX.
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, contxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) + &
-                                        xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                end do
-
-                                ! MOMENTUM FLUX.
-                                ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = rho_Star*vel_K_Star* &
-                                                                                      (dir_flg(dir_idx(i))*vel_K_Star + (1._wp - dir_flg(dir_idx(i)))*(xi_M*vel_L(dir_idx(i)) + xi_P*vel_R(dir_idx(i)))) + dir_flg(dir_idx(i))*p_Star &
-                                                                                      + (s_M/s_L)*(s_P/s_R)*dir_flg(dir_idx(i))*pcorr
-                                end do
-
-                                ! ENERGY FLUX.
-                                ! f = u*(E-\sigma), q = E, q_star = \xi*E+(s-u)(\rho s_star - \sigma/(s-u))
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = (E_star + p_Star)*vel_K_Star &
-                                                                    + (s_M/s_L)*(s_P/s_R)*pcorr*s_S
-
-                                ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux
-                                if (elasticity) then
-                                    flux_ene_e = 0._wp; 
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_dims
-                                        ! MOMENTUM ELASTIC FLUX.
-                                        flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
-                                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) &
-                                            - xi_M*tau_e_L(dir_idx_tau(i)) - xi_P*tau_e_R(dir_idx_tau(i))
-                                        ! ENERGY ELASTIC FLUX.
-                                        flux_ene_e = flux_ene_e - &
-                                                     xi_M*(vel_L(dir_idx(i))*tau_e_L(dir_idx_tau(i)) + &
-                                                           s_M*(xi_L*((s_S - vel_L(i))*(tau_e_L(dir_idx_tau(i))/(s_L - vel_L(i)))))) - &
-                                                     xi_P*(vel_R(dir_idx(i))*tau_e_R(dir_idx_tau(i)) + &
-                                                           s_P*(xi_R*((s_S - vel_R(i))*(tau_e_R(dir_idx_tau(i))/(s_R - vel_R(i))))))
-                                    end do
-                                    flux_rs${XYZ}$_vf(j, k, l, E_idx) = flux_rs${XYZ}$_vf(j, k, l, E_idx) + flux_ene_e
-                                end if
-
-                                ! VOLUME FRACTION FLUX.
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = advxb, advxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*s_S + &
-                                        xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*s_S
-                                end do
-
-                                ! SOURCE TERM FOR VOLUME FRACTION ADVECTION FLUX.
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = &
-                                        xi_M*(vel_L(dir_idx(i)) + dir_flg(dir_idx(i))*(s_S*(xi_MP*(xi_L - 1) + 1) - vel_L(dir_idx(i)))) + &
-                                        xi_P*(vel_R(dir_idx(i)) + dir_flg(dir_idx(i))*(s_S*(xi_PP*(xi_R - 1) + 1) - vel_R(dir_idx(i))))
-                                end do
-
-                                ! INTERNAL ENERGIES ADVECTION FLUX.
-                                ! K-th pressure and velocity in preparation for the internal energy flux
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    p_K_Star = xi_M*(xi_MP*((pres_L + pi_infs(i)/(1._wp + gammas(i)))* &
-                                                            xi_L**(1._wp/gammas(i) + 1._wp) - pi_infs(i)/(1._wp + gammas(i)) - pres_L) + pres_L) + &
-                                               xi_P*(xi_PP*((pres_R + pi_infs(i)/(1._wp + gammas(i)))* &
-                                                            xi_R**(1._wp/gammas(i) + 1._wp) - pi_infs(i)/(1._wp + gammas(i)) - pres_R) + pres_R)
-
-                                    flux_rs${XYZ}$_vf(j, k, l, i + intxb - 1) = &
-                                        ((xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i + advxb - 1) + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i + advxb - 1))* &
-                                         (gammas(i)*p_K_Star + pi_infs(i)) + &
-                                         (xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i + contxb - 1) + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i + contxb - 1))* &
-                                         qvs(i))*vel_K_Star &
-                                        + (s_M/s_L)*(s_P/s_R)*pcorr*s_S*(xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i + advxb - 1) + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i + advxb - 1))
-                                end do
-
-                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1))
-
-                                ! HYPOELASTIC STRESS EVOLUTION FLUX.
-                                if (hypoelasticity) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, strxe - strxb + 1
-                                        flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
-                                            xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(dir_idx(1))*tau_e_L(i)) + &
-                                            xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*tau_e_R(i) - rho_R*vel_R(dir_idx(1))*tau_e_R(i))
-                                    end do
-                                end if
-
-                                ! REFERENCE MAP FLUX.
-                                if (hyperelasticity) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_dims
-                                        flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = &
-                                            xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) &
-                                                                    - rho_L*vel_L(dir_idx(1))*xi_field_L(i)) + &
-                                            xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*xi_field_R(i) &
-                                                                    - rho_R*vel_R(dir_idx(1))*xi_field_R(i))
-                                    end do
-                                end if
-
-                                ! COLOR FUNCTION FLUX
-                                if (surface_tension) then
-                                    flux_rs${XYZ}$_vf(j, k, l, c_idx) = &
-                                        (xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, c_idx) + &
-                                         xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, c_idx))*s_S
-                                end if
-
-                                ! Geometrical source flux for cylindrical coordinates
-                                #:if (NORM_DIR == 2)
-                                    if (cyl_coord) then
-                                        !Substituting the advective flux into the inviscid geometrical source flux
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = 1, E_idx
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
-                                        end do
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = intxb, intxe
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
-                                        end do
-                                        ! Recalculating the radial momentum geometric source flux
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) = &
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) - p_Star
-                                        ! Geometrical source of the void fraction(s) is zero
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = advxb, advxe
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
-                                        end do
-                                    end if
-                                #:endif
-                                #:if (NORM_DIR == 3)
-                                    if (grid_geometry == 3) then
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = 1, sys_size
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
-                                        end do
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) = &
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) - p_Star
-
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1)
-                                    end if
-                                #:endif
-
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                elseif (model_eqns == 4) then
-                    !ME4
-                    $:GPU_PARALLEL_LOOP(collapse=3, private='[i, q, alpha_rho_L, alpha_rho_R, vel_L, vel_R, alpha_L, alpha_R, nbub_L, nbub_R, rho_L, rho_R, pres_L, pres_R, E_L, E_R, H_L, H_R, Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi, T_L, T_R, Y_L, Y_R, MW_L, MW_R, R_gas_L, R_gas_R, Cp_L, Cp_R, Gamm_L, Gamm_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, qv_L, qv_R, qv_avg,c_L, c_R, G_L, G_R, rho_avg, H_avg, c_avg, gamma_avg, ptilde_L, ptilde_R, vel_L_rms, vel_R_rms, vel_avg_rms, vel_L_tmp, vel_R_tmp, Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, alpha_R_sum, rho_Star, E_Star, p_Star, p_K_Star, vel_K_star, s_L, s_R, s_M, s_P, s_S, xi_M, xi_P, xi_L, xi_R, xi_MP, xi_PP]')
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            do j = is1%beg, is1%end
-
-                                vel_L_rms = 0._wp; vel_R_rms = 0._wp
-                                rho_L = 0._wp; rho_R = 0._wp
-                                gamma_L = 0._wp; gamma_R = 0._wp
-                                pi_inf_L = 0._wp; pi_inf_R = 0._wp
-                                qv_L = 0._wp; qv_R = 0._wp
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, contxe
-                                    alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                    alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-                                end do
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
-                                    vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
-                                    vel_L_rms = vel_L_rms + vel_L(i)**2._wp
-                                    vel_R_rms = vel_R_rms + vel_R(i)**2._wp
-                                end do
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
-                                    alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
-                                end do
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
-                                    alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
-                                end do
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    rho_L = rho_L + alpha_rho_L(i)
-                                    gamma_L = gamma_L + alpha_L(i)*gammas(i)
-                                    pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i)
-                                    qv_L = qv_L + alpha_rho_L(i)*qvs(i)
-
-                                    rho_R = rho_R + alpha_rho_R(i)
-                                    gamma_R = gamma_R + alpha_R(i)*gammas(i)
-                                    pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i)
-                                    qv_R = qv_R + alpha_rho_R(i)*qvs(i)
-                                end do
-
-                                pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
-                                pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
-
-                                E_L = gamma_L*pres_L + pi_inf_L + 5.e-1_wp*rho_L*vel_L_rms + qv_L
-                                E_R = gamma_R*pres_R + pi_inf_R + 5.e-1_wp*rho_R*vel_R_rms + qv_R
-
-                                H_L = (E_L + pres_L)/rho_L
-                                H_R = (E_R + pres_R)/rho_R
-
-                                @:compute_average_state()
-
-                                call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, &
-                                                              vel_L_rms, 0._wp, c_L, qv_L)
-
-                                call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, &
-                                                              vel_R_rms, 0._wp, c_R, qv_R)
-
-                                !> The computation of c_avg does not require all the variables, and therefore the non '_avg'
-                                ! variables are placeholders to call the subroutine.
-
-                                call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, &
-                                                              vel_avg_rms, 0._wp, c_avg, qv_avg)
-
-                                if (wave_speeds == 1) then
-                                    s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R)
-                                    s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L)
-
-                                    s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* &
-                                           (s_L - vel_L(dir_idx(1))) - &
-                                           rho_R*vel_R(dir_idx(1))* &
-                                           (s_R - vel_R(dir_idx(1)))) &
-                                          /(rho_L*(s_L - vel_L(dir_idx(1))) - &
-                                            rho_R*(s_R - vel_R(dir_idx(1))))
-                                elseif (wave_speeds == 2) then
-                                    pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* &
-                                                        (vel_L(dir_idx(1)) - &
-                                                         vel_R(dir_idx(1))))
-
-                                    pres_SR = pres_SL
-
-                                    Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* &
-                                                           (pres_SL/pres_L - 1._wp)*pres_L/ &
-                                                           ((pres_L + pi_inf_L/(1._wp + gamma_L)))))
-                                    Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* &
-                                                           (pres_SR/pres_R - 1._wp)*pres_R/ &
-                                                           ((pres_R + pi_inf_R/(1._wp + gamma_R)))))
-
-                                    s_L = vel_L(dir_idx(1)) - c_L*Ms_L
-                                    s_R = vel_R(dir_idx(1)) + c_R*Ms_R
-
-                                    s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + &
-                                                    (pres_L - pres_R)/ &
-                                                    (rho_avg*c_avg))
-                                end if
-
-                                ! follows Einfeldt et al.
-                                ! s_M/P = min/max(0.,s_L/R)
-                                s_M = min(0._wp, s_L); s_P = max(0._wp, s_R)
-
-                                ! goes with q_star_L/R = xi_L/R * (variable)
-                                ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) )
-                                xi_L = (s_L - vel_L(dir_idx(1)))/(s_L - s_S)
-                                xi_R = (s_R - vel_R(dir_idx(1)))/(s_R - s_S)
-
-                                ! goes with numerical velocity in x/y/z directions
-                                ! xi_P/M = 0.5 +/m sgn(0.5,s_star)
-                                xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S))
-                                xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S))
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, contxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        xi_M*alpha_rho_L(i) &
-                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
-                                        + xi_P*alpha_rho_R(i) &
-                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                end do
-
-                                ! Momentum flux.
-                                ! f = \rho u u + p I, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
-                                        xi_M*(rho_L*(vel_L(dir_idx(1))* &
-                                                     vel_L(dir_idx(i)) + &
-                                                     s_M*(xi_L*(dir_flg(dir_idx(i))*s_S + &
-                                                                (1._wp - dir_flg(dir_idx(i)))* &
-                                                                vel_L(dir_idx(i))) - vel_L(dir_idx(i)))) + &
-                                              dir_flg(dir_idx(i))*pres_L) &
-                                        + xi_P*(rho_R*(vel_R(dir_idx(1))* &
-                                                       vel_R(dir_idx(i)) + &
-                                                       s_P*(xi_R*(dir_flg(dir_idx(i))*s_S + &
-                                                                  (1._wp - dir_flg(dir_idx(i)))* &
-                                                                  vel_R(dir_idx(i))) - vel_R(dir_idx(i)))) + &
-                                                dir_flg(dir_idx(i))*pres_R)
-                                end do
-
-                                if (bubbles_euler) then
-                                    ! Put p_tilde in
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_dims
-                                        flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
-                                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) + &
-                                            xi_M*(dir_flg(dir_idx(i))*(-1._wp*ptilde_L)) &
-                                            + xi_P*(dir_flg(dir_idx(i))*(-1._wp*ptilde_R))
-                                    end do
-                                end if
-
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = 0._wp
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = alf_idx, alf_idx !only advect the void fraction
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
-                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
-                                        + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
-                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                end do
-
-                                ! Source for volume fraction advection equation
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-
-                                    vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = 0._wp
-                                    !IF ( (model_eqns == 4) .or. (num_fluids==1) ) vel_src_rs_vf(dir_idx(i))%sf(j,k,l) = 0._wp
-                                end do
-
-                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1))
-
-                                ! Add advection flux for bubble variables
-                                if (bubbles_euler) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = bubxb, bubxe
-                                        flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                            xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
-                                            *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
-                                            + xi_P*nbub_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
-                                            *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                    end do
-                                end if
-
-                                ! Geometrical source flux for cylindrical coordinates
-
-                                #:if (NORM_DIR == 2)
-                                    if (cyl_coord) then
-                                        ! Substituting the advective flux into the inviscid geometrical source flux
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = 1, E_idx
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
-                                        end do
-                                        ! Recalculating the radial momentum geometric source flux
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = &
-                                            xi_M*(rho_L*(vel_L(dir_idx(1))* &
-                                                         vel_L(dir_idx(1)) + &
-                                                         s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + &
-                                                                    (1._wp - dir_flg(dir_idx(1)))* &
-                                                                    vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) &
-                                            + xi_P*(rho_R*(vel_R(dir_idx(1))* &
-                                                           vel_R(dir_idx(1)) + &
-                                                           s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + &
-                                                                      (1._wp - dir_flg(dir_idx(1)))* &
-                                                                      vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
-                                        ! Geometrical source of the void fraction(s) is zero
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = advxb, advxe
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
-                                        end do
-                                    end if
-                                #:endif
-                                #:if (NORM_DIR == 3)
-                                    if (grid_geometry == 3) then
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = 1, sys_size
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
-                                        end do
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb + 1) = &
-                                            -xi_M*(rho_L*(vel_L(dir_idx(1))* &
-                                                          vel_L(dir_idx(1)) + &
-                                                          s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + &
-                                                                     (1._wp - dir_flg(dir_idx(1)))* &
-                                                                     vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) &
-                                            - xi_P*(rho_R*(vel_R(dir_idx(1))* &
-                                                           vel_R(dir_idx(1)) + &
-                                                           s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + &
-                                                                      (1._wp - dir_flg(dir_idx(1)))* &
-                                                                      vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1)
-                                    end if
-                                #:endif
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                elseif (model_eqns == 2 .and. bubbles_euler) then
-                    $:GPU_PARALLEL_LOOP(collapse=3, private='[i, q, R0_L, R0_R, V0_L, V0_R, P0_L, P0_R, pbw_L, pbw_R, vel_L, vel_R, rho_avg, alpha_L, alpha_R, h_avg, gamma_avg, Re_L, Re_R, pcorr, zcoef, rho_L, rho_R, pres_L, pres_R, E_L, E_R, H_L, H_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, qv_L, qv_R, qv_avg, c_L, c_R, c_avg, vel_L_rms, vel_R_rms, vel_avg_rms, vel_L_tmp, vel_R_tmp, Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, alpha_R_sum, s_L, s_R, s_M, s_P, s_S, xi_M, xi_P, xi_L, xi_R, xi_MP, xi_PP, nbub_L, nbub_R, PbwR3Lbar, PbwR3Rbar, R3Lbar, R3Rbar, R3V2Lbar, R3V2Rbar]')
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            do j = is1%beg, is1%end
-
-                                vel_L_rms = 0._wp; vel_R_rms = 0._wp
-                                rho_L = 0._wp; rho_R = 0._wp
-                                gamma_L = 0._wp; gamma_R = 0._wp
-                                pi_inf_L = 0._wp; pi_inf_R = 0._wp
-                                qv_L = 0._wp; qv_R = 0._wp
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
-                                    alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
-                                end do
-
-                                vel_L_rms = 0._wp; vel_R_rms = 0._wp
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
-                                    vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
-                                    vel_L_rms = vel_L_rms + vel_L(i)**2._wp
-                                    vel_R_rms = vel_R_rms + vel_R(i)**2._wp
-                                end do
-
-                                ! Retain this in the refactor
-                                if (mpp_lim .and. (num_fluids > 2)) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                        gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
-                                        pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i)
-                                        qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i)
-                                        rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-                                        gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
-                                        pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i)
-                                        qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i)
-                                    end do
-                                else if (num_fluids > 2) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids - 1
-                                        rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                        gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
-                                        pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i)
-                                        qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i)
-                                        rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-                                        gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
-                                        pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i)
-                                        qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i)
-                                    end do
-                                else
-                                    rho_L = qL_prim_rs${XYZ}$_vf(j, k, l, 1)
-                                    gamma_L = gammas(1)
-                                    pi_inf_L = pi_infs(1)
-                                    qv_L = qvs(1)
-                                    rho_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, 1)
-                                    gamma_R = gammas(1)
-                                    pi_inf_R = pi_infs(1)
-                                    qv_R = qvs(1)
-                                end if
-
-                                if (viscous) then
-                                    if (num_fluids == 1) then ! Need to consider case with num_fluids >= 2
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = 1, 2
-                                            Re_L(i) = dflt_real
-                                            Re_R(i) = dflt_real
-
-                                            if (Re_size(i) > 0) Re_L(i) = 0._wp
-                                            if (Re_size(i) > 0) Re_R(i) = 0._wp
-
-                                            $:GPU_LOOP(parallelism='[seq]')
-                                            do q = 1, Re_size(i)
-                                                Re_L(i) = (1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q)))/Res_gs(i, q) &
-                                                          + Re_L(i)
-                                                Re_R(i) = (1._wp - qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q)))/Res_gs(i, q) &
-                                                          + Re_R(i)
-                                            end do
-
-                                            Re_L(i) = 1._wp/max(Re_L(i), sgm_eps)
-                                            Re_R(i) = 1._wp/max(Re_R(i), sgm_eps)
-
-                                        end do
-                                    end if
-                                end if
-
-                                pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
-                                pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
-
-                                E_L = gamma_L*pres_L + pi_inf_L + 5.e-1_wp*rho_L*vel_L_rms
-                                E_R = gamma_R*pres_R + pi_inf_R + 5.e-1_wp*rho_R*vel_R_rms
-
-                                H_L = (E_L + pres_L)/rho_L
-                                H_R = (E_R + pres_R)/rho_R
-
-                                if (avg_state == 2) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, nb
-                                        R0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, rs(i))
-                                        R0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, rs(i))
-
-                                        V0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, vs(i))
-                                        V0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, vs(i))
-                                        if (.not. polytropic .and. .not. qbmm) then
-                                            P0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, ps(i))
-                                            P0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, ps(i))
-                                        end if
-                                    end do
-
-                                    if (.not. qbmm) then
-                                        if (adv_n) then
-                                            nbub_L = qL_prim_rs${XYZ}$_vf(j, k, l, n_idx)
-                                            nbub_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, n_idx)
-                                        else
-                                            nbub_L = 0._wp
-                                            nbub_R = 0._wp
-                                            $:GPU_LOOP(parallelism='[seq]')
-                                            do i = 1, nb
-                                                nbub_L = nbub_L + (R0_L(i)**3._wp)*weight(i)
-                                                nbub_R = nbub_R + (R0_R(i)**3._wp)*weight(i)
-                                            end do
-
-                                            nbub_L = (3._wp/(4._wp*pi))*qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + num_fluids)/nbub_L
-                                            nbub_R = (3._wp/(4._wp*pi))*qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + num_fluids)/nbub_R
-                                        end if
-                                    else
-                                        !nb stored in 0th moment of first R0 bin in variable conversion module
-                                        nbub_L = qL_prim_rs${XYZ}$_vf(j, k, l, bubxb)
-                                        nbub_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, bubxb)
-                                    end if
-
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, nb
-                                        if (.not. qbmm) then
-                                            pbw_L(i) = f_cpbw_KM(R0(i), R0_L(i), V0_L(i), P0_L(i))
-                                            pbw_R(i) = f_cpbw_KM(R0(i), R0_R(i), V0_R(i), P0_R(i))
-                                        end if
-                                    end do
-
-                                    if (qbmm) then
-                                        PbwR3Lbar = mom_sp_rs${XYZ}$_vf(j, k, l, 4)
-                                        PbwR3Rbar = mom_sp_rs${XYZ}$_vf(j + 1, k, l, 4)
-
-                                        R3Lbar = mom_sp_rs${XYZ}$_vf(j, k, l, 1)
-                                        R3Rbar = mom_sp_rs${XYZ}$_vf(j + 1, k, l, 1)
-
-                                        R3V2Lbar = mom_sp_rs${XYZ}$_vf(j, k, l, 3)
-                                        R3V2Rbar = mom_sp_rs${XYZ}$_vf(j + 1, k, l, 3)
-                                    else
-
-                                        PbwR3Lbar = 0._wp
-                                        PbwR3Rbar = 0._wp
-
-                                        R3Lbar = 0._wp
-                                        R3Rbar = 0._wp
-
-                                        R3V2Lbar = 0._wp
-                                        R3V2Rbar = 0._wp
-
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = 1, nb
-                                            PbwR3Lbar = PbwR3Lbar + pbw_L(i)*(R0_L(i)**3._wp)*weight(i)
-                                            PbwR3Rbar = PbwR3Rbar + pbw_R(i)*(R0_R(i)**3._wp)*weight(i)
-
-                                            R3Lbar = R3Lbar + (R0_L(i)**3._wp)*weight(i)
-                                            R3Rbar = R3Rbar + (R0_R(i)**3._wp)*weight(i)
-
-                                            R3V2Lbar = R3V2Lbar + (R0_L(i)**3._wp)*(V0_L(i)**2._wp)*weight(i)
-                                            R3V2Rbar = R3V2Rbar + (R0_R(i)**3._wp)*(V0_R(i)**2._wp)*weight(i)
-                                        end do
-                                    end if
-
-                                    rho_avg = 5.e-1_wp*(rho_L + rho_R)
-                                    H_avg = 5.e-1_wp*(H_L + H_R)
-                                    gamma_avg = 5.e-1_wp*(gamma_L + gamma_R)
-                                    qv_avg = 5.e-1_wp*(qv_L + qv_R)
-                                    vel_avg_rms = 0._wp
-
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_dims
-                                        vel_avg_rms = vel_avg_rms + (5.e-1_wp*(vel_L(i) + vel_R(i)))**2._wp
-                                    end do
-
-                                end if
-
-                                call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, &
-                                                              vel_L_rms, 0._wp, c_L, qv_L)
-
-                                call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, &
-                                                              vel_R_rms, 0._wp, c_R, qv_R)
-
-                                !> The computation of c_avg does not require all the variables, and therefore the non '_avg'
-                                ! variables are placeholders to call the subroutine.
-                                call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, &
-                                                              vel_avg_rms, 0._wp, c_avg, qv_avg)
-
-                                if (viscous) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, 2
-                                        Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
-                                    end do
-                                end if
-
-                                ! Low Mach correction
-                                if (low_Mach == 2) then
-                                    @:compute_low_Mach_correction()
-                                end if
-
-                                if (wave_speeds == 1) then
-                                    s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R)
-                                    s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L)
-
-                                    s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* &
-                                           (s_L - vel_L(dir_idx(1))) - &
-                                           rho_R*vel_R(dir_idx(1))* &
-                                           (s_R - vel_R(dir_idx(1)))) &
-                                          /(rho_L*(s_L - vel_L(dir_idx(1))) - &
-                                            rho_R*(s_R - vel_R(dir_idx(1))))
-                                elseif (wave_speeds == 2) then
-                                    pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* &
-                                                        (vel_L(dir_idx(1)) - &
-                                                         vel_R(dir_idx(1))))
-
-                                    pres_SR = pres_SL
-
-                                    Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* &
-                                                           (pres_SL/pres_L - 1._wp)*pres_L/ &
-                                                           ((pres_L + pi_inf_L/(1._wp + gamma_L)))))
-                                    Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* &
-                                                           (pres_SR/pres_R - 1._wp)*pres_R/ &
-                                                           ((pres_R + pi_inf_R/(1._wp + gamma_R)))))
-
-                                    s_L = vel_L(dir_idx(1)) - c_L*Ms_L
-                                    s_R = vel_R(dir_idx(1)) + c_R*Ms_R
-
-                                    s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + &
-                                                    (pres_L - pres_R)/ &
-                                                    (rho_avg*c_avg))
-                                end if
-
-                                ! follows Einfeldt et al.
-                                ! s_M/P = min/max(0.,s_L/R)
-                                s_M = min(0._wp, s_L); s_P = max(0._wp, s_R)
-
-                                ! goes with q_star_L/R = xi_L/R * (variable)
-                                ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) )
-                                xi_L = (s_L - vel_L(dir_idx(1)))/(s_L - s_S)
-                                xi_R = (s_R - vel_R(dir_idx(1)))/(s_R - s_S)
-
-                                ! goes with numerical velocity in x/y/z directions
-                                ! xi_P/M = 0.5 +/m sgn(0.5,s_star)
-                                xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S))
-                                xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S))
-
-                                ! Low Mach correction
-                                if (low_Mach == 1) then
-                                    @:compute_low_Mach_correction()
-                                else
-                                    pcorr = 0._wp
-                                end if
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, contxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
-                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
-                                        + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
-                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                end do
-
-                                if (bubbles_euler .and. (num_fluids > 1)) then
-                                    ! Kill mass transport @ gas density
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe) = 0._wp
-                                end if
-
-                                ! Momentum flux.
-                                ! f = \rho u u + p I, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-
-                                ! Include p_tilde
-
-                                if (avg_state == 2) then
-                                    if (alpha_L(num_fluids) < small_alf .or. R3Lbar < small_alf) then
-                                        pres_L = pres_L - alpha_L(num_fluids)*pres_L
-                                    else
-                                        pres_L = pres_L - alpha_L(num_fluids)*(pres_L - PbwR3Lbar/R3Lbar - &
-                                                                               rho_L*R3V2Lbar/R3Lbar)
-                                    end if
-
-                                    if (alpha_R(num_fluids) < small_alf .or. R3Rbar < small_alf) then
-                                        pres_R = pres_R - alpha_R(num_fluids)*pres_R
-                                    else
-                                        pres_R = pres_R - alpha_R(num_fluids)*(pres_R - PbwR3Rbar/R3Rbar - &
-                                                                               rho_R*R3V2Rbar/R3Rbar)
-                                    end if
-                                end if
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
-                                        xi_M*(rho_L*(vel_L(dir_idx(1))* &
-                                                     vel_L(dir_idx(i)) + &
-                                                     s_M*(xi_L*(dir_flg(dir_idx(i))*s_S + &
-                                                                (1._wp - dir_flg(dir_idx(i)))* &
-                                                                vel_L(dir_idx(i))) - vel_L(dir_idx(i)))) + &
-                                              dir_flg(dir_idx(i))*(pres_L)) &
-                                        + xi_P*(rho_R*(vel_R(dir_idx(1))* &
-                                                       vel_R(dir_idx(i)) + &
-                                                       s_P*(xi_R*(dir_flg(dir_idx(i))*s_S + &
-                                                                  (1._wp - dir_flg(dir_idx(i)))* &
-                                                                  vel_R(dir_idx(i))) - vel_R(dir_idx(i)))) + &
-                                                dir_flg(dir_idx(i))*(pres_R)) &
-                                        + (s_M/s_L)*(s_P/s_R)*dir_flg(dir_idx(i))*pcorr
-                                end do
-
-                                ! Energy flux.
-                                ! f = u*(E+p), q = E, q_star = \xi*E+(s-u)(\rho s_star + p/(s-u))
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                    xi_M*(vel_L(dir_idx(1))*(E_L + pres_L) + &
-                                          s_M*(xi_L*(E_L + (s_S - vel_L(dir_idx(1)))* &
-                                                     (rho_L*s_S + (pres_L)/ &
-                                                      (s_L - vel_L(dir_idx(1))))) - E_L)) &
-                                    + xi_P*(vel_R(dir_idx(1))*(E_R + pres_R) + &
-                                            s_P*(xi_R*(E_R + (s_S - vel_R(dir_idx(1)))* &
-                                                       (rho_R*s_S + (pres_R)/ &
-                                                        (s_R - vel_R(dir_idx(1))))) - E_R)) &
-                                    + (s_M/s_L)*(s_P/s_R)*pcorr*s_S
-
-                                ! Volume fraction flux
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = advxb, advxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
-                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
-                                        + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
-                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                end do
-
-                                ! Source for volume fraction advection equation
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = &
-                                        xi_M*(vel_L(dir_idx(i)) + &
-                                              dir_flg(dir_idx(i))* &
-                                              s_M*(xi_L - 1._wp)) &
-                                        + xi_P*(vel_R(dir_idx(i)) + &
-                                                dir_flg(dir_idx(i))* &
-                                                s_P*(xi_R - 1._wp))
-
-                                    !IF ( (model_eqns == 4) .or. (num_fluids==1) ) vel_src_rs_vf(dir_idx(i))%sf(j,k,l) = 0._wp
-                                end do
-
-                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1))
-
-                                ! Add advection flux for bubble variables
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = bubxb, bubxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
-                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
-                                        + xi_P*nbub_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
-                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                end do
-
-                                if (qbmm) then
-                                    flux_rs${XYZ}$_vf(j, k, l, bubxb) = &
-                                        xi_M*nbub_L &
-                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
-                                        + xi_P*nbub_R &
-                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                end if
-
-                                if (adv_n) then
-                                    flux_rs${XYZ}$_vf(j, k, l, n_idx) = &
-                                        xi_M*nbub_L &
-                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
-                                        + xi_P*nbub_R &
-                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                end if
-
-                                ! Geometrical source flux for cylindrical coordinates
-                                #:if (NORM_DIR == 2)
-                                    if (cyl_coord) then
-                                        ! Substituting the advective flux into the inviscid geometrical source flux
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = 1, E_idx
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
-                                        end do
-                                        ! Recalculating the radial momentum geometric source flux
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = &
-                                            xi_M*(rho_L*(vel_L(dir_idx(1))* &
-                                                         vel_L(dir_idx(1)) + &
-                                                         s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + &
-                                                                    (1._wp - dir_flg(dir_idx(1)))* &
-                                                                    vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) &
-                                            + xi_P*(rho_R*(vel_R(dir_idx(1))* &
-                                                           vel_R(dir_idx(1)) + &
-                                                           s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + &
-                                                                      (1._wp - dir_flg(dir_idx(1)))* &
-                                                                      vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
-                                        ! Geometrical source of the void fraction(s) is zero
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = advxb, advxe
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
-                                        end do
-                                    end if
-                                #:endif
-                                #:if (NORM_DIR == 3)
-                                    if (grid_geometry == 3) then
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = 1, sys_size
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
-                                        end do
-
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb + 1) = &
-                                            -xi_M*(rho_L*(vel_L(dir_idx(1))* &
-                                                          vel_L(dir_idx(1)) + &
-                                                          s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + &
-                                                                     (1._wp - dir_flg(dir_idx(1)))* &
-                                                                     vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) &
-                                            - xi_P*(rho_R*(vel_R(dir_idx(1))* &
-                                                           vel_R(dir_idx(1)) + &
-                                                           s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + &
-                                                                      (1._wp - dir_flg(dir_idx(1)))* &
-                                                                      vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1)
-
-                                    end if
-                                #:endif
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                else
-                    ! 5-EQUATION MODEL WITH HLLC
-                    $:GPU_PARALLEL_LOOP(collapse=3, private='[Re_max, i, q, T_L, T_R, vel_L_rms, vel_R_rms, pres_L, pres_R, rho_L, gamma_L, pi_inf_L, qv_L, rho_R, gamma_R, pi_inf_R, qv_R, alpha_L_sum, alpha_R_sum, E_L, E_R, MW_L, MW_R, R_gas_L, R_gas_R, Cp_L, Cp_R, Cv_L, Cv_R, Gamm_L, Gamm_R, Y_L, Y_R, H_L, H_R, qv_avg, rho_avg, gamma_avg, H_avg, c_L, c_R, c_avg, s_P, s_M, xi_P, xi_M, xi_L, xi_R, Ms_L, Ms_R, pres_SL, pres_SR, vel_L, vel_R, Re_L, Re_R, alpha_L, alpha_R, s_L, s_R, s_S, vel_avg_rms, pcorr, zcoef, vel_L_tmp, vel_R_tmp, Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, tau_e_L, tau_e_R, xi_field_L, xi_field_R, Yi_avg,Phi_avg, h_iL, h_iR, h_avg_2, G_L, G_R]', copyin='[is1, is2, is3]')
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            do j = is1%beg, is1%end
-
-                                vel_L_rms = 0._wp; vel_R_rms = 0._wp
-                                rho_L = 0._wp; rho_R = 0._wp
-                                gamma_L = 0._wp; gamma_R = 0._wp
-                                pi_inf_L = 0._wp; pi_inf_R = 0._wp
-                                qv_L = 0._wp; qv_R = 0._wp
-                                alpha_L_sum = 0._wp; alpha_R_sum = 0._wp
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
-                                    alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
-                                end do
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
-                                    vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
-                                    vel_L_rms = vel_L_rms + vel_L(i)**2._wp
-                                    vel_R_rms = vel_R_rms + vel_R(i)**2._wp
-                                end do
-
-                                pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
-                                pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
-
-                                ! Change this by splitting it into the cases
-                                ! present in the bubbles_euler
-                                if (mpp_lim) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i))
-                                        qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp)
-                                        qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i))
-                                        qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp)
-                                        alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
-                                        alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
-                                    end do
-
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps)
-                                        qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps)
-                                    end do
-                                end if
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                    gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
-                                    pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i)
-                                    qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i)
-
-                                    rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-                                    gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
-                                    pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i)
-                                    qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i)
-                                end do
-
-                                Re_max = 0
-                                if (Re_size(1) > 0) Re_max = 1
-                                if (Re_size(2) > 0) Re_max = 2
-
-                                if (viscous) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, Re_max
-                                        Re_L(i) = 0._wp
-                                        Re_R(i) = 0._wp
-
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do q = 1, Re_size(i)
-                                            Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) &
-                                                      + Re_L(i)
-                                            Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) &
-                                                      + Re_R(i)
-                                        end do
-
-                                        Re_L(i) = 1._wp/max(Re_L(i), sgm_eps)
-                                        Re_R(i) = 1._wp/max(Re_R(i), sgm_eps)
-                                    end do
-                                end if
-
-                                if (chemistry) then
-                                    c_sum_Yi_Phi = 0.0_wp
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = chemxb, chemxe
-                                        Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                        Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-                                    end do
-
-                                    call get_mixture_molecular_weight(Ys_L, MW_L)
-                                    call get_mixture_molecular_weight(Ys_R, MW_R)
-
-                                    #:if USING_AMD
-                                        Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
-                                        Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
-                                    #:else
-                                        Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
-                                        Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:)
-                                    #:endif
-
-                                    R_gas_L = gas_constant/MW_L
-                                    R_gas_R = gas_constant/MW_R
-
-                                    T_L = pres_L/rho_L/R_gas_L
-                                    T_R = pres_R/rho_R/R_gas_R
-
-                                    call get_species_specific_heats_r(T_L, Cp_iL)
-                                    call get_species_specific_heats_r(T_R, Cp_iR)
-
-                                    if (chem_params%gamma_method == 1) then
-                                        !> gamma_method = 1: Ref. Section 2.3.1 Formulation of doi:10.7907/ZKW8-ES97.
-                                        Gamma_iL = Cp_iL/(Cp_iL - 1.0_wp)
-                                        Gamma_iR = Cp_iR/(Cp_iR - 1.0_wp)
-
-                                        gamma_L = sum(Xs_L(:)/(Gamma_iL(:) - 1.0_wp))
-                                        gamma_R = sum(Xs_R(:)/(Gamma_iR(:) - 1.0_wp))
-                                    else if (chem_params%gamma_method == 2) then
-                                        !> gamma_method = 2: c_p / c_v where c_p, c_v are specific heats.
-                                        call get_mixture_specific_heat_cp_mass(T_L, Ys_L, Cp_L)
-                                        call get_mixture_specific_heat_cp_mass(T_R, Ys_R, Cp_R)
-                                        call get_mixture_specific_heat_cv_mass(T_L, Ys_L, Cv_L)
-                                        call get_mixture_specific_heat_cv_mass(T_R, Ys_R, Cv_R)
-
-                                        Gamm_L = Cp_L/Cv_L; Gamm_R = Cp_R/Cv_R
-                                        gamma_L = 1.0_wp/(Gamm_L - 1.0_wp); gamma_R = 1.0_wp/(Gamm_R - 1.0_wp)
-                                    end if
-
-                                    call get_mixture_energy_mass(T_L, Ys_L, E_L)
-                                    call get_mixture_energy_mass(T_R, Ys_R, E_R)
-
-                                    E_L = rho_L*E_L + 5.e-1*rho_L*vel_L_rms
-                                    E_R = rho_R*E_R + 5.e-1*rho_R*vel_R_rms
-                                    H_L = (E_L + pres_L)/rho_L
-                                    H_R = (E_R + pres_R)/rho_R
-                                else
-                                    E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L
-                                    E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R
-
-                                    H_L = (E_L + pres_L)/rho_L
-                                    H_R = (E_R + pres_R)/rho_R
-                                end if
-
-                                ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY
-                                if (hypoelasticity) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, strxe - strxb + 1
-                                        tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
-                                        tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
-                                    end do
-                                    G_L = 0._wp
-                                    G_R = 0._wp
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        G_L = G_L + alpha_L(i)*Gs_rs(i)
-                                        G_R = G_R + alpha_R(i)*Gs_rs(i)
-                                    end do
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, strxe - strxb + 1
-                                        ! Elastic contribution to energy if G large enough
-                                        if ((G_L > verysmall) .and. (G_R > verysmall)) then
-                                            E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
-                                            E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
-                                            ! Additional terms in 2D and 3D
-                                            if ((i == 2) .or. (i == 4) .or. (i == 5)) then
-                                                E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
-                                                E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
-                                            end if
-                                        end if
-                                    end do
-                                end if
-
-                                ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY
-                                if (hyperelasticity) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_dims
-                                        xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
-                                        xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
-                                    end do
-                                    G_L = 0._wp
-                                    G_R = 0._wp
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        ! Mixture left and right shear modulus
-                                        G_L = G_L + alpha_L(i)*Gs_rs(i)
-                                        G_R = G_R + alpha_R(i)*Gs_rs(i)
-                                    end do
-                                    ! Elastic contribution to energy if G large enough
-                                    if (G_L > verysmall .and. G_R > verysmall) then
-                                        E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
-                                        E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
-                                    end if
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, b_size - 1
-                                        tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
-                                        tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
-                                    end do
-                                end if
-
-                                H_L = (E_L + pres_L)/rho_L
-                                H_R = (E_R + pres_R)/rho_R
-
-                                @:compute_average_state()
-
-                                call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, &
-                                                              vel_L_rms, 0._wp, c_L, qv_L)
-
-                                call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, &
-                                                              vel_R_rms, 0._wp, c_R, qv_R)
-
-                                !> The computation of c_avg does not require all the variables, and therefore the non '_avg'
-                                !  variables are placeholders to call the subroutine.
-                                call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, &
-                                                              vel_avg_rms, c_sum_Yi_Phi, c_avg, qv_avg)
-
-                                if (viscous) then
-                                    if (chemistry) then
-                                        call compute_viscosity_and_inversion(T_L, Ys_L, T_R, Ys_R, Re_L(1), Re_R(1))
-                                    end if
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, 2
-                                        Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
-                                    end do
-                                end if
-
-                                ! Low Mach correction
-                                if (low_Mach == 2) then
-                                    @:compute_low_Mach_correction()
-                                end if
-
-                                if (wave_speeds == 1) then
-                                    if (elasticity) then
-                                        s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + &
-                                                                           (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L), vel_R(dir_idx(1)) - sqrt(c_R*c_R + &
-                                                                                                                                                            (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R))
-                                        s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + &
-                                                                           (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R), vel_L(dir_idx(1)) + sqrt(c_L*c_L + &
-                                                                                                                                                            (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L))
-                                        s_S = (pres_R - tau_e_R(dir_idx_tau(1)) - pres_L + &
-                                               tau_e_L(dir_idx_tau(1)) + rho_L*vel_L(dir_idx(1))*(s_L - vel_L(dir_idx(1))) - &
-                                               rho_R*vel_R(dir_idx(1))*(s_R - vel_R(dir_idx(1))))/(rho_L*(s_L - vel_L(dir_idx(1))) - &
-                                                                                                   rho_R*(s_R - vel_R(dir_idx(1))))
-                                    else
-                                        s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R)
-                                        s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L)
-                                        s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* &
-                                               (s_L - vel_L(dir_idx(1))) - rho_R*vel_R(dir_idx(1))*(s_R - vel_R(dir_idx(1)))) &
-                                              /(rho_L*(s_L - vel_L(dir_idx(1))) - rho_R*(s_R - vel_R(dir_idx(1))))
-
-                                    end if
-                                elseif (wave_speeds == 2) then
-                                    pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* &
-                                                        (vel_L(dir_idx(1)) - &
-                                                         vel_R(dir_idx(1))))
-
-                                    pres_SR = pres_SL
-
-                                    Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* &
-                                                           (pres_SL/pres_L - 1._wp)*pres_L/ &
-                                                           ((pres_L + pi_inf_L/(1._wp + gamma_L)))))
-                                    Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* &
-                                                           (pres_SR/pres_R - 1._wp)*pres_R/ &
-                                                           ((pres_R + pi_inf_R/(1._wp + gamma_R)))))
-
-                                    s_L = vel_L(dir_idx(1)) - c_L*Ms_L
-                                    s_R = vel_R(dir_idx(1)) + c_R*Ms_R
-
-                                    s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + &
-                                                    (pres_L - pres_R)/ &
-                                                    (rho_avg*c_avg))
-                                end if
-
-                                ! follows Einfeldt et al.
-                                ! s_M/P = min/max(0.,s_L/R)
-                                s_M = min(0._wp, s_L); s_P = max(0._wp, s_R)
-
-                                ! goes with q_star_L/R = xi_L/R * (variable)
-                                ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) )
-                                xi_L = (s_L - vel_L(dir_idx(1)))/(s_L - s_S)
-                                xi_R = (s_R - vel_R(dir_idx(1)))/(s_R - s_S)
-
-                                ! goes with numerical velocity in x/y/z directions
-                                ! xi_P/M = 0.5 +/m sgn(0.5,s_star)
-                                xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S))
-                                xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S))
-
-                                ! Low Mach correction
-                                if (low_Mach == 1) then
-                                    @:compute_low_Mach_correction()
-                                else
-                                    pcorr = 0._wp
-                                end if
-
-                                ! COMPUTING THE HLLC FLUXES
-                                ! MASS FLUX.
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, contxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
-                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
-                                        + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
-                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                end do
-
-                                ! MOMENTUM FLUX.
-                                ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
-                                        xi_M*(rho_L*(vel_L(dir_idx(1))* &
-                                                     vel_L(dir_idx(i)) + &
-                                                     s_M*(xi_L*(dir_flg(dir_idx(i))*s_S + &
-                                                                (1._wp - dir_flg(dir_idx(i)))* &
-                                                                vel_L(dir_idx(i))) - vel_L(dir_idx(i)))) + &
-                                              dir_flg(dir_idx(i))*(pres_L)) &
-                                        + xi_P*(rho_R*(vel_R(dir_idx(1))* &
-                                                       vel_R(dir_idx(i)) + &
-                                                       s_P*(xi_R*(dir_flg(dir_idx(i))*s_S + &
-                                                                  (1._wp - dir_flg(dir_idx(i)))* &
-                                                                  vel_R(dir_idx(i))) - vel_R(dir_idx(i)))) + &
-                                                dir_flg(dir_idx(i))*(pres_R)) &
-                                        + (s_M/s_L)*(s_P/s_R)*dir_flg(dir_idx(i))*pcorr
-                                end do
-
-                                ! ENERGY FLUX.
-                                ! f = u*(E-\sigma), q = E, q_star = \xi*E+(s-u)(\rho s_star - \sigma/(s-u))
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                    xi_M*(vel_L(dir_idx(1))*(E_L + pres_L) + &
-                                          s_M*(xi_L*(E_L + (s_S - vel_L(dir_idx(1)))* &
-                                                     (rho_L*s_S + pres_L/ &
-                                                      (s_L - vel_L(dir_idx(1))))) - E_L)) &
-                                    + xi_P*(vel_R(dir_idx(1))*(E_R + pres_R) + &
-                                            s_P*(xi_R*(E_R + (s_S - vel_R(dir_idx(1)))* &
-                                                       (rho_R*s_S + pres_R/ &
-                                                        (s_R - vel_R(dir_idx(1))))) - E_R)) &
-                                    + (s_M/s_L)*(s_P/s_R)*pcorr*s_S
-
-                                ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux
-                                if (elasticity) then
-                                    flux_ene_e = 0._wp
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_dims
-                                        ! MOMENTUM ELASTIC FLUX.
-                                        flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
-                                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) &
-                                            - xi_M*tau_e_L(dir_idx_tau(i)) - xi_P*tau_e_R(dir_idx_tau(i))
-                                        ! ENERGY ELASTIC FLUX.
-                                        flux_ene_e = flux_ene_e - &
-                                                     xi_M*(vel_L(dir_idx(i))*tau_e_L(dir_idx_tau(i)) + &
-                                                           s_M*(xi_L*((s_S - vel_L(i))*(tau_e_L(dir_idx_tau(i))/(s_L - vel_L(i)))))) - &
-                                                     xi_P*(vel_R(dir_idx(i))*tau_e_R(dir_idx_tau(i)) + &
-                                                           s_P*(xi_R*((s_S - vel_R(i))*(tau_e_R(dir_idx_tau(i))/(s_R - vel_R(i))))))
-                                    end do
-                                    flux_rs${XYZ}$_vf(j, k, l, E_idx) = flux_rs${XYZ}$_vf(j, k, l, E_idx) + flux_ene_e
-                                end if
-
-                                ! HYPOELASTIC STRESS EVOLUTION FLUX.
-                                if (hypoelasticity) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, strxe - strxb + 1
-                                        flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
-                                            xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(dir_idx(1))*tau_e_L(i)) + &
-                                            xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*tau_e_R(i) - rho_R*vel_R(dir_idx(1))*tau_e_R(i))
-                                    end do
-                                end if
-
-                                ! VOLUME FRACTION FLUX.
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = advxb, advxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
-                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
-                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
-                                        + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
-                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                end do
-
-                                ! VOLUME FRACTION SOURCE FLUX.
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = &
-                                        xi_M*(vel_L(dir_idx(i)) + &
-                                              dir_flg(dir_idx(i))* &
-                                              s_M*(xi_L - 1._wp)) &
-                                        + xi_P*(vel_R(dir_idx(i)) + &
-                                                dir_flg(dir_idx(i))* &
-                                                s_P*(xi_R - 1._wp))
-                                end do
-
-                                ! COLOR FUNCTION FLUX
-                                if (surface_tension) then
-                                    flux_rs${XYZ}$_vf(j, k, l, c_idx) = &
-                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, c_idx) &
-                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
-                                        + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, c_idx) &
-                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                end if
-
-                                ! REFERENCE MAP FLUX.
-                                if (hyperelasticity) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_dims
-                                        flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = &
-                                            xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) &
-                                                                    - rho_L*vel_L(dir_idx(1))*xi_field_L(i)) + &
-                                            xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*xi_field_R(i) &
-                                                                    - rho_R*vel_R(dir_idx(1))*xi_field_R(i))
-                                    end do
-                                end if
-
-                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1))
-
-                                if (chemistry) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = chemxb, chemxe
-                                        Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                        Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-
-                                        flux_rs${XYZ}$_vf(j, k, l, i) = xi_M*rho_L*Y_L*(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
-                                                                        + xi_P*rho_R*Y_R*(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
-                                        flux_src_rs${XYZ}$_vf(j, k, l, i) = 0.0_wp
-                                    end do
-                                end if
-
-                                ! Geometrical source flux for cylindrical coordinates
-                                #:if (NORM_DIR == 2)
-                                    if (cyl_coord) then
-                                        !Substituting the advective flux into the inviscid geometrical source flux
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = 1, E_idx
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
-                                        end do
-                                        ! Recalculating the radial momentum geometric source flux
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = &
-                                            xi_M*(rho_L*(vel_L(dir_idx(1))* &
-                                                         vel_L(dir_idx(1)) + &
-                                                         s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + &
-                                                                    (1._wp - dir_flg(dir_idx(1)))* &
-                                                                    vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) &
-                                            + xi_P*(rho_R*(vel_R(dir_idx(1))* &
-                                                           vel_R(dir_idx(1)) + &
-                                                           s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + &
-                                                                      (1._wp - dir_flg(dir_idx(1)))* &
-                                                                      vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
-                                        ! Geometrical source of the void fraction(s) is zero
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = advxb, advxe
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
-                                        end do
-                                    end if
-                                #:endif
-                                #:if (NORM_DIR == 3)
-                                    if (grid_geometry == 3) then
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do i = 1, sys_size
-                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
-                                        end do
-
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb + 1) = &
-                                            -xi_M*(rho_L*(vel_L(dir_idx(1))* &
-                                                          vel_L(dir_idx(1)) + &
-                                                          s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + &
-                                                                     (1._wp - dir_flg(dir_idx(1)))* &
-                                                                     vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) &
-                                            - xi_P*(rho_R*(vel_R(dir_idx(1))* &
-                                                           vel_R(dir_idx(1)) + &
-                                                           s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + &
-                                                                      (1._wp - dir_flg(dir_idx(1)))* &
-                                                                      vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
-                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1)
-
-                                    end if
-                                #:endif
-
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                end if
-            end if
-        #:endfor
-        ! Computing HLLC flux and source flux for Euler system of equations
-
-        if (viscous .or. dummy) then
-            if (weno_Re_flux) then
-                call s_compute_viscous_source_flux( &
-                    qL_prim_vf(momxb:momxe), &
-                    dqL_prim_dx_vf(momxb:momxe), &
-                    dqL_prim_dy_vf(momxb:momxe), &
-                    dqL_prim_dz_vf(momxb:momxe), &
-                    qR_prim_vf(momxb:momxe), &
-                    dqR_prim_dx_vf(momxb:momxe), &
-                    dqR_prim_dy_vf(momxb:momxe), &
-                    dqR_prim_dz_vf(momxb:momxe), &
-                    flux_src_vf, norm_dir, ix, iy, iz)
-            else
-                call s_compute_viscous_source_flux( &
-                    q_prim_vf(momxb:momxe), &
-                    dqL_prim_dx_vf(momxb:momxe), &
-                    dqL_prim_dy_vf(momxb:momxe), &
-                    dqL_prim_dz_vf(momxb:momxe), &
-                    q_prim_vf(momxb:momxe), &
-                    dqR_prim_dx_vf(momxb:momxe), &
-                    dqR_prim_dy_vf(momxb:momxe), &
-                    dqR_prim_dz_vf(momxb:momxe), &
-                    flux_src_vf, norm_dir, ix, iy, iz)
-            end if
-        end if
-
-        if (surface_tension) then
-            call s_compute_capillary_source_flux( &
-                vel_src_rsx_vf, &
-                vel_src_rsy_vf, &
-                vel_src_rsz_vf, &
-                flux_src_vf, &
-                norm_dir, isx, isy, isz)
-        end if
-
-        call s_finalize_riemann_solver(flux_vf, flux_src_vf, &
-                                       flux_gsrc_vf, &
-                                       norm_dir)
-
-    end subroutine s_hllc_riemann_solver
-
-    !> HLLD Riemann solver resolves 5 of the 7 waves of MHD equations:
-        !!      1 entropy wave, 2 Alfvén waves, 2 fast magnetosonic waves.
-    subroutine s_hlld_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, &
-                                     dqL_prim_dx_vf, dqL_prim_dy_vf, dqL_prim_dz_vf, &
-                                     qL_prim_vf, &
-                                     qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, &
-                                     dqR_prim_dx_vf, dqR_prim_dy_vf, dqR_prim_dz_vf, &
-                                     qR_prim_vf, &
-                                     q_prim_vf, &
-                                     flux_vf, flux_src_vf, flux_gsrc_vf, &
-                                     norm_dir, ix, iy, iz)
-
-        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, &
-                                                                                                     qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
-
-        type(scalar_field), allocatable, dimension(:), intent(inout) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
-                                                                        dqL_prim_dy_vf, dqR_prim_dy_vf, &
-                                                                        dqL_prim_dz_vf, dqR_prim_dz_vf
-
-        type(scalar_field), allocatable, dimension(:), intent(inout) :: qL_prim_vf, qR_prim_vf
-
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
-        type(scalar_field), dimension(sys_size), intent(inout) :: flux_vf, flux_src_vf, flux_gsrc_vf
-
-        integer, intent(in) :: norm_dir
-        type(int_bounds_info), intent(in) :: ix, iy, iz
-
-        ! Local variables:
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3) :: alpha_L, alpha_R, alpha_rho_L, alpha_rho_R
-        #:else
-            real(wp), dimension(num_fluids) :: alpha_L, alpha_R, alpha_rho_L, alpha_rho_R
-        #:endif
-        type(riemann_states_vec3) :: vel
-        type(riemann_states) :: rho, pres, E, H_no_mag
-        type(riemann_states) :: gamma, pi_inf, qv
-        type(riemann_states) :: vel_rms
-
-        type(riemann_states_vec3) :: B
-        type(riemann_states) :: c, c_fast, pres_mag
-
-        ! HLLD speeds and intermediate state variables:
-        real(wp) :: s_L, s_R, s_M, s_starL, s_starR
-        real(wp) :: pTot_L, pTot_R, p_star, rhoL_star, rhoR_star, E_starL, E_starR
-
-        real(wp), dimension(7) :: U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR
-        real(wp), dimension(7) :: F_L, F_R, F_starL, F_starR, F_hlld
-
-        ! Indices for U and F: (rho, rho*vel(1), rho*vel(2), rho*vel(3), By, Bz, E)
-        !   Note: vel and B are permutated, so vel(1) is the normal velocity, and x is the normal direction
-        !   Note: Bx is omitted as the magnetic flux is always zero in the normal direction
-
-        real(wp) :: sqrt_rhoL_star, sqrt_rhoR_star, denom_ds, sign_Bx
-        real(wp) :: vL_star, vR_star, wL_star, wR_star
-        real(wp) :: v_double, w_double, By_double, Bz_double, E_doubleL, E_doubleR, E_double
-
-        integer :: i, j, k, l
-
-        call s_populate_riemann_states_variables_buffers( &
-            qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
-            dqL_prim_dy_vf, dqL_prim_dz_vf, &
-            qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
-            dqR_prim_dy_vf, dqR_prim_dz_vf, &
-            norm_dir, ix, iy, iz)
-
-        call s_initialize_riemann_solver( &
-            flux_src_vf, norm_dir)
-
-        #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
-            if (norm_dir == ${NORM_DIR}$) then
-                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, vel, alpha_L, alpha_R, rho, pres,E, H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR, F_L, F_R, F_starL, F_starR, F_hlld, s_L, s_R, s_M, s_starL, s_starR, pTot_L, pTot_R, p_star, rhoL_star, rhoR_star, E_starL, E_starR, sqrt_rhoL_star, sqrt_rhoR_star, denom_ds, sign_Bx, vL_star, vR_star, wL_star, wR_star, v_double, w_double, By_double, Bz_double, E_doubleL, E_doubleR, E_double]', copyin='[norm_dir]')
-                do l = is3%beg, is3%end
-                    do k = is2%beg, is2%end
-                        do j = is1%beg, is1%end
-
-                            ! (1) Extract the left/right primitive states
-                            do i = 1, contxe
-                                alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-                            end do
-
-                            ! NOTE: unlike HLL & HLLC, vel_L here is permutated by dir_idx for simpler logic
-                            do i = 1, num_vels
-                                vel%L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i))
-                                vel%R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + dir_idx(i))
-                            end do
-
-                            vel_rms%L = sum(vel%L**2._wp)
-                            vel_rms%R = sum(vel%R**2._wp)
-
-                            do i = 1, num_fluids
-                                alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
-                                alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
-                            end do
-
-                            pres%L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
-                            pres%R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
-
-                            ! NOTE: unlike HLL, Bx, By, Bz are permutated by dir_idx for simpler logic
-                            if (mhd) then
-                                if (n == 0) then ! 1D: constant Bx; By, Bz as variables; only in x so not permutated
-                                    B%L = [Bx0, qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg), qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)]
-                                    B%R = [Bx0, qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg), qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)]
-                                else ! 2D/3D: Bx, By, Bz as variables
-                                    B%L = [qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(1) - 1), &
-                                           qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(2) - 1), &
-                                           qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(3) - 1)]
-                                    B%R = [qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(1) - 1), &
-                                           qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(2) - 1), &
-                                           qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(3) - 1)]
-                                end if
-                            end if
-
-                            ! Sum properties of all fluid components
-                            rho%L = 0._wp; gamma%L = 0._wp; pi_inf%L = 0._wp; qv%L = 0._wp
-                            rho%R = 0._wp; gamma%R = 0._wp; pi_inf%R = 0._wp; qv%R = 0._wp
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                rho%L = rho%L + alpha_rho_L(i)
-                                gamma%L = gamma%L + alpha_L(i)*gammas(i)
-                                pi_inf%L = pi_inf%L + alpha_L(i)*pi_infs(i)
-                                qv%L = qv%L + alpha_rho_L(i)*qvs(i)
-
-                                rho%R = rho%R + alpha_rho_R(i)
-                                gamma%R = gamma%R + alpha_R(i)*gammas(i)
-                                pi_inf%R = pi_inf%R + alpha_R(i)*pi_infs(i)
-                                qv%R = qv%R + alpha_rho_R(i)*qvs(i)
-                            end do
-
-                            pres_mag%L = 0.5_wp*sum(B%L**2._wp)
-                            pres_mag%R = 0.5_wp*sum(B%R**2._wp)
-                            E%L = gamma%L*pres%L + pi_inf%L + 0.5_wp*rho%L*vel_rms%L + qv%L + pres_mag%L
-                            E%R = gamma%R*pres%R + pi_inf%R + 0.5_wp*rho%R*vel_rms%R + qv%R + pres_mag%R ! includes magnetic energy
-                            H_no_mag%L = (E%L + pres%L - pres_mag%L)/rho%L
-                            H_no_mag%R = (E%R + pres%R - pres_mag%R)/rho%R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
-
-                            ! (2) Compute fast wave speeds
-                            call s_compute_speed_of_sound(pres%L, rho%L, gamma%L, pi_inf%L, H_no_mag%L, alpha_L, vel_rms%L, 0._wp, c%L, qv%L)
-                            call s_compute_speed_of_sound(pres%R, rho%R, gamma%R, pi_inf%R, H_no_mag%R, alpha_R, vel_rms%R, 0._wp, c%R, qv%R)
-                            call s_compute_fast_magnetosonic_speed(rho%L, c%L, B%L, norm_dir, c_fast%L, H_no_mag%L)
-                            call s_compute_fast_magnetosonic_speed(rho%R, c%R, B%R, norm_dir, c_fast%R, H_no_mag%R)
-
-                            ! (3) Compute contact speed s_M [Miyoshi Equ. (38)]
-                            s_L = min(vel%L(1) - c_fast%L, vel%R(1) - c_fast%R)
-                            s_R = max(vel%R(1) + c_fast%R, vel%L(1) + c_fast%L)
-
-                            pTot_L = pres%L + pres_mag%L
-                            pTot_R = pres%R + pres_mag%R
-
-                            s_M = (((s_R - vel%R(1))*rho%R*vel%R(1) - &
-                                    (s_L - vel%L(1))*rho%L*vel%L(1) - pTot_R + pTot_L)/ &
-                                   ((s_R - vel%R(1))*rho%R - (s_L - vel%L(1))*rho%L))
-
-                            ! (4) Compute star state variables
-                            rhoL_star = rho%L*(s_L - vel%L(1))/(s_L - s_M)
-                            rhoR_star = rho%R*(s_R - vel%R(1))/(s_R - s_M)
-                            p_star = pTot_L + rho%L*(s_L - vel%L(1))*(s_M - vel%L(1))/(s_L - s_M)
-                            E_starL = ((s_L - vel%L(1))*E%L - pTot_L*vel%L(1) + p_star*s_M)/(s_L - s_M)
-                            E_starR = ((s_R - vel%R(1))*E%R - pTot_R*vel%R(1) + p_star*s_M)/(s_R - s_M)
-
-                            ! (5) Compute left/right state vectors and fluxes
-                            U_L = [rho%L, rho%L*vel%L(1:3), B%L(2:3), E%L]
-                            U_starL = [rhoL_star, rhoL_star*s_M, rhoL_star*vel%L(2:3), B%L(2:3), E_starL]
-                            U_R = [rho%R, rho%R*vel%R(1:3), B%R(2:3), E%R]
-                            U_starR = [rhoR_star, rhoR_star*s_M, rhoR_star*vel%R(2:3), B%R(2:3), E_starR]
-
-                            ! Compute the left/right fluxes
-                            F_L(1) = U_L(2)
-                            F_L(2) = U_L(2)*vel%L(1) - B%L(1)*B%L(1) + pTot_L
-                            F_L(3:4) = U_L(2)*vel%L(2:3) - B%L(1)*B%L(2:3)
-                            F_L(5:6) = vel%L(1)*B%L(2:3) - vel%L(2:3)*B%L(1)
-                            F_L(7) = (E%L + pTot_L)*vel%L(1) - B%L(1)*(vel%L(1)*B%L(1) + vel%L(2)*B%L(2) + vel%L(3)*B%L(3))
-
-                            F_R(1) = U_R(2)
-                            F_R(2) = U_R(2)*vel%R(1) - B%R(1)*B%R(1) + pTot_R
-                            F_R(3:4) = U_R(2)*vel%R(2:3) - B%R(1)*B%R(2:3)
-                            F_R(5:6) = vel%R(1)*B%R(2:3) - vel%R(2:3)*B%R(1)
-                            F_R(7) = (E%R + pTot_R)*vel%R(1) - B%R(1)*(vel%R(1)*B%R(1) + vel%R(2)*B%R(2) + vel%R(3)*B%R(3))
-                            ! Compute the star flux using HLL relation
-                            F_starL = F_L + s_L*(U_starL - U_L)
-                            F_starR = F_R + s_R*(U_starR - U_R)
-                            ! Compute the rotational (Alfvén) speeds
-                            s_starL = s_M - abs(B%L(1))/sqrt(rhoL_star)
-                            s_starR = s_M + abs(B%L(1))/sqrt(rhoR_star)
-                            ! Compute the double–star states [Miyoshi Eqns. (59)-(62)]
-                            sqrt_rhoL_star = sqrt(rhoL_star); sqrt_rhoR_star = sqrt(rhoR_star)
-                            vL_star = vel%L(2); wL_star = vel%L(3)
-                            vR_star = vel%R(2); wR_star = vel%R(3)
-
-                            ! (6) Compute the double–star states [Miyoshi Eqns. (59)-(62)]
-                            denom_ds = sqrt_rhoL_star + sqrt_rhoR_star
-                            sign_Bx = sign(1._wp, B%L(1))
-                            v_double = (sqrt_rhoL_star*vL_star + sqrt_rhoR_star*vR_star + (B%R(2) - B%L(2))*sign_Bx)/denom_ds
-                            w_double = (sqrt_rhoL_star*wL_star + sqrt_rhoR_star*wR_star + (B%R(3) - B%L(3))*sign_Bx)/denom_ds
-                            By_double = (sqrt_rhoL_star*B%R(2) + sqrt_rhoR_star*B%L(2) + sqrt_rhoL_star*sqrt_rhoR_star*(vR_star - vL_star)*sign_Bx)/denom_ds
-                            Bz_double = (sqrt_rhoL_star*B%R(3) + sqrt_rhoR_star*B%L(3) + sqrt_rhoL_star*sqrt_rhoR_star*(wR_star - wL_star)*sign_Bx)/denom_ds
-
-                            E_doubleL = E_starL - sqrt_rhoL_star*((vL_star*B%L(2) + wL_star*B%L(3)) - (v_double*By_double + w_double*Bz_double))*sign_Bx
-                            E_doubleR = E_starR + sqrt_rhoR_star*((vR_star*B%R(2) + wR_star*B%R(3)) - (v_double*By_double + w_double*Bz_double))*sign_Bx
-                            E_double = 0.5_wp*(E_doubleL + E_doubleR)
-
-                            U_doubleL = [rhoL_star, rhoL_star*s_M, rhoL_star*v_double, rhoL_star*w_double, By_double, Bz_double, E_double]
-                            U_doubleR = [rhoR_star, rhoR_star*s_M, rhoR_star*v_double, rhoR_star*w_double, By_double, Bz_double, E_double]
-
-                            ! (11) Choose HLLD flux based on wave-speed regions
-                            if (0.0_wp <= s_L) then
-                                F_hlld = F_L
-                            else if (0.0_wp <= s_starL) then
-                                F_hlld = F_L + s_L*(U_starL - U_L)
-                            else if (0.0_wp <= s_M) then
-                                F_hlld = F_starL + s_starL*(U_doubleL - U_starL)
-                            else if (0.0_wp <= s_starR) then
-                                F_hlld = F_starR + s_starR*(U_doubleR - U_starR)
-                            else if (0.0_wp <= s_R) then
-                                F_hlld = F_R + s_R*(U_starR - U_R)
-                            else
-                                F_hlld = F_R
-                            end if
-
-                            ! (12) Reorder and write temporary variables to the flux array
-                            ! Mass
-                            flux_rs${XYZ}$_vf(j, k, l, 1) = F_hlld(1) ! TODO multi-component
-                            ! Momentum
-                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = F_hlld(2)
-                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(2)) = F_hlld(3)
-                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(3)) = F_hlld(4)
-                            ! Magnetic field
-                            if (n == 0) then
-                                flux_rs${XYZ}$_vf(j, k, l, B_idx%beg) = F_hlld(5)
-                                flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1) = F_hlld(6)
-                            else
-                                flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(2) - 1) = F_hlld(5)
-                                flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(3) - 1) = F_hlld(6)
-                            end if
-                            ! Energy
-                            flux_rs${XYZ}$_vf(j, k, l, E_idx) = F_hlld(7)
-                            ! Partial fraction
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = advxb, advxe
-                                flux_rs${XYZ}$_vf(j, k, l, i) = 0._wp ! TODO multi-component (zero for now)
-                            end do
-
-                            flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-        #:endfor
-
-        call s_finalize_riemann_solver(flux_vf, flux_src_vf, flux_gsrc_vf, &
-                                       norm_dir)
-    end subroutine s_hlld_riemann_solver
-
-    !>  The computation of parameters, the allocation of memory,
-        !!      the association of pointers and/or the execution of any
-        !!      other procedures that are necessary to setup the module.
-    impure subroutine s_initialize_riemann_solvers_module
-
-        ! Allocating the variables that will be utilized to formulate the
-        ! left, right, and average states of the Riemann problem, as well
-        ! the Riemann problem solution
-        integer :: i, j
-
-        @:ALLOCATE(Gs_rs(1:num_fluids))
-
-        do i = 1, num_fluids
-            Gs_rs(i) = fluid_pp(i)%G
-        end do
-        $:GPU_UPDATE(device='[Gs_rs]')
-
-        if (viscous) then
-            @:ALLOCATE(Res_gs(1:2, 1:Re_size_max))
-        end if
-
-        if (viscous) then
-            do i = 1, 2
-                do j = 1, Re_size(i)
-                    Res_gs(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
-                end do
-            end do
-            $:GPU_UPDATE(device='[Res_gs,Re_idx,Re_size]')
-        end if
-
-        $:GPU_ENTER_DATA(copyin='[is1,is2,is3,isx,isy,isz]')
-
-        is1%beg = -1; is2%beg = 0; is3%beg = 0
-        is1%end = m; is2%end = n; is3%end = p
-
-        @:ALLOCATE(flux_rsx_vf(is1%beg:is1%end, &
-            is2%beg:is2%end, &
-            is3%beg:is3%end, 1:sys_size))
-        @:ALLOCATE(flux_gsrc_rsx_vf(is1%beg:is1%end, &
-            is2%beg:is2%end, &
-            is3%beg:is3%end, 1:sys_size))
-        @:ALLOCATE(flux_src_rsx_vf(is1%beg:is1%end, &
-            is2%beg:is2%end, &
-            is3%beg:is3%end, advxb:sys_size))
-        @:ALLOCATE(vel_src_rsx_vf(is1%beg:is1%end, &
-            is2%beg:is2%end, &
-            is3%beg:is3%end, 1:num_vels))
-        if (qbmm) then
-            @:ALLOCATE(mom_sp_rsx_vf(is1%beg:is1%end + 1, is2%beg:is2%end, is3%beg:is3%end, 1:4))
-        end if
-
-        if (viscous) then
-            @:ALLOCATE(Re_avg_rsx_vf(is1%beg:is1%end, &
-                is2%beg:is2%end, &
-                is3%beg:is3%end, 1:2))
-        end if
-
-        if (n == 0) return
-
-        is1%beg = -1; is2%beg = 0; is3%beg = 0
-        is1%end = n; is2%end = m; is3%end = p
-
-        @:ALLOCATE(flux_rsy_vf(is1%beg:is1%end, &
-            is2%beg:is2%end, &
-            is3%beg:is3%end, 1:sys_size))
-        @:ALLOCATE(flux_gsrc_rsy_vf(is1%beg:is1%end, &
-            is2%beg:is2%end, &
-            is3%beg:is3%end, 1:sys_size))
-        @:ALLOCATE(flux_src_rsy_vf(is1%beg:is1%end, &
-            is2%beg:is2%end, &
-            is3%beg:is3%end, advxb:sys_size))
-        @:ALLOCATE(vel_src_rsy_vf(is1%beg:is1%end, &
-            is2%beg:is2%end, &
-            is3%beg:is3%end, 1:num_vels))
-
-        if (qbmm) then
-            @:ALLOCATE(mom_sp_rsy_vf(is1%beg:is1%end + 1, is2%beg:is2%end, is3%beg:is3%end, 1:4))
-        end if
-
-        if (viscous) then
-            @:ALLOCATE(Re_avg_rsy_vf(is1%beg:is1%end, &
-                is2%beg:is2%end, &
-                is3%beg:is3%end, 1:2))
-        end if
-
-        if (p == 0) return
-
-        is1%beg = -1; is2%beg = 0; is3%beg = 0
-        is1%end = p; is2%end = n; is3%end = m
-
-        @:ALLOCATE(flux_rsz_vf(is1%beg:is1%end, &
-            is2%beg:is2%end, &
-            is3%beg:is3%end, 1:sys_size))
-        @:ALLOCATE(flux_gsrc_rsz_vf(is1%beg:is1%end, &
-            is2%beg:is2%end, &
-            is3%beg:is3%end, 1:sys_size))
-        @:ALLOCATE(flux_src_rsz_vf(is1%beg:is1%end, &
-            is2%beg:is2%end, &
-            is3%beg:is3%end, advxb:sys_size))
-        @:ALLOCATE(vel_src_rsz_vf(is1%beg:is1%end, &
-            is2%beg:is2%end, &
-            is3%beg:is3%end, 1:num_vels))
-
-        if (qbmm) then
-            @:ALLOCATE(mom_sp_rsz_vf(is1%beg:is1%end + 1, is2%beg:is2%end, is3%beg:is3%end, 1:4))
-        end if
-
-        if (viscous) then
-            @:ALLOCATE(Re_avg_rsz_vf(is1%beg:is1%end, &
-                is2%beg:is2%end, &
-                is3%beg:is3%end, 1:2))
-        end if
-
-    end subroutine s_initialize_riemann_solvers_module
-
-    !>  The purpose of this subroutine is to populate the buffers
-        !!      of the left and right Riemann states variables, depending
-        !!      on the boundary conditions.
-        !!  @param qL_prim_rsx_vf Left WENO-reconstructed cell-boundary values (x-dir)
-        !!  @param qL_prim_rsy_vf Left WENO-reconstructed cell-boundary values (y-dir)
-        !!  @param qL_prim_rsz_vf Left WENO-reconstructed cell-boundary values (z-dir)
-        !!  @param dqL_prim_dx_vf The  left WENO-reconstructed cell-boundary values of the
-        !!      first-order x-dir spatial derivatives
-        !!  @param dqL_prim_dy_vf The  left WENO-reconstructed cell-boundary values of the
-        !!      first-order y-dir spatial derivatives
-        !!  @param dqL_prim_dz_vf The  left WENO-reconstructed cell-boundary values of the
-        !!      first-order z-dir spatial derivatives
-        !!  @param qR_prim_rsx_vf Right WENO-reconstructed cell-boundary values (x-dir)
-        !!  @param qR_prim_rsy_vf Right WENO-reconstructed cell-boundary values (y-dir)
-        !!  @param qR_prim_rsz_vf Right WENO-reconstructed cell-boundary values (z-dir)
-        !!  @param dqR_prim_dx_vf The right WENO-reconstructed cell-boundary values of the
-        !!      first-order x-dir spatial derivatives
-        !!  @param dqR_prim_dy_vf The right WENO-reconstructed cell-boundary values of the
-        !!      first-order y-dir spatial derivatives
-        !!  @param dqR_prim_dz_vf The right WENO-reconstructed cell-boundary values of the
-        !!      first-order z-dir spatial derivatives
-        !!  @param norm_dir Dir. splitting direction
-        !!  @param ix Index bounds in the x-dir
-        !!  @param iy Index bounds in the y-dir
-        !!  @param iz Index bounds in the z-dir
-    subroutine s_populate_riemann_states_variables_buffers( &
-        qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
-        dqL_prim_dy_vf, &
-        dqL_prim_dz_vf, &
-        qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
-        dqR_prim_dy_vf, &
-        dqR_prim_dz_vf, &
-        norm_dir, ix, iy, iz)
-
-        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
-
-        type(scalar_field), &
-            allocatable, dimension(:), &
-            intent(inout) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
-                             dqL_prim_dy_vf, dqR_prim_dy_vf, &
-                             dqL_prim_dz_vf, dqR_prim_dz_vf
-
-        integer, intent(in) :: norm_dir
-        type(int_bounds_info), intent(in) :: ix, iy, iz
-
-        integer :: i, j, k, l !< Generic loop iterator
-
-        if (norm_dir == 1) then
-            is1 = ix; is2 = iy; is3 = iz
-            dir_idx = (/1, 2, 3/); dir_flg = (/1._wp, 0._wp, 0._wp/)
-        elseif (norm_dir == 2) then
-            is1 = iy; is2 = ix; is3 = iz
-            dir_idx = (/2, 1, 3/); dir_flg = (/0._wp, 1._wp, 0._wp/)
-        else
-            is1 = iz; is2 = iy; is3 = ix
-            dir_idx = (/3, 1, 2/); dir_flg = (/0._wp, 0._wp, 1._wp/)
-        end if
-
-        $:GPU_UPDATE(device='[is1,is2,is3]')
-
-        if (elasticity) then
-            if (norm_dir == 1) then
-                dir_idx_tau = (/1, 2, 4/)
-            else if (norm_dir == 2) then
-                dir_idx_tau = (/3, 2, 5/)
-            else
-                dir_idx_tau = (/6, 4, 5/)
-            end if
-        end if
-
-        isx = ix; isy = iy; isz = iz
-        ! for stuff in the same module
-        $:GPU_UPDATE(device='[isx,isy,isz]')
-        ! for stuff in different modules
-        $:GPU_UPDATE(device='[dir_idx,dir_flg,dir_idx_tau]')
-
-        ! Population of Buffers in x-direction
-        if (norm_dir == 1) then
-
-            if (bc_x%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
-                $:GPU_PARALLEL_LOOP(collapse=3)
-                do i = 1, sys_size
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            qL_prim_rsx_vf(-1, k, l, i) = &
-                                qR_prim_rsx_vf(0, k, l, i)
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-
-                if (viscous .or. dummy) then
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do i = momxb, momxe
-                        do l = isz%beg, isz%end
-                            do k = isy%beg, isy%end
-
-                                dqL_prim_dx_vf(i)%sf(-1, k, l) = &
-                                    dqR_prim_dx_vf(i)%sf(0, k, l)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    if (n > 0) then
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do i = momxb, momxe
-                            do l = isz%beg, isz%end
-                                do k = isy%beg, isy%end
-
-                                    dqL_prim_dy_vf(i)%sf(-1, k, l) = &
-                                        dqR_prim_dy_vf(i)%sf(0, k, l)
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        if (p > 0) then
-                            $:GPU_PARALLEL_LOOP(collapse=3)
-                            do i = momxb, momxe
-                                do l = isz%beg, isz%end
-                                    do k = isy%beg, isy%end
-
-                                        dqL_prim_dz_vf(i)%sf(-1, k, l) = &
-                                            dqR_prim_dz_vf(i)%sf(0, k, l)
-                                    end do
-                                end do
-                            end do
-                            $:END_GPU_PARALLEL_LOOP()
-                        end if
-
-                    end if
-
-                end if
-
-            end if
-
-            if (bc_x%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
-
-                $:GPU_PARALLEL_LOOP(collapse=3)
-                do i = 1, sys_size
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            qR_prim_rsx_vf(m + 1, k, l, i) = &
-                                qL_prim_rsx_vf(m, k, l, i)
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-
-                if (viscous .or. dummy) then
-
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do i = momxb, momxe
-                        do l = isz%beg, isz%end
-                            do k = isy%beg, isy%end
-
-                                dqR_prim_dx_vf(i)%sf(m + 1, k, l) = &
-                                    dqL_prim_dx_vf(i)%sf(m, k, l)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    if (n > 0) then
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do i = momxb, momxe
-                            do l = isz%beg, isz%end
-                                do k = isy%beg, isy%end
-
-                                    dqR_prim_dy_vf(i)%sf(m + 1, k, l) = &
-                                        dqL_prim_dy_vf(i)%sf(m, k, l)
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        if (p > 0) then
-                            $:GPU_PARALLEL_LOOP(collapse=3)
-                            do i = momxb, momxe
-                                do l = isz%beg, isz%end
-                                    do k = isy%beg, isy%end
-
-                                        dqR_prim_dz_vf(i)%sf(m + 1, k, l) = &
-                                            dqL_prim_dz_vf(i)%sf(m, k, l)
-                                    end do
-                                end do
-                            end do
-                            $:END_GPU_PARALLEL_LOOP()
-                        end if
-
-                    end if
-
-                end if
-
-            end if
-            ! END: Population of Buffers in x-direction
-
-            ! Population of Buffers in y-direction
-        elseif (norm_dir == 2) then
-
-            if (bc_y%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
-                $:GPU_PARALLEL_LOOP(collapse=3)
-                do i = 1, sys_size
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            qL_prim_rsy_vf(-1, k, l, i) = &
-                                qR_prim_rsy_vf(0, k, l, i)
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-
-                if (viscous .or. dummy) then
-
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do i = momxb, momxe
-                        do l = isz%beg, isz%end
-                            do j = isx%beg, isx%end
-                                dqL_prim_dx_vf(i)%sf(j, -1, l) = &
-                                    dqR_prim_dx_vf(i)%sf(j, 0, l)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do i = momxb, momxe
-                        do l = isz%beg, isz%end
-                            do j = isx%beg, isx%end
-                                dqL_prim_dy_vf(i)%sf(j, -1, l) = &
-                                    dqR_prim_dy_vf(i)%sf(j, 0, l)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    if (p > 0) then
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do i = momxb, momxe
-                            do l = isz%beg, isz%end
-                                do j = isx%beg, isx%end
-                                    dqL_prim_dz_vf(i)%sf(j, -1, l) = &
-                                        dqR_prim_dz_vf(i)%sf(j, 0, l)
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-                    end if
-
-                end if
-
-            end if
-
-            if (bc_y%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
-
-                $:GPU_PARALLEL_LOOP(collapse=3)
-                do i = 1, sys_size
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            qR_prim_rsy_vf(n + 1, k, l, i) = &
-                                qL_prim_rsy_vf(n, k, l, i)
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-
-                if (viscous .or. dummy) then
-
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do i = momxb, momxe
-                        do l = isz%beg, isz%end
-                            do j = isx%beg, isx%end
-                                dqR_prim_dx_vf(i)%sf(j, n + 1, l) = &
-                                    dqL_prim_dx_vf(i)%sf(j, n, l)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do i = momxb, momxe
-                        do l = isz%beg, isz%end
-                            do j = isx%beg, isx%end
-                                dqR_prim_dy_vf(i)%sf(j, n + 1, l) = &
-                                    dqL_prim_dy_vf(i)%sf(j, n, l)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    if (p > 0) then
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do i = momxb, momxe
-                            do l = isz%beg, isz%end
-                                do j = isx%beg, isx%end
-                                    dqR_prim_dz_vf(i)%sf(j, n + 1, l) = &
-                                        dqL_prim_dz_vf(i)%sf(j, n, l)
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-                    end if
-
-                end if
-
-            end if
-            ! END: Population of Buffers in y-direction
-
-            ! Population of Buffers in z-direction
-        else
-
-            if (bc_z%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
-                $:GPU_PARALLEL_LOOP(collapse=3)
-                do i = 1, sys_size
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            qL_prim_rsz_vf(-1, k, l, i) = &
-                                qR_prim_rsz_vf(0, k, l, i)
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-
-                if (viscous .or. dummy) then
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do i = momxb, momxe
-                        do k = isy%beg, isy%end
-                            do j = isx%beg, isx%end
-                                dqL_prim_dx_vf(i)%sf(j, k, -1) = &
-                                    dqR_prim_dx_vf(i)%sf(j, k, 0)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do i = momxb, momxe
-                        do k = isy%beg, isy%end
-                            do j = isx%beg, isx%end
-                                dqL_prim_dy_vf(i)%sf(j, k, -1) = &
-                                    dqR_prim_dy_vf(i)%sf(j, k, 0)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do i = momxb, momxe
-                        do k = isy%beg, isy%end
-                            do j = isx%beg, isx%end
-                                dqL_prim_dz_vf(i)%sf(j, k, -1) = &
-                                    dqR_prim_dz_vf(i)%sf(j, k, 0)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                end if
-
-            end if
-
-            if (bc_z%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
-
-                $:GPU_PARALLEL_LOOP(collapse=3)
-                do i = 1, sys_size
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            qR_prim_rsz_vf(p + 1, k, l, i) = &
-                                qL_prim_rsz_vf(p, k, l, i)
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-
-                if (viscous .or. dummy) then
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do i = momxb, momxe
-                        do k = isy%beg, isy%end
-                            do j = isx%beg, isx%end
-                                dqR_prim_dx_vf(i)%sf(j, k, p + 1) = &
-                                    dqL_prim_dx_vf(i)%sf(j, k, p)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do i = momxb, momxe
-                        do k = isy%beg, isy%end
-                            do j = isx%beg, isx%end
-                                dqR_prim_dy_vf(i)%sf(j, k, p + 1) = &
-                                    dqL_prim_dy_vf(i)%sf(j, k, p)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do i = momxb, momxe
-                        do k = isy%beg, isy%end
-                            do j = isx%beg, isx%end
-                                dqR_prim_dz_vf(i)%sf(j, k, p + 1) = &
-                                    dqL_prim_dz_vf(i)%sf(j, k, p)
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                end if
-
-            end if
-
-        end if
-        ! END: Population of Buffers in z-direction
-
-    end subroutine s_populate_riemann_states_variables_buffers
-
-    !>  The computation of parameters, the allocation of memory,
-        !!      the association of pointers and/or the execution of any
-        !!      other procedures needed to configure the chosen Riemann
-        !!      solver algorithm.
-        !!  @param flux_src_vf Intra-cell fluxes sources
-        !!  @param norm_dir Dir. splitting direction
-    subroutine s_initialize_riemann_solver( &
-        flux_src_vf, &
-        norm_dir)
-
-        type(scalar_field), &
-            dimension(sys_size), &
-            intent(inout) :: flux_src_vf
-
-        integer, intent(in) :: norm_dir
-
-        integer :: i, j, k, l ! Generic loop iterators
-
-        ! Reshaping Inputted Data in x-direction
-
-        if (norm_dir == 1) then
-
-            if (viscous .or. (surface_tension) .or. dummy) then
-
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = momxb, E_idx
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            do j = is1%beg, is1%end
-                                flux_src_vf(i)%sf(j, k, l) = 0._wp
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-            if (chem_params%diffusion) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = E_idx, chemxe
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            do j = is1%beg, is1%end
-                                if (i == E_idx .or. i >= chemxb) then
-                                    flux_src_vf(i)%sf(j, k, l) = 0._wp
-                                end if
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-            if (qbmm) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = 1, 4
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            do j = is1%beg, is1%end + 1
-                                mom_sp_rsx_vf(j, k, l, i) = mom_sp(i)%sf(j, k, l)
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-            ! Reshaping Inputted Data in y-direction
-        elseif (norm_dir == 2) then
-
-            if (viscous .or. (surface_tension) .or. dummy) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = momxb, E_idx
-                    do l = is3%beg, is3%end
-                        do j = is1%beg, is1%end
-                            do k = is2%beg, is2%end
-                                flux_src_vf(i)%sf(k, j, l) = 0._wp
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-            if (chem_params%diffusion) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = E_idx, chemxe
-                    do l = is3%beg, is3%end
-                        do j = is1%beg, is1%end
-                            do k = is2%beg, is2%end
-                                if (i == E_idx .or. i >= chemxb) then
-                                    flux_src_vf(i)%sf(k, j, l) = 0._wp
-                                end if
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-            if (qbmm) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = 1, 4
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            do j = is1%beg, is1%end + 1
-                                mom_sp_rsy_vf(j, k, l, i) = mom_sp(i)%sf(k, j, l)
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-            ! Reshaping Inputted Data in z-direction
-        else
-
-            if (viscous .or. (surface_tension) .or. dummy) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = momxb, E_idx
-                    do j = is1%beg, is1%end
-                        do k = is2%beg, is2%end
-                            do l = is3%beg, is3%end
-                                flux_src_vf(i)%sf(l, k, j) = 0._wp
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-            if (chem_params%diffusion) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = E_idx, chemxe
-                    do j = is1%beg, is1%end
-                        do k = is2%beg, is2%end
-                            do l = is3%beg, is3%end
-                                if (i == E_idx .or. i >= chemxb) then
-                                    flux_src_vf(i)%sf(l, k, j) = 0._wp
-                                end if
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-            if (qbmm) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = 1, 4
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            do j = is1%beg, is1%end + 1
-                                mom_sp_rsz_vf(j, k, l, i) = mom_sp(i)%sf(l, k, j)
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-        end if
-
-    end subroutine s_initialize_riemann_solver
-
-    !> @brief Computes cylindrical viscous source flux contributions for momentum and energy.
-        !! Calculates Cartesian components of the stress tensor using averaged velocity derivatives
-        !! and cylindrical geometric factors, then updates `flux_src_vf`.
-        !! Assumes x-dir is axial (z_cyl), y-dir is radial (r_cyl), z-dir is azimuthal (theta_cyl for derivatives).
-        !! @param[in] velL_vf Left boundary velocity (\f$v_x, v_y, v_z\f$) (num_dims scalar_field).
-        !! @param[in] dvelL_dx_vf Left boundary \f$\partial v_i/\partial x\f$ (num_dims scalar_field).
-        !! @param[in] dvelL_dy_vf Left boundary \f$\partial v_i/\partial y\f$ (num_dims scalar_field).
-        !! @param[in] dvelL_dz_vf Left boundary \f$\partial v_i/\partial z\f$ (num_dims scalar_field).
-        !! @param[in] velR_vf Right boundary velocity (\f$v_x, v_y, v_z\f$) (num_dims scalar_field).
-        !! @param[in] dvelR_dx_vf Right boundary \f$\partial v_i/\partial x\f$ (num_dims scalar_field).
-        !! @param[in] dvelR_dy_vf Right boundary \f$\partial v_i/\partial y\f$ (num_dims scalar_field).
-        !! @param[in] dvelR_dz_vf Right boundary \f$\partial v_i/\partial z\f$ (num_dims scalar_field).
-        !! @param[inout] flux_src_vf Intercell source flux array to update (sys_size scalar_field).
-        !! @param[in] norm_dir Interface normal direction (1=x-face, 2=y-face, 3=z-face).
-        !! @param[in] ix Global X-direction loop bounds (int_bounds_info).
-        !! @param[in] iy Global Y-direction loop bounds (int_bounds_info).
-        !! @param[in] iz Global Z-direction loop bounds (int_bounds_info).
-    subroutine s_compute_cylindrical_viscous_source_flux(velL_vf, &
-                                                         dvelL_dx_vf, dvelL_dy_vf, dvelL_dz_vf, &
-                                                         velR_vf, &
-                                                         dvelR_dx_vf, dvelR_dy_vf, dvelR_dz_vf, &
-                                                         flux_src_vf, norm_dir, ix, iy, iz)
-
-        type(scalar_field), dimension(num_dims), intent(in) :: velL_vf, velR_vf
-        type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dx_vf, dvelR_dx_vf
-        type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dy_vf, dvelR_dy_vf
-        type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dz_vf, dvelR_dz_vf
-        type(scalar_field), dimension(sys_size), intent(inout) :: flux_src_vf
-        integer, intent(in) :: norm_dir
-        type(int_bounds_info), intent(in) :: ix, iy, iz
-
-        ! Local variables
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3) :: avg_v_int       !!< Averaged interface velocity (\f$v_x, v_y, v_z\f$) (grid directions).
-            real(wp), dimension(3) :: avg_dvdx_int    !!< Averaged interface \f$\partial v_i/\partial x\f$ (grid dir 1).
-            real(wp), dimension(3) :: avg_dvdy_int    !!< Averaged interface \f$\partial v_i/\partial y\f$ (grid dir 2).
-            real(wp), dimension(3) :: avg_dvdz_int    !!< Averaged interface \f$\partial v_i/\partial z\f$ (grid dir 3).
-            real(wp), dimension(3) :: vel_src_int !!< Interface velocity (\f$v_1,v_2,v_3\f$) (grid directions) for viscous work.
-            real(wp), dimension(3) :: stress_vector_shear !!< Shear stress vector (\f$\sigma_{N1}, \sigma_{N2}, \sigma_{N3}\f$) on N-face (grid directions).
-        #:else
-            real(wp), dimension(num_dims) :: avg_v_int       !!< Averaged interface velocity (\f$v_x, v_y, v_z\f$) (grid directions).
-            real(wp), dimension(num_dims) :: avg_dvdx_int    !!< Averaged interface \f$\partial v_i/\partial x\f$ (grid dir 1).
-            real(wp), dimension(num_dims) :: avg_dvdy_int    !!< Averaged interface \f$\partial v_i/\partial y\f$ (grid dir 2).
-            real(wp), dimension(num_dims) :: avg_dvdz_int    !!< Averaged interface \f$\partial v_i/\partial z\f$ (grid dir 3).
-            real(wp), dimension(num_dims) :: vel_src_int !!< Interface velocity (\f$v_1,v_2,v_3\f$) (grid directions) for viscous work.
-            real(wp), dimension(num_dims) :: stress_vector_shear !!< Shear stress vector (\f$\sigma_{N1}, \sigma_{N2}, \sigma_{N3}\f$) on N-face (grid directions).
-        #:endif
-        real(wp) :: stress_normal_bulk  !!< Normal bulk stress component \f$\sigma_{NN}\f$ on N-face.
-
-        real(wp) :: Re_s, Re_b        !!< Effective interface shear and bulk Reynolds numbers.
-        real(wp) :: r_eff             !!< Effective radius at interface for cylindrical terms.
-        real(wp) :: div_v_term_const  !!< Common term \f$-(2/3)(\nabla \cdot \mathbf{v}) / \text{Re}_s\f$ for shear stress diagonal.
-        real(wp) :: divergence_cyl    !!< Full divergence \f$\nabla \cdot \mathbf{v}\f$ in cylindrical coordinates.
-
-        integer :: j, k, l           !!< Loop iterators for \f$x, y, z\f$ grid directions.
-        integer :: i_vel             !!< Loop iterator for velocity components.
-        integer :: idx_rp(3)         !!< Indices \f$(j,k,l)\f$ of 'right' point for averaging.
-
-        $:GPU_PARALLEL_LOOP(collapse=3, private='[idx_rp, avg_v_int, avg_dvdx_int, avg_dvdy_int, avg_dvdz_int, Re_s, Re_b, vel_src_int, r_eff, divergence_cyl, stress_vector_shear, stress_normal_bulk, div_v_term_const]')
-        do l = iz%beg, iz%end
-            do k = iy%beg, iy%end
-                do j = ix%beg, ix%end
-
-                    ! Determine indices for the 'right' state for averaging across the interface
-                    idx_rp = [j, k, l]
-                    idx_rp(norm_dir) = idx_rp(norm_dir) + 1
-
-                    ! Average velocities and their derivatives at the interface
-                    ! For cylindrical: x-dir ~ axial (z_cyl), y-dir ~ radial (r_cyl), z-dir ~ azimuthal (theta_cyl)
-                    $:GPU_LOOP(parallelism='[seq]')
-                    do i_vel = 1, num_dims
-                        avg_v_int(i_vel) = 0.5_wp*(velL_vf(i_vel)%sf(j, k, l) + velR_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3)))
-
-                        avg_dvdx_int(i_vel) = 0.5_wp*(dvelL_dx_vf(i_vel)%sf(j, k, l) + &
-                                                      dvelR_dx_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3)))
-                        if (num_dims > 1) then
-                            avg_dvdy_int(i_vel) = 0.5_wp*(dvelL_dy_vf(i_vel)%sf(j, k, l) + &
-                                                          dvelR_dy_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3)))
-                        else
-                            avg_dvdy_int(i_vel) = 0.0_wp
-                        end if
-                        if (num_dims > 2) then
-                            avg_dvdz_int(i_vel) = 0.5_wp*(dvelL_dz_vf(i_vel)%sf(j, k, l) + &
-                                                          dvelR_dz_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3)))
-                        else
-                            avg_dvdz_int(i_vel) = 0.0_wp
-                        end if
-                    end do
-
-                    ! Get Re numbers and interface velocity for viscous work
-                    select case (norm_dir)
-                    case (1) ! x-face (axial face in z_cyl direction)
-                        Re_s = Re_avg_rsx_vf(j, k, l, 1)
-                        Re_b = Re_avg_rsx_vf(j, k, l, 2)
-                        vel_src_int = vel_src_rsx_vf(j, k, l, 1:num_dims)
-                        r_eff = y_cc(k)
-                    case (2) ! y-face (radial face in r_cyl direction)
-                        Re_s = Re_avg_rsy_vf(k, j, l, 1)
-                        Re_b = Re_avg_rsy_vf(k, j, l, 2)
-                        vel_src_int = vel_src_rsy_vf(k, j, l, 1:num_dims)
-                        r_eff = y_cb(k)
-                    case (3) ! z-face (azimuthal face in theta_cyl direction)
-                        Re_s = Re_avg_rsz_vf(l, k, j, 1)
-                        Re_b = Re_avg_rsz_vf(l, k, j, 2)
-                        vel_src_int = vel_src_rsz_vf(l, k, j, 1:num_dims)
-                        r_eff = y_cc(k)
-                    end select
-
-                    ! Divergence in cylindrical coordinates (vx=vz_cyl, vy=vr_cyl, vz=vtheta_cyl)
-                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                        divergence_cyl = avg_dvdx_int(1) + avg_dvdy_int(2) + avg_v_int(2)/r_eff
-                        if (num_dims > 2) then
-                            #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                divergence_cyl = divergence_cyl + avg_dvdz_int(3)/r_eff
-                            #:endif
-                        end if
-                    #:endif
-
-                    stress_vector_shear = 0.0_wp
-                    stress_normal_bulk = 0.0_wp
-
-                    if (shear_stress) then
-                        div_v_term_const = -(2.0_wp/3.0_wp)*divergence_cyl/Re_s
-
-                        select case (norm_dir)
-                        case (1) ! X-face (axial normal, z_cyl)
-                            stress_vector_shear(1) = (2.0_wp*avg_dvdx_int(1))/Re_s + div_v_term_const
-                            if (num_dims > 1) then
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                    stress_vector_shear(2) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s
-                                #:endif
-                            end if
-                            if (num_dims > 2) then
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                    stress_vector_shear(3) = (avg_dvdz_int(1)/r_eff + avg_dvdx_int(3))/Re_s
-                                #:endif
-                            end if
-                        case (2) ! Y-face (radial normal, r_cyl)
-                            if (num_dims > 1) then
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                    stress_vector_shear(1) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s
-                                    stress_vector_shear(2) = (2.0_wp*avg_dvdy_int(2))/Re_s + div_v_term_const
-                                    if (num_dims > 2) then
-                                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                            stress_vector_shear(3) = (avg_dvdz_int(2)/r_eff - avg_v_int(3)/r_eff + avg_dvdy_int(3))/Re_s
-                                        #:endif
-                                    end if
-                                #:endif
-                            else
-                                stress_vector_shear(1) = (2.0_wp*avg_dvdx_int(1))/Re_s + div_v_term_const
-                            end if
-                        case (3) ! Z-face (azimuthal normal, theta_cyl)
-                            if (num_dims > 2) then
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                    stress_vector_shear(1) = (avg_dvdz_int(1)/r_eff + avg_dvdx_int(3))/Re_s
-                                    stress_vector_shear(2) = (avg_dvdz_int(2)/r_eff - avg_v_int(3)/r_eff + avg_dvdy_int(3))/Re_s
-                                    stress_vector_shear(3) = (2.0_wp*(avg_dvdz_int(3)/r_eff + avg_v_int(2)/r_eff))/Re_s + div_v_term_const
-                                #:endif
-                            end if
-                        end select
-
-                        $:GPU_LOOP(parallelism='[seq]')
-                        do i_vel = 1, num_dims
-                            flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) = flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) - stress_vector_shear(i_vel)
-                            flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - vel_src_int(i_vel)*stress_vector_shear(i_vel)
-                        end do
-                    end if
-
-                    if (bulk_stress) then
-                        stress_normal_bulk = divergence_cyl/Re_b
-
-                        flux_src_vf(momxb + norm_dir - 1)%sf(j, k, l) = flux_src_vf(momxb + norm_dir - 1)%sf(j, k, l) - stress_normal_bulk
-                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - vel_src_int(norm_dir)*stress_normal_bulk
-                    end if
-
-                end do
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-    end subroutine s_compute_cylindrical_viscous_source_flux
-
-    !> @brief Computes Cartesian viscous source flux contributions for momentum and energy.
-    !! Calculates averaged velocity gradients, gets Re and interface velocities,
-    !! calls helpers for shear/bulk stress, then updates `flux_src_vf`.
-    !! @param[in] dvelL_dx_vf Left boundary d(vel)/dx (num_dims scalar_field).
-    !! @param[in] dvelL_dy_vf Left boundary d(vel)/dy (num_dims scalar_field).
-    !! @param[in] dvelL_dz_vf Left boundary d(vel)/dz (num_dims scalar_field).
-    !! @param[in] dvelR_dx_vf Right boundary d(vel)/dx (num_dims scalar_field).
-    !! @param[in] dvelR_dy_vf Right boundary d(vel)/dy (num_dims scalar_field).
-    !! @param[in] dvelR_dz_vf Right boundary d(vel)/dz (num_dims scalar_field).
-    !! @param[inout] flux_src_vf Intercell source flux array to update (sys_size scalar_field).
-    !! @param[in] norm_dir Interface normal direction (1=x, 2=y, 3=z).
-    subroutine s_compute_cartesian_viscous_source_flux(dvelL_dx_vf, &
-                                                       dvelL_dy_vf, &
-                                                       dvelL_dz_vf, &
-                                                       dvelR_dx_vf, &
-                                                       dvelR_dy_vf, &
-                                                       dvelR_dz_vf, &
-                                                       flux_src_vf, &
-                                                       norm_dir)
-
-        ! Arguments
-        type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dx_vf, dvelR_dx_vf
-        type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dy_vf, dvelR_dy_vf
-        type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dz_vf, dvelR_dz_vf
-        type(scalar_field), dimension(sys_size), intent(inout) :: flux_src_vf
-        integer, intent(in) :: norm_dir
-
-        ! Local variables
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3, 3) :: vel_grad_avg        !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
-            real(wp), dimension(3, 3) :: current_tau_shear   !< Current shear stress tensor.
-            real(wp), dimension(3, 3) :: current_tau_bulk    !< Current bulk stress tensor.
-            real(wp), dimension(3) :: vel_src_at_interface         !< Interface velocities (u,v,w) for viscous work.
-        #:else
-            real(wp), dimension(num_dims, num_dims) :: vel_grad_avg        !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
-            real(wp), dimension(num_dims, num_dims) :: current_tau_shear   !< Current shear stress tensor.
-            real(wp), dimension(num_dims, num_dims) :: current_tau_bulk    !< Current bulk stress tensor.
-            real(wp), dimension(num_dims) :: vel_src_at_interface         !< Interface velocities (u,v,w) for viscous work.
-        #:endif
-        integer, dimension(3) :: idx_right_phys                     !< Physical (j,k,l) indices for right state.
-
-        real(wp) :: Re_shear !< Interface shear Reynolds number.
-        real(wp) :: Re_bulk  !< Interface bulk Reynolds number.
-
-        integer :: j_loop         !< Physical x-index loop iterator.
-        integer :: k_loop         !< Physical y-index loop iterator.
-        integer :: l_loop         !< Physical z-index loop iterator.
-        integer :: i_dim          !< Generic dimension/component iterator.
-        integer :: vel_comp_idx   !< Velocity component iterator (1=u, 2=v, 3=w).
-
-        real(wp) :: divergence_v   !< Velocity divergence at interface.
-
-        $:GPU_PARALLEL_LOOP(collapse=3, private='[idx_right_phys, vel_grad_avg, current_tau_shear, current_tau_bulk, vel_src_at_interface, Re_shear, Re_bulk, divergence_v, i_dim, vel_comp_idx]')
-        do l_loop = isz%beg, isz%end
-            do k_loop = isy%beg, isy%end
-                do j_loop = isx%beg, isx%end
-
-                    idx_right_phys(1) = j_loop
-                    idx_right_phys(2) = k_loop
-                    idx_right_phys(3) = l_loop
-                    idx_right_phys(norm_dir) = idx_right_phys(norm_dir) + 1
-
-                    vel_grad_avg = 0.0_wp
-                    do vel_comp_idx = 1, num_dims
-                        vel_grad_avg(vel_comp_idx, 1) = 0.5_wp*(dvelL_dx_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + &
-                                                                dvelR_dx_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3)))
-                        if (num_dims > 1) then
-                            #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                vel_grad_avg(vel_comp_idx, 2) = 0.5_wp*(dvelL_dy_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + &
-                                                                        dvelR_dy_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3)))
-                            #:endif
-                        end if
-                        if (num_dims > 2) then
-                            #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                vel_grad_avg(vel_comp_idx, 3) = 0.5_wp*(dvelL_dz_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + &
-                                                                        dvelR_dz_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3)))
-                            #:endif
-                        end if
-                    end do
-
-                    divergence_v = 0.0_wp
-                    do i_dim = 1, num_dims
-                        divergence_v = divergence_v + vel_grad_avg(i_dim, i_dim)
-                    end do
-
-                    vel_src_at_interface = 0.0_wp
-                    if (norm_dir == 1) then
-                        Re_shear = Re_avg_rsx_vf(j_loop, k_loop, l_loop, 1)
-                        Re_bulk = Re_avg_rsx_vf(j_loop, k_loop, l_loop, 2)
-                        do i_dim = 1, num_dims
-                            vel_src_at_interface(i_dim) = vel_src_rsx_vf(j_loop, k_loop, l_loop, i_dim)
-                        end do
-                    else if (norm_dir == 2) then
-                        Re_shear = Re_avg_rsy_vf(k_loop, j_loop, l_loop, 1)
-                        Re_bulk = Re_avg_rsy_vf(k_loop, j_loop, l_loop, 2)
-                        do i_dim = 1, num_dims
-                            vel_src_at_interface(i_dim) = vel_src_rsy_vf(k_loop, j_loop, l_loop, i_dim)
-                        end do
-                    else
-                        Re_shear = Re_avg_rsz_vf(l_loop, k_loop, j_loop, 1)
-                        Re_bulk = Re_avg_rsz_vf(l_loop, k_loop, j_loop, 2)
-                        do i_dim = 1, num_dims
-                            vel_src_at_interface(i_dim) = vel_src_rsz_vf(l_loop, k_loop, j_loop, i_dim)
-                        end do
-                    end if
-
-                    if (shear_stress) then
-                        ! current_tau_shear = 0.0_wp
-                        call s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, current_tau_shear)
-
-                        do i_dim = 1, num_dims
-                            flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) = &
-                                flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) - current_tau_shear(norm_dir, i_dim)
-
-                            flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) = &
-                                flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) - &
-                                vel_src_at_interface(i_dim)*current_tau_shear(norm_dir, i_dim)
-                        end do
-                    end if
-
-                    if (bulk_stress) then
-                        ! current_tau_bulk = 0.0_wp
-                        call s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, current_tau_bulk)
-
-                        do i_dim = 1, num_dims
-                            flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) = &
-                                flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) - current_tau_bulk(norm_dir, i_dim)
-
-                            flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) = &
-                                flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) - &
-                                vel_src_at_interface(i_dim)*current_tau_bulk(norm_dir, i_dim)
-                        end do
-                    end if
-
-                end do
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-    end subroutine s_compute_cartesian_viscous_source_flux
-
-    !> @brief Calculates shear stress tensor components.
-    !! tau_ij_shear = ( (dui/dxj + duj/dxi) - (2/3)*(div_v)*delta_ij ) / Re_shear
-    !! @param[in] vel_grad_avg Averaged velocity gradient tensor (d(vel_i)/d(coord_j)).
-    !! @param[in] Re_shear Shear Reynolds number.
-    !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz).
-    !! @param[out] tau_shear_out Calculated shear stress tensor (stress on i-face, j-direction).
-    subroutine s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, tau_shear_out)
-        $:GPU_ROUTINE(parallelism='[seq]')
-
-        ! Arguments
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3, 3), intent(in) :: vel_grad_avg
-            real(wp), dimension(3, 3), intent(out) :: tau_shear_out
-        #:else
-            real(wp), dimension(num_dims, num_dims), intent(in) :: vel_grad_avg
-            real(wp), dimension(num_dims, num_dims), intent(out) :: tau_shear_out
-        #:endif
-        real(wp), intent(in) :: Re_shear
-        real(wp), intent(in) :: divergence_v
-
-        ! Local variables
-        integer :: i_dim !< Loop iterator for face normal.
-        integer :: j_dim !< Loop iterator for force component direction.
-
-        tau_shear_out = 0.0_wp
-
-        do i_dim = 1, num_dims
-            do j_dim = 1, num_dims
-                tau_shear_out(i_dim, j_dim) = (vel_grad_avg(j_dim, i_dim) + vel_grad_avg(i_dim, j_dim))/Re_shear
-                if (i_dim == j_dim) then
-                    tau_shear_out(i_dim, j_dim) = tau_shear_out(i_dim, j_dim) - &
-                                                  (2.0_wp/3.0_wp)*divergence_v/Re_shear
-                end if
-            end do
-        end do
-
-    end subroutine s_calculate_shear_stress_tensor
-
-    !> @brief Calculates bulk stress tensor components (diagonal only).
-    !! tau_ii_bulk = (div_v) / Re_bulk. Off-diagonals are zero.
-    !! @param[in] Re_bulk Bulk Reynolds number.
-    !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz).
-    !! @param[out] tau_bulk_out Calculated bulk stress tensor (stress on i-face, i-direction).
-    subroutine s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, tau_bulk_out)
-        $:GPU_ROUTINE(parallelism='[seq]')
-
-        ! Arguments
-        real(wp), intent(in) :: Re_bulk
-        real(wp), intent(in) :: divergence_v
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3, 3), intent(out) :: tau_bulk_out
-        #:else
-            real(wp), dimension(num_dims, num_dims), intent(out) :: tau_bulk_out
-        #:endif
-
-        ! Local variables
-        integer :: i_dim !< Loop iterator for diagonal components.
-
-        tau_bulk_out = 0.0_wp
-
-        do i_dim = 1, num_dims
-            tau_bulk_out(i_dim, i_dim) = divergence_v/Re_bulk
-        end do
-
-    end subroutine s_calculate_bulk_stress_tensor
-
-    !>  Deallocation and/or disassociation procedures that are
-        !!      needed to finalize the selected Riemann problem solver
-        !!  @param flux_vf       Intercell fluxes
-        !!  @param flux_src_vf   Intercell source fluxes
-        !!  @param flux_gsrc_vf  Intercell geometric source fluxes
-        !!  @param norm_dir Dimensional splitting coordinate direction
-    subroutine s_finalize_riemann_solver(flux_vf, flux_src_vf, &
-                                         flux_gsrc_vf, &
-                                         norm_dir)
-
-        type(scalar_field), &
-            dimension(sys_size), &
-            intent(inout) :: flux_vf, flux_src_vf, flux_gsrc_vf
-
-        integer, intent(in) :: norm_dir
-
-        integer :: i, j, k, l !< Generic loop iterators
-
-        ! Reshaping Outputted Data in y-direction
-        if (norm_dir == 2) then
-            $:GPU_PARALLEL_LOOP(collapse=4)
-            do i = 1, sys_size
-                do l = is3%beg, is3%end
-                    do j = is1%beg, is1%end
-                        do k = is2%beg, is2%end
-                            flux_vf(i)%sf(k, j, l) = &
-                                flux_rsy_vf(j, k, l, i)
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-            if (cyl_coord) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = 1, sys_size
-                    do l = is3%beg, is3%end
-                        do j = is1%beg, is1%end
-                            do k = is2%beg, is2%end
-                                flux_gsrc_vf(i)%sf(k, j, l) = &
-                                    flux_gsrc_rsy_vf(j, k, l, i)
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do l = is3%beg, is3%end
-                do j = is1%beg, is1%end
-                    do k = is2%beg, is2%end
-                        flux_src_vf(advxb)%sf(k, j, l) = &
-                            flux_src_rsy_vf(j, k, l, advxb)
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-            if (riemann_solver == 1 .or. riemann_solver == 4) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = advxb + 1, advxe
-                    do l = is3%beg, is3%end
-                        do j = is1%beg, is1%end
-                            do k = is2%beg, is2%end
-                                flux_src_vf(i)%sf(k, j, l) = &
-                                    flux_src_rsy_vf(j, k, l, i)
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-
-            end if
-            ! Reshaping Outputted Data in z-direction
-        elseif (norm_dir == 3) then
-            $:GPU_PARALLEL_LOOP(collapse=4)
-            do i = 1, sys_size
-                do j = is1%beg, is1%end
-                    do k = is2%beg, is2%end
-                        do l = is3%beg, is3%end
-
-                            flux_vf(i)%sf(l, k, j) = &
-                                flux_rsz_vf(j, k, l, i)
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-            if (grid_geometry == 3) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = 1, sys_size
-                    do j = is1%beg, is1%end
-                        do k = is2%beg, is2%end
-                            do l = is3%beg, is3%end
-
-                                flux_gsrc_vf(i)%sf(l, k, j) = &
-                                    flux_gsrc_rsz_vf(j, k, l, i)
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do j = is1%beg, is1%end
-                do k = is2%beg, is2%end
-                    do l = is3%beg, is3%end
-                        flux_src_vf(advxb)%sf(l, k, j) = &
-                            flux_src_rsz_vf(j, k, l, advxb)
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-            if (riemann_solver == 1 .or. riemann_solver == 4) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = advxb + 1, advxe
-                    do j = is1%beg, is1%end
-                        do k = is2%beg, is2%end
-                            do l = is3%beg, is3%end
-                                flux_src_vf(i)%sf(l, k, j) = &
-                                    flux_src_rsz_vf(j, k, l, i)
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-
-            end if
-        elseif (norm_dir == 1) then
-            $:GPU_PARALLEL_LOOP(collapse=4)
-            do i = 1, sys_size
-                do l = is3%beg, is3%end
-                    do k = is2%beg, is2%end
-                        do j = is1%beg, is1%end
-                            flux_vf(i)%sf(j, k, l) = &
-                                flux_rsx_vf(j, k, l, i)
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do l = is3%beg, is3%end
-                do k = is2%beg, is2%end
-                    do j = is1%beg, is1%end
-                        flux_src_vf(advxb)%sf(j, k, l) = &
-                            flux_src_rsx_vf(j, k, l, advxb)
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-            if (riemann_solver == 1 .or. riemann_solver == 4) then
-                $:GPU_PARALLEL_LOOP(collapse=4)
-                do i = advxb + 1, advxe
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            do j = is1%beg, is1%end
-                                flux_src_vf(i)%sf(j, k, l) = &
-                                    flux_src_rsx_vf(j, k, l, i)
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-        end if
-
-    end subroutine s_finalize_riemann_solver
-
-    !> Module deallocation and/or disassociation procedures
-    impure subroutine s_finalize_riemann_solvers_module
-
-        if (viscous) then
-            @:DEALLOCATE(Re_avg_rsx_vf)
-        end if
-        @:DEALLOCATE(vel_src_rsx_vf)
-        @:DEALLOCATE(flux_rsx_vf)
-        @:DEALLOCATE(flux_src_rsx_vf)
-        @:DEALLOCATE(flux_gsrc_rsx_vf)
-        if (qbmm) then
-            @:DEALLOCATE(mom_sp_rsx_vf)
-        end if
-
-        if (n == 0) return
-
-        if (viscous) then
-            @:DEALLOCATE(Re_avg_rsy_vf)
-        end if
-        @:DEALLOCATE(vel_src_rsy_vf)
-        @:DEALLOCATE(flux_rsy_vf)
-        @:DEALLOCATE(flux_src_rsy_vf)
-        @:DEALLOCATE(flux_gsrc_rsy_vf)
-        if (qbmm) then
-            @:DEALLOCATE(mom_sp_rsy_vf)
-        end if
-
-        if (p == 0) return
-
-        if (viscous) then
-            @:DEALLOCATE(Re_avg_rsz_vf)
-        end if
-        @:DEALLOCATE(vel_src_rsz_vf)
-        @:DEALLOCATE(flux_rsz_vf)
-        @:DEALLOCATE(flux_src_rsz_vf)
-        @:DEALLOCATE(flux_gsrc_rsz_vf)
-        if (qbmm) then
-            @:DEALLOCATE(mom_sp_rsz_vf)
-        end if
-
-    end subroutine s_finalize_riemann_solvers_module
-
-end module m_riemann_solvers
+!>
+!! @file
+!! @brief Contains module m_riemann_solvers
+
+!> @brief Approximate and exact Riemann solvers (HLL, HLLC, HLLD, exact) for the multicomponent Navier--Stokes equations
+
+#:include 'case.fpp'
+#:include 'macros.fpp'
+#:include 'inline_riemann.fpp'
+
+module m_riemann_solvers
+
+    use m_derived_types        !< Definitions of the derived types
+
+    use m_global_parameters    !< Definitions of the global parameters
+
+    use m_mpi_proxy            !< Message passing interface (MPI) module proxy
+
+    use m_variables_conversion !< State variables type conversion procedures
+
+    use m_bubbles              !< To get the bubble wall pressure function
+
+    use m_bubbles_EE
+
+    use m_surface_tension      !< To get the capillary fluxes
+
+    use m_helper_basic         !< Functions to compare floating point numbers
+
+    use m_chemistry
+
+    use m_re_visc              !< Dynamic Re_visc (Newtonian/non-Newtonian)
+
+    use m_thermochem, only: &
+        gas_constant, get_mixture_molecular_weight, &
+        get_mixture_specific_heat_cv_mass, get_mixture_energy_mass, &
+        get_species_specific_heats_r, get_species_enthalpies_rt, &
+        get_mixture_specific_heat_cp_mass
+
+    #:if USING_AMD
+        use m_chemistry, only: molecular_weights_nonparameter
+    #:endif
+
+    implicit none
+
+    private; public :: s_initialize_riemann_solvers_module, &
+ s_riemann_solver, &
+ s_hll_riemann_solver, &
+ s_hllc_riemann_solver, &
+ s_hlld_riemann_solver, &
+ s_lf_riemann_solver, &
+ s_finalize_riemann_solvers_module
+
+    !> The cell-boundary values of the fluxes (src - source) that are computed
+    !! through the chosen Riemann problem solver, and the direct evaluation of
+    !! source terms, by using the left and right states given in qK_prim_rs_vf,
+    !! dqK_prim_ds_vf where ds = dx, dy or dz.
+    !> @{
+
+    real(wp), allocatable, dimension(:, :, :, :) :: flux_rsx_vf, flux_src_rsx_vf
+    real(wp), allocatable, dimension(:, :, :, :) :: flux_rsy_vf, flux_src_rsy_vf
+    real(wp), allocatable, dimension(:, :, :, :) :: flux_rsz_vf, flux_src_rsz_vf
+    $:GPU_DECLARE(create='[flux_rsx_vf,flux_src_rsx_vf,flux_rsy_vf,flux_src_rsy_vf,flux_rsz_vf,flux_src_rsz_vf]')
+    !> @}
+
+    !> The cell-boundary values of the geometrical source flux that are computed
+    !! through the chosen Riemann problem solver by using the left and right
+    !! states given in qK_prim_rs_vf. Currently 2D axisymmetric for inviscid only.
+    !> @{
+
+    real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsx_vf !<
+    real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsy_vf !<
+    real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsz_vf !<
+    $:GPU_DECLARE(create='[flux_gsrc_rsx_vf,flux_gsrc_rsy_vf,flux_gsrc_rsz_vf]')
+    !> @}
+
+    ! The cell-boundary values of the velocity. vel_src_rs_vf is determined as
+    ! part of Riemann problem solution and is used to evaluate the source flux.
+
+    real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsx_vf
+    real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsy_vf
+    real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsz_vf
+    $:GPU_DECLARE(create='[vel_src_rsx_vf,vel_src_rsy_vf,vel_src_rsz_vf]')
+
+    real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsx_vf
+    real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsy_vf
+    real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsz_vf
+    $:GPU_DECLARE(create='[mom_sp_rsx_vf,mom_sp_rsy_vf,mom_sp_rsz_vf]')
+
+    real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsx_vf
+    real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsy_vf
+    real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsz_vf
+    $:GPU_DECLARE(create='[Re_avg_rsx_vf,Re_avg_rsy_vf,Re_avg_rsz_vf]')
+
+    !> @name Indical bounds in the s1-, s2- and s3-directions
+    !> @{
+    type(int_bounds_info) :: is1, is2, is3
+    type(int_bounds_info) :: isx, isy, isz
+    !> @}
+
+    $:GPU_DECLARE(create='[is1,is2,is3,isx,isy,isz]')
+
+    real(wp), allocatable, dimension(:) :: Gs_rs
+    $:GPU_DECLARE(create='[Gs_rs]')
+
+    ! Note: Static Res_gs array removed - s_compute_re_visc handles
+    ! both Newtonian and non-Newtonian cases dynamically
+
+contains
+
+    !> Dispatch to the subroutines that are utilized to compute the
+        !! Riemann problem solution. For additional information please reference:
+        !!                        1) s_hll_riemann_solver
+        !!                        2) s_hllc_riemann_solver
+        !!                        3) s_exact_riemann_solver
+        !!                        4) s_hlld_riemann_solver
+        !!  @param qL_prim_rsx_vf Left WENO-reconstructed cell-boundary values (x-dir)
+        !!  @param qL_prim_rsy_vf Left WENO-reconstructed cell-boundary values (y-dir)
+        !!  @param qL_prim_rsz_vf Left WENO-reconstructed cell-boundary values (z-dir)
+        !!  @param dqL_prim_dx_vf The  left WENO-reconstructed cell-boundary values of the
+        !!      first-order x-dir spatial derivatives
+        !!  @param dqL_prim_dy_vf The  left WENO-reconstructed cell-boundary values of the
+        !!      first-order y-dir spatial derivatives
+        !!  @param dqL_prim_dz_vf The  left WENO-reconstructed cell-boundary values of the
+        !!      first-order z-dir spatial derivatives
+        !!  @param qL_prim_vf The  left WENO-reconstructed cell-boundary values of the
+        !!      cell-average primitive variables
+        !!  @param qR_prim_rsx_vf Right WENO-reconstructed cell-boundary values (x-dir)
+        !!  @param qR_prim_rsy_vf Right WENO-reconstructed cell-boundary values (y-dir)
+        !!  @param qR_prim_rsz_vf Right WENO-reconstructed cell-boundary values (z-dir)
+        !!  @param dqR_prim_dx_vf The right WENO-reconstructed cell-boundary values of the
+        !!      first-order x-dir spatial derivatives
+        !!  @param dqR_prim_dy_vf The right WENO-reconstructed cell-boundary values of the
+        !!      first-order y-dir spatial derivatives
+        !!  @param dqR_prim_dz_vf The right WENO-reconstructed cell-boundary values of the
+        !!      first-order z-dir spatial derivatives
+        !!  @param qR_prim_vf The right WENO-reconstructed cell-boundary values of the
+        !!      cell-average primitive variables
+        !!  @param q_prim_vf Cell-averaged primitive variables
+        !!  @param flux_vf Intra-cell fluxes
+        !!  @param flux_src_vf Intra-cell fluxes sources
+        !!  @param flux_gsrc_vf Intra-cell geometric fluxes sources
+        !!  @param norm_dir Dir. splitting direction
+        !!  @param ix Index bounds in the x-dir
+        !!  @param iy Index bounds in the y-dir
+        !!  @param iz Index bounds in the z-dir
+    subroutine s_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
+                                dqL_prim_dy_vf, &
+                                dqL_prim_dz_vf, &
+                                qL_prim_vf, &
+                                qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
+                                dqR_prim_dy_vf, &
+                                dqR_prim_dz_vf, &
+                                qR_prim_vf, &
+                                q_prim_vf, &
+                                flux_vf, flux_src_vf, &
+                                flux_gsrc_vf, &
+                                norm_dir, ix, iy, iz)
+
+        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(INOUT) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
+        type(scalar_field), dimension(sys_size), intent(IN) :: q_prim_vf
+
+        type(scalar_field), allocatable, dimension(:), intent(INOUT) :: qL_prim_vf, qR_prim_vf
+
+        type(scalar_field), &
+            allocatable, dimension(:), &
+            intent(INOUT) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
+                             dqL_prim_dy_vf, dqR_prim_dy_vf, &
+                             dqL_prim_dz_vf, dqR_prim_dz_vf
+
+        type(scalar_field), &
+            dimension(sys_size), &
+            intent(INOUT) :: flux_vf, flux_src_vf, flux_gsrc_vf
+
+        integer, intent(IN) :: norm_dir
+
+        type(int_bounds_info), intent(IN) :: ix, iy, iz
+
+        #:for NAME, NUM in [('hll', 1), ('hllc', 2), ('hlld', 4), ('lf', 5)]
+            if (riemann_solver == ${NUM}$) then
+                call s_${NAME}$_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
+                                               dqL_prim_dy_vf, &
+                                               dqL_prim_dz_vf, &
+                                               qL_prim_vf, &
+                                               qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
+                                               dqR_prim_dy_vf, &
+                                               dqR_prim_dz_vf, &
+                                               qR_prim_vf, &
+                                               q_prim_vf, &
+                                               flux_vf, flux_src_vf, &
+                                               flux_gsrc_vf, &
+                                               norm_dir, ix, iy, iz)
+            end if
+        #:endfor
+
+    end subroutine s_riemann_solver
+
+    !> Dispatch to the subroutines that are utilized to compute
+        !! the viscous source fluxes for either Cartesian or cylindrical geometries.
+        !! For more information please refer to:
+        !!      1) s_compute_cartesian_viscous_source_flux
+        !!      2) s_compute_cylindrical_viscous_source_flux
+    subroutine s_compute_viscous_source_flux(velL_vf, &
+                                             dvelL_dx_vf, &
+                                             dvelL_dy_vf, &
+                                             dvelL_dz_vf, &
+                                             velR_vf, &
+                                             dvelR_dx_vf, &
+                                             dvelR_dy_vf, &
+                                             dvelR_dz_vf, &
+                                             flux_src_vf, &
+                                             norm_dir, &
+                                             ix, iy, iz)
+
+        type(scalar_field), &
+            dimension(num_vels), &
+            intent(IN) :: velL_vf, velR_vf, &
+                          dvelL_dx_vf, dvelR_dx_vf, &
+                          dvelL_dy_vf, dvelR_dy_vf, &
+                          dvelL_dz_vf, dvelR_dz_vf
+
+        type(scalar_field), &
+            dimension(sys_size), &
+            intent(INOUT) :: flux_src_vf
+
+        integer, intent(IN) :: norm_dir
+
+        type(int_bounds_info), intent(IN) :: ix, iy, iz
+
+        if (grid_geometry == 3) then
+            call s_compute_cylindrical_viscous_source_flux(velL_vf, &
+                                                           dvelL_dx_vf, &
+                                                           dvelL_dy_vf, &
+                                                           dvelL_dz_vf, &
+                                                           velR_vf, &
+                                                           dvelR_dx_vf, &
+                                                           dvelR_dy_vf, &
+                                                           dvelR_dz_vf, &
+                                                           flux_src_vf, &
+                                                           norm_dir, &
+                                                           ix, iy, iz)
+        else
+            call s_compute_cartesian_viscous_source_flux(dvelL_dx_vf, &
+                                                         dvelL_dy_vf, &
+                                                         dvelL_dz_vf, &
+                                                         dvelR_dx_vf, &
+                                                         dvelR_dy_vf, &
+                                                         dvelR_dz_vf, &
+                                                         flux_src_vf, &
+                                                         norm_dir)
+        end if
+    end subroutine s_compute_viscous_source_flux
+
+    !> @brief Computes intercell fluxes using the Harten-Lax-van Leer (HLL) approximate Riemann solver.
+    subroutine s_hll_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
+                                    dqL_prim_dy_vf, &
+                                    dqL_prim_dz_vf, &
+                                    qL_prim_vf, &
+                                    qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
+                                    dqR_prim_dy_vf, &
+                                    dqR_prim_dz_vf, &
+                                    qR_prim_vf, &
+                                    q_prim_vf, &
+                                    flux_vf, flux_src_vf, &
+                                    flux_gsrc_vf, &
+                                    norm_dir, ix, iy, iz)
+
+        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
+        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
+
+        type(scalar_field), allocatable, dimension(:), intent(inout) :: qL_prim_vf, qR_prim_vf
+
+        type(scalar_field), &
+            allocatable, dimension(:), &
+            intent(inout) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
+                             dqL_prim_dy_vf, dqR_prim_dy_vf, &
+                             dqL_prim_dz_vf, dqR_prim_dz_vf
+
+        ! Intercell fluxes
+        type(scalar_field), &
+            dimension(sys_size), &
+            intent(inout) :: flux_vf, flux_src_vf, flux_gsrc_vf
+        real(wp) :: flux_tau_L, flux_tau_R
+
+        integer, intent(in) :: norm_dir
+        type(int_bounds_info), intent(in) :: ix, iy, iz
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
+            real(wp), dimension(3) :: vel_L, vel_R
+            real(wp), dimension(3) :: alpha_L, alpha_R
+            real(wp), dimension(10) :: Ys_L, Ys_R
+            real(wp), dimension(10) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
+            real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
+            real(wp), dimension(num_vels) :: vel_L, vel_R
+            real(wp), dimension(num_fluids) :: alpha_L, alpha_R
+            real(wp), dimension(num_species) :: Ys_L, Ys_R
+            real(wp), dimension(num_species) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
+            real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+        #:endif
+        real(wp) :: rho_L, rho_R
+        real(wp) :: pres_L, pres_R
+        real(wp) :: E_L, E_R
+        real(wp) :: H_L, H_R
+        real(wp) :: Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi
+        real(wp) :: T_L, T_R
+        real(wp) :: Y_L, Y_R
+        real(wp) :: MW_L, MW_R
+        real(wp) :: R_gas_L, R_gas_R
+        real(wp) :: Cp_L, Cp_R
+        real(wp) :: Cv_L, Cv_R
+        real(wp) :: Gamm_L, Gamm_R
+        real(wp) :: gamma_L, gamma_R
+        real(wp) :: pi_inf_L, pi_inf_R
+        real(wp) :: qv_L, qv_R
+        real(wp) :: c_L, c_R
+        real(wp), dimension(6) :: tau_e_L, tau_e_R
+        real(wp) :: G_L, G_R
+        real(wp), dimension(2) :: Re_L, Re_R
+        real(wp), dimension(3) :: xi_field_L, xi_field_R
+        ! Non-Newtonian per-phase Re arrays
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3, 2) :: Re_visc_per_phase_L, Re_visc_per_phase_R
+        #:else
+            real(wp), dimension(num_fluids, 2) :: Re_visc_per_phase_L, Re_visc_per_phase_R
+        #:endif
+
+        real(wp) :: rho_avg
+        real(wp) :: H_avg
+        real(wp) :: qv_avg
+        real(wp) :: gamma_avg
+        real(wp) :: c_avg
+
+        real(wp) :: s_L, s_R, s_M, s_P, s_S
+        real(wp) :: xi_M, xi_P
+
+        real(wp) :: ptilde_L, ptilde_R
+        real(wp) :: vel_L_rms, vel_R_rms, vel_avg_rms
+        real(wp) :: vel_L_tmp, vel_R_tmp
+        real(wp) :: Ms_L, Ms_R, pres_SL, pres_SR
+        real(wp) :: alpha_L_sum, alpha_R_sum
+        real(wp) :: zcoef, pcorr !< low Mach number correction
+
+        type(riemann_states) :: c_fast, pres_mag
+        type(riemann_states_vec3) :: B
+
+        type(riemann_states) :: Ga ! Gamma (Lorentz factor)
+        type(riemann_states) :: vdotB, B2
+        type(riemann_states_vec3) :: b4 ! 4-magnetic field components (spatial: b4x, b4y, b4z)
+        type(riemann_states_vec3) :: cm ! Conservative momentum variables
+
+        integer :: i, j, k, l, q !< Generic loop iterators
+
+        ! Populating the buffers of the left and right Riemann problem
+        ! states variables, based on the choice of boundary conditions
+        call s_populate_riemann_states_variables_buffers( &
+            qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
+            dqL_prim_dy_vf, &
+            dqL_prim_dz_vf, &
+            qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
+            dqR_prim_dy_vf, &
+            dqR_prim_dz_vf, &
+            norm_dir, ix, iy, iz)
+
+        ! Reshaping inputted data based on dimensional splitting direction
+        call s_initialize_riemann_solver( &
+            flux_src_vf, &
+            norm_dir)
+        #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
+
+            if (norm_dir == ${NORM_DIR}$) then
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,q,alpha_rho_L,alpha_rho_R,vel_L,vel_R,alpha_L,alpha_R,tau_e_L,tau_e_R,Re_L,Re_R,s_L,s_R,s_S,Ys_L,Ys_R,xi_field_L, xi_field_R, Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, c_fast, pres_mag, B, Ga, vdotB, B2, b4, cm, pcorr, zcoef, vel_L_tmp, vel_R_tmp, rho_L, rho_R, pres_L, pres_R, E_L, E_R, H_L, H_R, Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi, T_L, T_R, Y_L, Y_R, MW_L, MW_R, R_gas_L, R_gas_R, Cp_L, Cp_R, Cv_L, Cv_R, Gamm_L, Gamm_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, qv_L, qv_R, qv_avg, c_L, c_R, G_L, G_R, rho_avg, H_avg, c_avg, gamma_avg, ptilde_L, ptilde_R, vel_L_rms, vel_R_rms, vel_avg_rms, Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, alpha_R_sum, flux_tau_L, flux_tau_R, Re_visc_per_phase_L, Re_visc_per_phase_R]', copyin='[norm_dir]')
+                do l = is3%beg, is3%end
+                    do k = is2%beg, is2%end
+                        do j = is1%beg, is1%end
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, contxe
+                                alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+                            end do
+
+                            vel_L_rms = 0._wp; vel_R_rms = 0._wp
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_vels
+                                vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
+                                vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
+                                vel_L_rms = vel_L_rms + vel_L(i)**2._wp
+                                vel_R_rms = vel_R_rms + vel_R(i)**2._wp
+                            end do
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
+                                alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
+                            end do
+
+                            pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
+                            pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
+
+                            if (mhd) then
+                                if (n == 0) then ! 1D: constant Bx; By, Bz as variables
+                                    B%L(1) = Bx0
+                                    B%R(1) = Bx0
+                                    B%L(2) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg)
+                                    B%R(2) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg)
+                                    B%L(3) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)
+                                    B%R(3) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)
+                                else ! 2D/3D: Bx, By, Bz as variables
+                                    B%L(1) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg)
+                                    B%R(1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg)
+                                    B%L(2) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)
+                                    B%R(2) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)
+                                    B%L(3) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 2)
+                                    B%R(3) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 2)
+                                end if
+                            end if
+
+                            rho_L = 0._wp
+                            gamma_L = 0._wp
+                            pi_inf_L = 0._wp
+                            qv_L = 0._wp
+
+                            rho_R = 0._wp
+                            gamma_R = 0._wp
+                            pi_inf_R = 0._wp
+                            qv_R = 0._wp
+
+                            alpha_L_sum = 0._wp
+                            alpha_R_sum = 0._wp
+
+                            pres_mag%L = 0._wp
+                            pres_mag%R = 0._wp
+
+                            if (mpp_lim) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    alpha_rho_L(i) = max(0._wp, alpha_rho_L(i))
+                                    alpha_L(i) = min(max(0._wp, alpha_L(i)), 1._wp)
+                                    alpha_L_sum = alpha_L_sum + alpha_L(i)
+                                    alpha_rho_R(i) = max(0._wp, alpha_rho_R(i))
+                                    alpha_R(i) = min(max(0._wp, alpha_R(i)), 1._wp)
+                                    alpha_R_sum = alpha_R_sum + alpha_R(i)
+                                end do
+
+                                alpha_L = alpha_L/max(alpha_L_sum, sgm_eps)
+                                alpha_R = alpha_R/max(alpha_R_sum, sgm_eps)
+                            end if
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                rho_L = rho_L + alpha_rho_L(i)
+                                gamma_L = gamma_L + alpha_L(i)*gammas(i)
+                                pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i)
+                                qv_L = qv_L + alpha_rho_L(i)*qvs(i)
+
+                                rho_R = rho_R + alpha_rho_R(i)
+                                gamma_R = gamma_R + alpha_R(i)*gammas(i)
+                                pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i)
+                                qv_R = qv_R + alpha_rho_R(i)*qvs(i)
+                            end do
+
+                            if (viscous) then
+                                ! Map rotated (j,k,l) to physical (x,y,z) indices
+                                #:if NORM_DIR == 1
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_L, j, k, l, Re_visc_per_phase_L)
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_R, j + 1, k, l, Re_visc_per_phase_R)
+                                #:elif NORM_DIR == 2
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_L, k, j, l, Re_visc_per_phase_L)
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_R, k, j + 1, l, Re_visc_per_phase_R)
+                                #:else
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_L, l, k, j, Re_visc_per_phase_L)
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_R, l, k, j + 1, Re_visc_per_phase_R)
+                                #:endif
+                                call s_compute_mixture_re( &
+                                    alpha_L, Re_visc_per_phase_L, Re_L)
+                                call s_compute_mixture_re( &
+                                    alpha_R, Re_visc_per_phase_R, Re_R)
+                            end if
+
+                            if (chemistry) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = chemxb, chemxe
+                                    Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                    Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+                                end do
+
+                                call get_mixture_molecular_weight(Ys_L, MW_L)
+                                call get_mixture_molecular_weight(Ys_R, MW_R)
+                                #:if USING_AMD
+                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
+                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
+                                #:else
+                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
+                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:)
+                                #:endif
+
+                                R_gas_L = gas_constant/MW_L
+                                R_gas_R = gas_constant/MW_R
+                                T_L = pres_L/rho_L/R_gas_L
+                                T_R = pres_R/rho_R/R_gas_R
+
+                                call get_species_specific_heats_r(T_L, Cp_iL)
+                                call get_species_specific_heats_r(T_R, Cp_iR)
+
+                                if (chem_params%gamma_method == 1) then
+                                    ! gamma_method = 1: Ref. Section 2.3.1 Formulation of doi:10.7907/ZKW8-ES97.
+                                    Gamma_iL = Cp_iL/(Cp_iL - 1.0_wp)
+                                    Gamma_iR = Cp_iR/(Cp_iR - 1.0_wp)
+
+                                    gamma_L = sum(Xs_L(:)/(Gamma_iL(:) - 1.0_wp))
+                                    gamma_R = sum(Xs_R(:)/(Gamma_iR(:) - 1.0_wp))
+                                else if (chem_params%gamma_method == 2) then
+                                    ! gamma_method = 2: c_p / c_v where c_p, c_v are specific heats.
+                                    call get_mixture_specific_heat_cp_mass(T_L, Ys_L, Cp_L)
+                                    call get_mixture_specific_heat_cp_mass(T_R, Ys_R, Cp_R)
+                                    call get_mixture_specific_heat_cv_mass(T_L, Ys_L, Cv_L)
+                                    call get_mixture_specific_heat_cv_mass(T_R, Ys_R, Cv_R)
+
+                                    Gamm_L = Cp_L/Cv_L
+                                    gamma_L = 1.0_wp/(Gamm_L - 1.0_wp)
+                                    Gamm_R = Cp_R/Cv_R
+                                    gamma_R = 1.0_wp/(Gamm_R - 1.0_wp)
+                                end if
+
+                                call get_mixture_energy_mass(T_L, Ys_L, E_L)
+                                call get_mixture_energy_mass(T_R, Ys_R, E_R)
+
+                                E_L = rho_L*E_L + 5.e-1*rho_L*vel_L_rms
+                                E_R = rho_R*E_R + 5.e-1*rho_R*vel_R_rms
+                                H_L = (E_L + pres_L)/rho_L
+                                H_R = (E_R + pres_R)/rho_R
+                            elseif (mhd .and. relativity) then
+                                Ga%L = 1._wp/sqrt(1._wp - vel_L_rms)
+                                Ga%R = 1._wp/sqrt(1._wp - vel_R_rms)
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                    vdotB%L = vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3)
+                                    vdotB%R = vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3)
+
+                                    b4%L(1:3) = B%L(1:3)/Ga%L + Ga%L*vel_L(1:3)*vdotB%L
+                                    b4%R(1:3) = B%R(1:3)/Ga%R + Ga%R*vel_R(1:3)*vdotB%R
+                                    B2%L = B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp
+                                    B2%R = B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp
+                                #:endif
+
+                                pres_mag%L = 0.5_wp*(B2%L/Ga%L**2._wp + vdotB%L**2._wp)
+                                pres_mag%R = 0.5_wp*(B2%R/Ga%R**2._wp + vdotB%R**2._wp)
+
+                                ! Hard-coded EOS
+                                H_L = 1._wp + (gamma_L + 1)*pres_L/rho_L
+                                H_R = 1._wp + (gamma_R + 1)*pres_R/rho_R
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                    cm%L(1:3) = (rho_L*H_L*Ga%L**2 + B2%L)*vel_L(1:3) - vdotB%L*B%L(1:3)
+                                    cm%R(1:3) = (rho_R*H_R*Ga%R**2 + B2%R)*vel_R(1:3) - vdotB%R*B%R(1:3)
+                                #:endif
+
+                                E_L = rho_L*H_L*Ga%L**2 - pres_L + 0.5_wp*(B2%L + vel_L_rms*B2%L - vdotB%L**2._wp) - rho_L*Ga%L
+                                E_R = rho_R*H_R*Ga%R**2 - pres_R + 0.5_wp*(B2%R + vel_R_rms*B2%R - vdotB%R**2._wp) - rho_R*Ga%R
+                            elseif (mhd .and. .not. relativity) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                    pres_mag%L = 0.5_wp*(B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp)
+                                    pres_mag%R = 0.5_wp*(B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp)
+                                #:endif
+                                E_L = gamma_L*pres_L + pi_inf_L + 0.5_wp*rho_L*vel_L_rms + qv_L + pres_mag%L
+                                E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R + pres_mag%R ! includes magnetic energy
+                                H_L = (E_L + pres_L - pres_mag%L)/rho_L
+                                H_R = (E_R + pres_R - pres_mag%R)/rho_R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
+                            else
+                                E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L
+                                E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R
+                                H_L = (E_L + pres_L)/rho_L
+                                H_R = (E_R + pres_R)/rho_R
+                            end if
+
+                            ! elastic energy update
+                            if (hypoelasticity) then
+                                G_L = 0._wp; G_R = 0._wp
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    G_L = G_L + alpha_L(i)*Gs_rs(i)
+                                    G_R = G_R + alpha_R(i)*Gs_rs(i)
+                                end do
+
+                                if (cont_damage) then
+                                    G_L = G_L*max((1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, damage_idx)), 0._wp)
+                                    G_R = G_R*max((1._wp - qR_prim_rs${XYZ}$_vf(j, k, l, damage_idx)), 0._wp)
+                                end if
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, strxe - strxb + 1
+                                    tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
+                                    tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
+                                    ! Elastic contribution to energy if G large enough
+                                    !TODO take out if statement if stable without
+                                    if ((G_L > 1000) .and. (G_R > 1000)) then
+                                        E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
+                                        E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
+                                        ! Double for shear stresses
+                                        if (any(strxb - 1 + i == shear_indices)) then
+                                            E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
+                                            E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
+                                        end if
+                                    end if
+                                end do
+                            end if
+
+                            ! elastic energy update
+                            !if ( hyperelasticity ) then
+                            !    G_L = 0._wp
+                            !    G_R = 0._wp
+                            !
+                            !    $:GPU_LOOP(parallelism='[seq]')
+                            !    do i = 1, num_fluids
+                            !        G_L = G_L + alpha_L(i)*Gs_rs(i)
+                            !        G_R = G_R + alpha_R(i)*Gs_rs(i)
+                            !    end do
+                            !    ! Elastic contribution to energy if G large enough
+                            !    if ((G_L > 1.e-3_wp) .and. (G_R > 1.e-3_wp)) then
+                            !    E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
+                            !    E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
+                            !    $:GPU_LOOP(parallelism='[seq]')
+                            !    do i = 1, b_size-1
+                            !        tau_e_L(i) = G_L*qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
+                            !        tau_e_R(i) = G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
+                            !    end do
+                            !    $:GPU_LOOP(parallelism='[seq]')
+                            !    do i = 1, b_size-1
+                            !        tau_e_L(i) = 0._wp
+                            !        tau_e_R(i) = 0._wp
+                            !    end do
+                            !    $:GPU_LOOP(parallelism='[seq]')
+                            !    do i = 1, num_dims
+                            !        xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
+                            !        xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
+                            !    end do
+                            !    end if
+                            !end if
+
+                            @:compute_average_state()
+
+                            call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, &
+                                                          vel_L_rms, 0._wp, c_L, qv_L)
+
+                            call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, &
+                                                          vel_R_rms, 0._wp, c_R, qv_R)
+
+                            !> The computation of c_avg does not require all the variables, and therefore the non '_avg'
+                            ! variables are placeholders to call the subroutine.
+
+                            call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, &
+                                                          vel_avg_rms, c_sum_Yi_Phi, c_avg, qv_avg)
+
+                            if (mhd) then
+                                call s_compute_fast_magnetosonic_speed(rho_L, c_L, B%L, norm_dir, c_fast%L, H_L)
+                                call s_compute_fast_magnetosonic_speed(rho_R, c_R, B%R, norm_dir, c_fast%R, H_R)
+                            end if
+
+                            if (hyper_cleaning) then ! mhd
+                                c_fast%L = min(c_fast%L, -hyper_cleaning_speed)
+                                c_fast%R = max(c_fast%R, hyper_cleaning_speed)
+                            end if
+
+                            if (viscous) then
+                                if (chemistry) then
+                                    call compute_viscosity_and_inversion(T_L, Ys_L, T_R, Ys_R, Re_L(1), Re_R(1))
+                                end if
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, 2
+                                    Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
+                                end do
+                            end if
+
+                            if (wave_speeds == 1) then
+                                if (mhd) then
+                                    s_L = min(vel_L(dir_idx(1)) - c_fast%L, vel_R(dir_idx(1)) - c_fast%R)
+                                    s_R = max(vel_R(dir_idx(1)) + c_fast%R, vel_L(dir_idx(1)) + c_fast%L)
+                                elseif (hypoelasticity) then
+                                    s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + &
+                                                                       (((4._wp*G_L)/3._wp) + &
+                                                                        tau_e_L(dir_idx_tau(1)))/rho_L) &
+                                              , vel_R(dir_idx(1)) - sqrt(c_R*c_R + &
+                                                                         (((4._wp*G_R)/3._wp) + &
+                                                                          tau_e_R(dir_idx_tau(1)))/rho_R))
+                                    s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + &
+                                                                       (((4._wp*G_R)/3._wp) + &
+                                                                        tau_e_R(dir_idx_tau(1)))/rho_R) &
+                                              , vel_L(dir_idx(1)) + sqrt(c_L*c_L + &
+                                                                         (((4._wp*G_L)/3._wp) + &
+                                                                          tau_e_L(dir_idx_tau(1)))/rho_L))
+                                else if (hyperelasticity) then
+                                    s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + (4._wp*G_L/3._wp)/rho_L) &
+                                              , vel_R(dir_idx(1)) - sqrt(c_R*c_R + (4._wp*G_R/3._wp)/rho_R))
+                                    s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + (4._wp*G_R/3._wp)/rho_R) &
+                                              , vel_L(dir_idx(1)) + sqrt(c_L*c_L + (4._wp*G_L/3._wp)/rho_L))
+                                else
+                                    s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R)
+                                    s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L)
+                                end if
+
+                                s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* &
+                                       (s_L - vel_L(dir_idx(1))) - &
+                                       rho_R*vel_R(dir_idx(1))* &
+                                       (s_R - vel_R(dir_idx(1)))) &
+                                      /(rho_L*(s_L - vel_L(dir_idx(1))) - &
+                                        rho_R*(s_R - vel_R(dir_idx(1))))
+                            elseif (wave_speeds == 2) then
+                                pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* &
+                                                    (vel_L(dir_idx(1)) - &
+                                                     vel_R(dir_idx(1))))
+
+                                pres_SR = pres_SL
+
+                                Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* &
+                                                       (pres_SL/pres_L - 1._wp)*pres_L/ &
+                                                       ((pres_L + pi_inf_L/(1._wp + gamma_L)))))
+                                Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* &
+                                                       (pres_SR/pres_R - 1._wp)*pres_R/ &
+                                                       ((pres_R + pi_inf_R/(1._wp + gamma_R)))))
+
+                                s_L = vel_L(dir_idx(1)) - c_L*Ms_L
+                                s_R = vel_R(dir_idx(1)) + c_R*Ms_R
+
+                                s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + &
+                                                (pres_L - pres_R)/ &
+                                                (rho_avg*c_avg))
+                            end if
+
+                            s_M = min(0._wp, s_L); s_P = max(0._wp, s_R)
+
+                            xi_M = (5.e-1_wp + sign(5.e-1_wp, s_L)) &
+                                   + (5.e-1_wp - sign(5.e-1_wp, s_L)) &
+                                   *(5.e-1_wp + sign(5.e-1_wp, s_R))
+                            xi_P = (5.e-1_wp - sign(5.e-1_wp, s_R)) &
+                                   + (5.e-1_wp - sign(5.e-1_wp, s_L)) &
+                                   *(5.e-1_wp + sign(5.e-1_wp, s_R))
+
+                            ! Low Mach correction
+                            if (low_Mach == 1) then
+                                @:compute_low_Mach_correction()
+                            else
+                                pcorr = 0._wp
+                            end if
+
+                            ! Mass
+                            if (.not. relativity) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, contxe
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        (s_M*alpha_rho_R(i)*vel_R(norm_dir) &
+                                         - s_P*alpha_rho_L(i)*vel_L(norm_dir) &
+                                         + s_M*s_P*(alpha_rho_L(i) &
+                                                    - alpha_rho_R(i))) &
+                                        /(s_M - s_P)
+                                end do
+                            elseif (relativity) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, contxe
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        (s_M*Ga%R*alpha_rho_R(i)*vel_R(norm_dir) &
+                                         - s_P*Ga%L*alpha_rho_L(i)*vel_L(norm_dir) &
+                                         + s_M*s_P*(Ga%L*alpha_rho_L(i) &
+                                                    - Ga%R*alpha_rho_R(i))) &
+                                        /(s_M - s_P)
+                                end do
+                            end if
+
+                            ! Momentum
+                            if (mhd .and. (.not. relativity)) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, 3
+                                    ! Flux of rho*v_i in the ${XYZ}$ direction
+                                    ! = rho * v_i * v_${XYZ}$ - B_i * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + i) = &
+                                        (s_M*(rho_R*vel_R(i)*vel_R(norm_dir) &
+                                              - B%R(i)*B%R(norm_dir) &
+                                              + dir_flg(i)*(pres_R + pres_mag%R)) &
+                                         - s_P*(rho_L*vel_L(i)*vel_L(norm_dir) &
+                                                - B%L(i)*B%L(norm_dir) &
+                                                + dir_flg(i)*(pres_L + pres_mag%L)) &
+                                         + s_M*s_P*(rho_L*vel_L(i) - rho_R*vel_R(i))) &
+                                        /(s_M - s_P)
+                                end do
+                            elseif (mhd .and. relativity) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, 3
+                                    ! Flux of m_i in the ${XYZ}$ direction
+                                    ! = m_i * v_${XYZ}$ - b_i/Gamma * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + i) = &
+                                        (s_M*(cm%R(i)*vel_R(norm_dir) &
+                                              - b4%R(i)/Ga%R*B%R(norm_dir) &
+                                              + dir_flg(i)*(pres_R + pres_mag%R)) &
+                                         - s_P*(cm%L(i)*vel_L(norm_dir) &
+                                                - b4%L(i)/Ga%L*B%L(norm_dir) &
+                                                + dir_flg(i)*(pres_L + pres_mag%L)) &
+                                         + s_M*s_P*(cm%L(i) - cm%R(i))) &
+                                        /(s_M - s_P)
+                                end do
+                            elseif (bubbles_euler) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_vels
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
+                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
+                                              *vel_R(dir_idx(i)) &
+                                              + dir_flg(dir_idx(i))*(pres_R - ptilde_R)) &
+                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
+                                                *vel_L(dir_idx(i)) &
+                                                + dir_flg(dir_idx(i))*(pres_L - ptilde_L)) &
+                                         + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
+                                                    - rho_R*vel_R(dir_idx(i)))) &
+                                        /(s_M - s_P) &
+                                        + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
+                                end do
+                            else if (hypoelasticity) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_vels
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
+                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
+                                              *vel_R(dir_idx(i)) &
+                                              + dir_flg(dir_idx(i))*pres_R &
+                                              - tau_e_R(dir_idx_tau(i))) &
+                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
+                                                *vel_L(dir_idx(i)) &
+                                                + dir_flg(dir_idx(i))*pres_L &
+                                                - tau_e_L(dir_idx_tau(i))) &
+                                         + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
+                                                    - rho_R*vel_R(dir_idx(i)))) &
+                                        /(s_M - s_P)
+                                end do
+                            else
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_vels
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
+                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
+                                              *vel_R(dir_idx(i)) &
+                                              + dir_flg(dir_idx(i))*pres_R) &
+                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
+                                                *vel_L(dir_idx(i)) &
+                                                + dir_flg(dir_idx(i))*pres_L) &
+                                         + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
+                                                    - rho_R*vel_R(dir_idx(i)))) &
+                                        /(s_M - s_P) &
+                                        + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
+                                end do
+                            end if
+
+                            ! Energy
+                            if (mhd .and. (.not. relativity)) then
+                                ! energy flux = (E + p + p_mag) * v_${XYZ}$ - B_${XYZ}$ * (v_x*B_x + v_y*B_y + v_z*B_z)
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                    flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                        (s_M*(vel_R(norm_dir)*(E_R + pres_R + pres_mag%R) - B%R(norm_dir)*(vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3))) &
+                                         - s_P*(vel_L(norm_dir)*(E_L + pres_L + pres_mag%L) - B%L(norm_dir)*(vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3))) &
+                                         + s_M*s_P*(E_L - E_R)) &
+                                        /(s_M - s_P)
+                                #:endif
+                            elseif (mhd .and. relativity) then
+                                ! energy flux = m_${XYZ}$ - mass flux
+                                ! Hard-coded for single-component for now
+                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                    (s_M*(cm%R(norm_dir) - Ga%R*alpha_rho_R(1)*vel_R(norm_dir)) &
+                                     - s_P*(cm%L(norm_dir) - Ga%L*alpha_rho_L(1)*vel_L(norm_dir)) &
+                                     + s_M*s_P*(E_L - E_R)) &
+                                    /(s_M - s_P)
+                            else if (bubbles_euler) then
+                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                    (s_M*vel_R(dir_idx(1))*(E_R + pres_R - ptilde_R) &
+                                     - s_P*vel_L(dir_idx(1))*(E_L + pres_L - ptilde_L) &
+                                     + s_M*s_P*(E_L - E_R)) &
+                                    /(s_M - s_P) &
+                                    + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp
+                            else if (hypoelasticity) then
+                                flux_tau_L = 0._wp; flux_tau_R = 0._wp
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    flux_tau_L = flux_tau_L + tau_e_L(dir_idx_tau(i))*vel_L(dir_idx(i))
+                                    flux_tau_R = flux_tau_R + tau_e_R(dir_idx_tau(i))*vel_R(dir_idx(i))
+                                end do
+                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                    (s_M*(vel_R(dir_idx(1))*(E_R + pres_R) - flux_tau_R) &
+                                     - s_P*(vel_L(dir_idx(1))*(E_L + pres_L) - flux_tau_L) &
+                                     + s_M*s_P*(E_L - E_R))/(s_M - s_P)
+                            else
+                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                    (s_M*vel_R(dir_idx(1))*(E_R + pres_R) &
+                                     - s_P*vel_L(dir_idx(1))*(E_L + pres_L) &
+                                     + s_M*s_P*(E_L - E_R)) &
+                                    /(s_M - s_P) &
+                                    + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp
+                            end if
+
+                            ! Elastic Stresses
+                            if (hypoelasticity) then
+                                do i = 1, strxe - strxb + 1 !TODO: this indexing may be slow
+                                    flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
+                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
+                                              *tau_e_R(i)) &
+                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
+                                                *tau_e_L(i)) &
+                                         + s_M*s_P*(rho_L*tau_e_L(i) &
+                                                    - rho_R*tau_e_R(i))) &
+                                        /(s_M - s_P)
+                                end do
+                            end if
+
+                            ! Advection
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = advxb, advxe
+                                flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                    (qL_prim_rs${XYZ}$_vf(j, k, l, i) &
+                                     - qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)) &
+                                    *s_M*s_P/(s_M - s_P)
+                                flux_src_rs${XYZ}$_vf(j, k, l, i) = &
+                                    (s_M*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
+                                     - s_P*qL_prim_rs${XYZ}$_vf(j, k, l, i)) &
+                                    /(s_M - s_P)
+                            end do
+
+                            if (bubbles_euler) then
+                                ! From HLLC: Kills mass transport @ bubble gas density
+                                if (num_fluids > 1) then
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe) = 0._wp
+                                end if
+                            end if
+
+                            if (chemistry) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = chemxb, chemxe
+                                    Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                    Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = (s_M*Y_R*rho_R*vel_R(dir_idx(1)) &
+                                                                     - s_P*Y_L*rho_L*vel_L(dir_idx(1)) &
+                                                                     + s_M*s_P*(Y_L*rho_L - Y_R*rho_R)) &
+                                                                    /(s_M - s_P)
+                                    flux_src_rs${XYZ}$_vf(j, k, l, i) = 0._wp
+                                end do
+                            end if
+
+                            if (mhd) then
+                                if (n == 0) then ! 1D: d/dx flux only & Bx = Bx0 = const.
+                                    ! B_y flux = v_x * B_y - v_y * Bx0
+                                    ! B_z flux = v_x * B_z - v_z * Bx0
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 0, 1
+                                        flux_rsx_vf(j, k, l, B_idx%beg + i) = (s_M*(vel_R(1)*B%R(2 + i) - vel_R(2 + i)*Bx0) &
+                                                                               - s_P*(vel_L(1)*B%L(2 + i) - vel_L(2 + i)*Bx0) &
+                                                                               + s_M*s_P*(B%L(2 + i) - B%R(2 + i)))/(s_M - s_P)
+                                    end do
+                                else ! 2D/3D: Bx, By, Bz /= const. but zero flux component in the same direction
+                                    ! B_x d/d${XYZ}$ flux = (1 - delta(x,${XYZ}$)) * (v_${XYZ}$ * B_x - v_x * B_${XYZ}$)
+                                    ! B_y d/d${XYZ}$ flux = (1 - delta(y,${XYZ}$)) * (v_${XYZ}$ * B_y - v_y * B_${XYZ}$)
+                                    ! B_z d/d${XYZ}$ flux = (1 - delta(z,${XYZ}$)) * (v_${XYZ}$ * B_z - v_z * B_${XYZ}$)
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 0, 2
+                                        flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + i) = (s_M*(vel_R(dir_idx(1))*B%R(i + 1) - vel_R(i + 1)*B%R(norm_dir)) - &
+                                                                                     s_P*(vel_L(dir_idx(1))*B%L(i + 1) - vel_L(i + 1)*B%L(norm_dir)) + &
+                                                                                     s_M*s_P*(B%L(i + 1) - B%R(i + 1)))/(s_M - s_P)
+                                    end do
+
+                                    if (hyper_cleaning) then
+                                        ! propagate magnetic field divergence as a wave
+                                        flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + norm_dir - 1) = flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + norm_dir - 1) + &
+                                                                                               (s_M*qR_prim_rs${XYZ}$_vf(j + 1, k, l, psi_idx) - s_P*qL_prim_rs${XYZ}$_vf(j, k, l, psi_idx))/(s_M - s_P)
+
+                                        flux_rs${XYZ}$_vf(j, k, l, psi_idx) = (hyper_cleaning_speed**2*(s_M*B%R(norm_dir) - s_P*B%L(norm_dir)) + s_M*s_P*(qL_prim_rs${XYZ}$_vf(j, k, l, psi_idx) - qR_prim_rs${XYZ}$_vf(j + 1, k, l, psi_idx)))/(s_M - s_P)
+                                    else
+                                        flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + norm_dir - 1) = 0._wp ! Without hyperbolic cleaning, make sure flux of B_normal is identically zero
+                                    end if
+                                end if
+                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp
+                            end if
+
+                            #:if (NORM_DIR == 2)
+                                if (cyl_coord) then
+                                    !Substituting the advective flux into the inviscid geometrical source flux
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, E_idx
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
+                                    end do
+                                    ! Recalculating the radial momentum geometric source flux
+                                    flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) = &
+                                        flux_rs${XYZ}$_vf(j, k, l, contxe + 2) &
+                                        - (s_M*pres_R - s_P*pres_L)/(s_M - s_P)
+                                    ! Geometrical source of the void fraction(s) is zero
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = advxb, advxe
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
+                                    end do
+                                end if
+
+                                if (cyl_coord .and. hypoelasticity) then
+                                    ! += tau_sigmasigma using HLL
+                                    flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) = &
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) + &
+                                        (s_M*tau_e_R(4) - s_P*tau_e_L(4)) &
+                                        /(s_M - s_P)
+
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = strxb, strxe
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
+                                    end do
+                                end if
+                            #:endif
+
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+        #:endfor
+
+        if (viscous .or. dummy) then
+            if (weno_Re_flux) then
+
+                call s_compute_viscous_source_flux( &
+                    qL_prim_vf(momxb:momxe), &
+                    dqL_prim_dx_vf(momxb:momxe), &
+                    dqL_prim_dy_vf(momxb:momxe), &
+                    dqL_prim_dz_vf(momxb:momxe), &
+                    qR_prim_vf(momxb:momxe), &
+                    dqR_prim_dx_vf(momxb:momxe), &
+                    dqR_prim_dy_vf(momxb:momxe), &
+                    dqR_prim_dz_vf(momxb:momxe), &
+                    flux_src_vf, norm_dir, ix, iy, iz)
+            else
+                call s_compute_viscous_source_flux( &
+                    q_prim_vf(momxb:momxe), &
+                    dqL_prim_dx_vf(momxb:momxe), &
+                    dqL_prim_dy_vf(momxb:momxe), &
+                    dqL_prim_dz_vf(momxb:momxe), &
+                    q_prim_vf(momxb:momxe), &
+                    dqR_prim_dx_vf(momxb:momxe), &
+                    dqR_prim_dy_vf(momxb:momxe), &
+                    dqR_prim_dz_vf(momxb:momxe), &
+                    flux_src_vf, norm_dir, ix, iy, iz)
+            end if
+        end if
+
+        call s_finalize_riemann_solver(flux_vf, flux_src_vf, &
+                                       flux_gsrc_vf, &
+                                       norm_dir)
+
+    end subroutine s_hll_riemann_solver
+
+    !> @brief Computes intercell fluxes using the Lax-Friedrichs (LF) approximate Riemann solver.
+    subroutine s_lf_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
+                                   dqL_prim_dy_vf, &
+                                   dqL_prim_dz_vf, &
+                                   qL_prim_vf, &
+                                   qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
+                                   dqR_prim_dy_vf, &
+                                   dqR_prim_dz_vf, &
+                                   qR_prim_vf, &
+                                   q_prim_vf, &
+                                   flux_vf, flux_src_vf, &
+                                   flux_gsrc_vf, &
+                                   norm_dir, ix, iy, iz)
+
+        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
+        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
+
+        type(scalar_field), allocatable, dimension(:), intent(inout) :: qL_prim_vf, qR_prim_vf
+
+        type(scalar_field), &
+            allocatable, dimension(:), &
+            intent(inout) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
+                             dqL_prim_dy_vf, dqR_prim_dy_vf, &
+                             dqL_prim_dz_vf, dqR_prim_dz_vf
+
+        ! Intercell fluxes
+        type(scalar_field), &
+            dimension(sys_size), &
+            intent(inout) :: flux_vf, flux_src_vf, flux_gsrc_vf
+        real(wp) :: flux_tau_L, flux_tau_R
+
+        integer, intent(in) :: norm_dir
+        type(int_bounds_info), intent(in) :: ix, iy, iz
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
+            real(wp), dimension(3) :: vel_L, vel_R
+            real(wp), dimension(3) :: alpha_L, alpha_R
+            real(wp), dimension(10) :: Ys_L, Ys_R
+            real(wp), dimension(10) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
+            real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+            real(wp), dimension(3, 3) :: vel_grad_L, vel_grad_R       !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
+            real(wp), dimension(num_vels) :: vel_L, vel_R
+            real(wp), dimension(num_fluids) :: alpha_L, alpha_R
+            real(wp), dimension(num_species) :: Ys_L, Ys_R
+            real(wp), dimension(num_species) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
+            real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+            real(wp), dimension(num_dims, num_dims) :: vel_grad_L, vel_grad_R       !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
+        #:endif
+        real(wp) :: rho_L, rho_R
+
+        real(wp) :: pres_L, pres_R
+        real(wp) :: E_L, E_R
+        real(wp) :: H_L, H_R
+        real(wp) :: Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi
+        real(wp) :: T_L, T_R
+        real(wp) :: Y_L, Y_R
+        real(wp) :: MW_L, MW_R
+        real(wp) :: R_gas_L, R_gas_R
+        real(wp) :: Cp_L, Cp_R
+        real(wp) :: Cv_L, Cv_R
+        real(wp) :: Gamm_L, Gamm_R
+        real(wp) :: gamma_L, gamma_R
+        real(wp) :: pi_inf_L, pi_inf_R
+        real(wp) :: qv_L, qv_R
+        real(wp) :: c_L, c_R
+        real(wp), dimension(6) :: tau_e_L, tau_e_R
+        real(wp) :: G_L, G_R
+        real(wp), dimension(2) :: Re_L, Re_R
+        real(wp), dimension(3) :: xi_field_L, xi_field_R
+        ! Non-Newtonian per-phase Re arrays
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3, 2) :: Re_visc_per_phase_L, Re_visc_per_phase_R
+        #:else
+            real(wp), dimension(num_fluids, 2) :: Re_visc_per_phase_L, Re_visc_per_phase_R
+        #:endif
+
+        real(wp) :: rho_avg
+        real(wp) :: H_avg
+        real(wp) :: gamma_avg
+        real(wp) :: c_avg
+
+        real(wp) :: s_L, s_R, s_M, s_P, s_S
+        real(wp) :: xi_M, xi_P
+
+        real(wp) :: ptilde_L, ptilde_R
+        real(wp) :: vel_L_rms, vel_R_rms, vel_avg_rms
+        real(wp) :: vel_L_tmp, vel_R_tmp
+        real(wp) :: Ms_L, Ms_R, pres_SL, pres_SR
+        real(wp) :: alpha_L_sum, alpha_R_sum
+        real(wp) :: zcoef, pcorr !< low Mach number correction
+
+        type(riemann_states) :: c_fast, pres_mag
+        type(riemann_states_vec3) :: B
+
+        type(riemann_states) :: Ga ! Gamma (Lorentz factor)
+        type(riemann_states) :: vdotB, B2
+        type(riemann_states_vec3) :: b4 ! 4-magnetic field components (spatial: b4x, b4y, b4z)
+        type(riemann_states_vec3) :: cm ! Conservative momentum variables
+
+        integer :: i, j, k, l, q !< Generic loop iterators
+        integer, dimension(3) :: idx_right_phys                     !< Physical (j,k,l) indices for right state.
+
+        ! Populating the buffers of the left and right Riemann problem
+        ! states variables, based on the choice of boundary conditions
+        call s_populate_riemann_states_variables_buffers( &
+            qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
+            dqL_prim_dy_vf, &
+            dqL_prim_dz_vf, &
+            qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
+            dqR_prim_dy_vf, &
+            dqR_prim_dz_vf, &
+            norm_dir, ix, iy, iz)
+
+        ! Reshaping inputted data based on dimensional splitting direction
+        call s_initialize_riemann_solver( &
+            flux_src_vf, &
+            norm_dir)
+        #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
+
+            if (norm_dir == ${NORM_DIR}$) then
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l, q, alpha_rho_L,alpha_rho_R,vel_L,vel_R,alpha_L,alpha_R,tau_e_L,tau_e_R,G_L,G_R,Re_L,Re_R,rho_avg,h_avg,gamma_avg,s_L,s_R,s_S,Ys_L,Ys_R,xi_field_L,xi_field_R,Cp_iL,Cp_iR,Xs_L,Xs_R,Gamma_iL,Gamma_iR,Yi_avg,Phi_avg,h_iL,h_iR,h_avg_2,c_fast,pres_mag,B,Ga,vdotB,B2,b4,cm,pcorr,zcoef,vel_grad_L,vel_grad_R,idx_right_phys,vel_L_rms,vel_R_rms,vel_avg_rms,vel_L_tmp,vel_R_tmp,Ms_L,Ms_R,pres_SL,pres_SR,alpha_L_sum,alpha_R_sum,c_avg,pres_L,pres_R,rho_L,rho_R,gamma_L,gamma_R,pi_inf_L,pi_inf_R,qv_L,qv_R,c_L,c_R,E_L,E_R,H_L,H_R,ptilde_L,ptilde_R,s_M,s_P,xi_M,xi_P,Cp_avg,Cv_avg,T_avg,eps,c_sum_Yi_Phi,Cp_L,Cp_R,Cv_L,Cv_R,R_gas_L,R_gas_R,MW_L,MW_R,T_L,T_R,Y_L,Y_R, Re_visc_per_phase_L, Re_visc_per_phase_R]')
+                do l = is3%beg, is3%end
+                    do k = is2%beg, is2%end
+                        do j = is1%beg, is1%end
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, contxe
+                                alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+                            end do
+
+                            vel_L_rms = 0._wp; vel_R_rms = 0._wp
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_vels
+                                vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
+                                vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
+                                vel_L_rms = vel_L_rms + vel_L(i)**2._wp
+                                vel_R_rms = vel_R_rms + vel_R(i)**2._wp
+                            end do
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
+                                alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
+                            end do
+
+                            pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
+                            pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
+
+                            if (mhd) then
+                                if (n == 0) then ! 1D: constant Bx; By, Bz as variables
+                                    B%L(1) = Bx0
+                                    B%R(1) = Bx0
+                                    B%L(2) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg)
+                                    B%R(2) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg)
+                                    B%L(3) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)
+                                    B%R(3) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)
+                                else ! 2D/3D: Bx, By, Bz as variables
+                                    B%L(1) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg)
+                                    B%R(1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg)
+                                    B%L(2) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)
+                                    B%R(2) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)
+                                    B%L(3) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 2)
+                                    B%R(3) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 2)
+                                end if
+                            end if
+
+                            rho_L = 0._wp
+                            gamma_L = 0._wp
+                            pi_inf_L = 0._wp
+                            qv_L = 0._wp
+
+                            rho_R = 0._wp
+                            gamma_R = 0._wp
+                            pi_inf_R = 0._wp
+                            qv_R = 0._wp
+
+                            alpha_L_sum = 0._wp
+                            alpha_R_sum = 0._wp
+
+                            pres_mag%L = 0._wp
+                            pres_mag%R = 0._wp
+
+                            if (mpp_lim) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    alpha_rho_L(i) = max(0._wp, alpha_rho_L(i))
+                                    alpha_L(i) = min(max(0._wp, alpha_L(i)), 1._wp)
+                                    alpha_L_sum = alpha_L_sum + alpha_L(i)
+                                    alpha_rho_R(i) = max(0._wp, alpha_rho_R(i))
+                                    alpha_R(i) = min(max(0._wp, alpha_R(i)), 1._wp)
+                                    alpha_R_sum = alpha_R_sum + alpha_R(i)
+                                end do
+
+                                alpha_L = alpha_L/max(alpha_L_sum, sgm_eps)
+                                alpha_R = alpha_R/max(alpha_R_sum, sgm_eps)
+                            end if
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                rho_L = rho_L + alpha_rho_L(i)
+                                gamma_L = gamma_L + alpha_L(i)*gammas(i)
+                                pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i)
+                                qv_L = qv_L + alpha_rho_L(i)*qvs(i)
+
+                                rho_R = rho_R + alpha_rho_R(i)
+                                gamma_R = gamma_R + alpha_R(i)*gammas(i)
+                                pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i)
+                                qv_R = qv_R + alpha_rho_R(i)*qvs(i)
+                            end do
+
+                            if (viscous) then
+                                ! Map rotated (j,k,l) to physical (x,y,z) indices
+                                #:if NORM_DIR == 1
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_L, j, k, l, Re_visc_per_phase_L)
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_R, j + 1, k, l, Re_visc_per_phase_R)
+                                #:elif NORM_DIR == 2
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_L, k, j, l, Re_visc_per_phase_L)
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_R, k, j + 1, l, Re_visc_per_phase_R)
+                                #:else
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_L, l, k, j, Re_visc_per_phase_L)
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_R, l, k, j + 1, Re_visc_per_phase_R)
+                                #:endif
+                                call s_compute_mixture_re( &
+                                    alpha_L, Re_visc_per_phase_L, Re_L)
+                                call s_compute_mixture_re( &
+                                    alpha_R, Re_visc_per_phase_R, Re_R)
+                            end if
+
+                            if (chemistry) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = chemxb, chemxe
+                                    Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                    Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+                                end do
+
+                                call get_mixture_molecular_weight(Ys_L, MW_L)
+                                call get_mixture_molecular_weight(Ys_R, MW_R)
+
+                                #:if USING_AMD
+                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
+                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
+                                #:else
+                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
+                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:)
+                                #:endif
+
+                                R_gas_L = gas_constant/MW_L
+                                R_gas_R = gas_constant/MW_R
+                                T_L = pres_L/rho_L/R_gas_L
+                                T_R = pres_R/rho_R/R_gas_R
+
+                                call get_species_specific_heats_r(T_L, Cp_iL)
+                                call get_species_specific_heats_r(T_R, Cp_iR)
+
+                                if (chem_params%gamma_method == 1) then
+                                    ! gamma_method = 1: Ref. Section 2.3.1 Formulation of doi:10.7907/ZKW8-ES97.
+                                    Gamma_iL = Cp_iL/(Cp_iL - 1.0_wp)
+                                    Gamma_iR = Cp_iR/(Cp_iR - 1.0_wp)
+
+                                    gamma_L = sum(Xs_L(:)/(Gamma_iL(:) - 1.0_wp))
+                                    gamma_R = sum(Xs_R(:)/(Gamma_iR(:) - 1.0_wp))
+                                else if (chem_params%gamma_method == 2) then
+                                    ! gamma_method = 2: c_p / c_v where c_p, c_v are specific heats.
+                                    call get_mixture_specific_heat_cp_mass(T_L, Ys_L, Cp_L)
+                                    call get_mixture_specific_heat_cp_mass(T_R, Ys_R, Cp_R)
+                                    call get_mixture_specific_heat_cv_mass(T_L, Ys_L, Cv_L)
+                                    call get_mixture_specific_heat_cv_mass(T_R, Ys_R, Cv_R)
+
+                                    Gamm_L = Cp_L/Cv_L
+                                    gamma_L = 1.0_wp/(Gamm_L - 1.0_wp)
+                                    Gamm_R = Cp_R/Cv_R
+                                    gamma_R = 1.0_wp/(Gamm_R - 1.0_wp)
+                                end if
+
+                                call get_mixture_energy_mass(T_L, Ys_L, E_L)
+                                call get_mixture_energy_mass(T_R, Ys_R, E_R)
+
+                                E_L = rho_L*E_L + 5.e-1*rho_L*vel_L_rms
+                                E_R = rho_R*E_R + 5.e-1*rho_R*vel_R_rms
+                                H_L = (E_L + pres_L)/rho_L
+                                H_R = (E_R + pres_R)/rho_R
+                            elseif (mhd .and. relativity) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                    Ga%L = 1._wp/sqrt(1._wp - vel_L_rms)
+                                    Ga%R = 1._wp/sqrt(1._wp - vel_R_rms)
+                                    vdotB%L = vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3)
+                                    vdotB%R = vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3)
+
+                                    b4%L(1:3) = B%L(1:3)/Ga%L + Ga%L*vel_L(1:3)*vdotB%L
+                                    b4%R(1:3) = B%R(1:3)/Ga%R + Ga%R*vel_R(1:3)*vdotB%R
+                                    B2%L = B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp
+                                    B2%R = B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp
+
+                                    pres_mag%L = 0.5_wp*(B2%L/Ga%L**2._wp + vdotB%L**2._wp)
+                                    pres_mag%R = 0.5_wp*(B2%R/Ga%R**2._wp + vdotB%R**2._wp)
+
+                                    ! Hard-coded EOS
+                                    H_L = 1._wp + (gamma_L + 1)*pres_L/rho_L
+                                    H_R = 1._wp + (gamma_R + 1)*pres_R/rho_R
+
+                                    cm%L(1:3) = (rho_L*H_L*Ga%L**2 + B2%L)*vel_L(1:3) - vdotB%L*B%L(1:3)
+                                    cm%R(1:3) = (rho_R*H_R*Ga%R**2 + B2%R)*vel_R(1:3) - vdotB%R*B%R(1:3)
+
+                                    E_L = rho_L*H_L*Ga%L**2 - pres_L + 0.5_wp*(B2%L + vel_L_rms*B2%L - vdotB%L**2._wp) - rho_L*Ga%L
+                                    E_R = rho_R*H_R*Ga%R**2 - pres_R + 0.5_wp*(B2%R + vel_R_rms*B2%R - vdotB%R**2._wp) - rho_R*Ga%R
+                                #:endif
+                            elseif (mhd .and. .not. relativity) then
+                                pres_mag%L = 0.5_wp*(B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp)
+                                pres_mag%R = 0.5_wp*(B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp)
+                                E_L = gamma_L*pres_L + pi_inf_L + 0.5_wp*rho_L*vel_L_rms + qv_L + pres_mag%L
+                                E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R + pres_mag%R ! includes magnetic energy
+                                H_L = (E_L + pres_L - pres_mag%L)/rho_L
+                                H_R = (E_R + pres_R - pres_mag%R)/rho_R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
+                            else
+                                E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L
+                                E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R
+                                H_L = (E_L + pres_L)/rho_L
+                                H_R = (E_R + pres_R)/rho_R
+                            end if
+
+                            ! elastic energy update
+                            if (hypoelasticity) then
+                                G_L = 0._wp; G_R = 0._wp
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    G_L = G_L + alpha_L(i)*Gs_rs(i)
+                                    G_R = G_R + alpha_R(i)*Gs_rs(i)
+                                end do
+
+                                if (cont_damage) then
+                                    G_L = G_L*max((1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, damage_idx)), 0._wp)
+                                    G_R = G_R*max((1._wp - qR_prim_rs${XYZ}$_vf(j, k, l, damage_idx)), 0._wp)
+                                end if
+
+                                do i = 1, strxe - strxb + 1
+                                    tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
+                                    tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
+                                    ! Elastic contribution to energy if G large enough
+                                    !TODO take out if statement if stable without
+                                    if ((G_L > 1000) .and. (G_R > 1000)) then
+                                        E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
+                                        E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
+                                        ! Double for shear stresses
+                                        if (any(strxb - 1 + i == shear_indices)) then
+                                            E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
+                                            E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
+                                        end if
+                                    end if
+                                end do
+                            end if
+
+                            call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, &
+                                                          vel_L_rms, 0._wp, c_L, qv_L)
+
+                            call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, &
+                                                          vel_R_rms, 0._wp, c_R, qv_R)
+
+                            if (mhd) then
+                                call s_compute_fast_magnetosonic_speed(rho_L, c_L, B%L, norm_dir, c_fast%L, H_L)
+                                call s_compute_fast_magnetosonic_speed(rho_R, c_R, B%R, norm_dir, c_fast%R, H_R)
+                            end if
+
+                            s_L = 0._wp; s_R = 0._wp
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_dims
+                                s_L = s_L + vel_L(i)**2._wp
+                                s_R = s_R + vel_R(i)**2._wp
+                            end do
+
+                            s_L = sqrt(s_L)
+                            s_R = sqrt(s_R)
+
+                            s_P = max(s_L, s_R) + max(c_L, c_R)
+                            s_M = -s_P
+
+                            s_L = s_M
+                            s_R = s_P
+
+                            ! Low Mach correction
+                            if (low_Mach == 1) then
+                                @:compute_low_Mach_correction()
+                            else
+                                pcorr = 0._wp
+                            end if
+
+                            ! Mass
+                            if (.not. relativity) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, contxe
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        (s_M*alpha_rho_R(i)*vel_R(norm_dir) &
+                                         - s_P*alpha_rho_L(i)*vel_L(norm_dir) &
+                                         + s_M*s_P*(alpha_rho_L(i) &
+                                                    - alpha_rho_R(i))) &
+                                        /(s_M - s_P)
+                                end do
+                            elseif (relativity) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, contxe
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        (s_M*Ga%R*alpha_rho_R(i)*vel_R(norm_dir) &
+                                         - s_P*Ga%L*alpha_rho_L(i)*vel_L(norm_dir) &
+                                         + s_M*s_P*(Ga%L*alpha_rho_L(i) &
+                                                    - Ga%R*alpha_rho_R(i))) &
+                                        /(s_M - s_P)
+                                end do
+                            end if
+
+                            ! Momentum
+                            if (mhd .and. (.not. relativity)) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, 3
+                                    ! Flux of rho*v_i in the ${XYZ}$ direction
+                                    ! = rho * v_i * v_${XYZ}$ - B_i * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + i) = &
+                                        (s_M*(rho_R*vel_R(i)*vel_R(norm_dir) &
+                                              - B%R(i)*B%R(norm_dir) &
+                                              + dir_flg(i)*(pres_R + pres_mag%R)) &
+                                         - s_P*(rho_L*vel_L(i)*vel_L(norm_dir) &
+                                                - B%L(i)*B%L(norm_dir) &
+                                                + dir_flg(i)*(pres_L + pres_mag%L)) &
+                                         + s_M*s_P*(rho_L*vel_L(i) - rho_R*vel_R(i))) &
+                                        /(s_M - s_P)
+                                end do
+                            elseif (mhd .and. relativity) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, 3
+                                    ! Flux of m_i in the ${XYZ}$ direction
+                                    ! = m_i * v_${XYZ}$ - b_i/Gamma * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + i) = &
+                                        (s_M*(cm%R(i)*vel_R(norm_dir) &
+                                              - b4%R(i)/Ga%R*B%R(norm_dir) &
+                                              + dir_flg(i)*(pres_R + pres_mag%R)) &
+                                         - s_P*(cm%L(i)*vel_L(norm_dir) &
+                                                - b4%L(i)/Ga%L*B%L(norm_dir) &
+                                                + dir_flg(i)*(pres_L + pres_mag%L)) &
+                                         + s_M*s_P*(cm%L(i) - cm%R(i))) &
+                                        /(s_M - s_P)
+                                end do
+                            elseif (bubbles_euler) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_vels
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
+                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
+                                              *vel_R(dir_idx(i)) &
+                                              + dir_flg(dir_idx(i))*(pres_R - ptilde_R)) &
+                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
+                                                *vel_L(dir_idx(i)) &
+                                                + dir_flg(dir_idx(i))*(pres_L - ptilde_L)) &
+                                         + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
+                                                    - rho_R*vel_R(dir_idx(i)))) &
+                                        /(s_M - s_P) &
+                                        + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
+                                end do
+                            else if (hypoelasticity) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_vels
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
+                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
+                                              *vel_R(dir_idx(i)) &
+                                              + dir_flg(dir_idx(i))*pres_R &
+                                              - tau_e_R(dir_idx_tau(i))) &
+                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
+                                                *vel_L(dir_idx(i)) &
+                                                + dir_flg(dir_idx(i))*pres_L &
+                                                - tau_e_L(dir_idx_tau(i))) &
+                                         + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
+                                                    - rho_R*vel_R(dir_idx(i)))) &
+                                        /(s_M - s_P)
+                                end do
+                            else
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_vels
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
+                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
+                                              *vel_R(dir_idx(i)) &
+                                              + dir_flg(dir_idx(i))*pres_R) &
+                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
+                                                *vel_L(dir_idx(i)) &
+                                                + dir_flg(dir_idx(i))*pres_L) &
+                                         + s_M*s_P*(rho_L*vel_L(dir_idx(i)) &
+                                                    - rho_R*vel_R(dir_idx(i)))) &
+                                        /(s_M - s_P) &
+                                        + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i)))
+                                end do
+                            end if
+
+                            ! Energy
+                            if (mhd .and. (.not. relativity)) then
+                                ! energy flux = (E + p + p_mag) * v_${XYZ}$ - B_${XYZ}$ * (v_x*B_x + v_y*B_y + v_z*B_z)
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                                    flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                        (s_M*(vel_R(norm_dir)*(E_R + pres_R + pres_mag%R) - B%R(norm_dir)*(vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3))) &
+                                         - s_P*(vel_L(norm_dir)*(E_L + pres_L + pres_mag%L) - B%L(norm_dir)*(vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3))) &
+                                         + s_M*s_P*(E_L - E_R)) &
+                                        /(s_M - s_P)
+                                #:endif
+                            elseif (mhd .and. relativity) then
+                                ! energy flux = m_${XYZ}$ - mass flux
+                                ! Hard-coded for single-component for now
+                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                    (s_M*(cm%R(norm_dir) - Ga%R*alpha_rho_R(1)*vel_R(norm_dir)) &
+                                     - s_P*(cm%L(norm_dir) - Ga%L*alpha_rho_L(1)*vel_L(norm_dir)) &
+                                     + s_M*s_P*(E_L - E_R)) &
+                                    /(s_M - s_P)
+                            else if (bubbles_euler) then
+                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                    (s_M*vel_R(dir_idx(1))*(E_R + pres_R - ptilde_R) &
+                                     - s_P*vel_L(dir_idx(1))*(E_L + pres_L - ptilde_L) &
+                                     + s_M*s_P*(E_L - E_R)) &
+                                    /(s_M - s_P) &
+                                    + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp
+                            else if (hypoelasticity) then
+                                flux_tau_L = 0._wp; flux_tau_R = 0._wp
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    flux_tau_L = flux_tau_L + tau_e_L(dir_idx_tau(i))*vel_L(dir_idx(i))
+                                    flux_tau_R = flux_tau_R + tau_e_R(dir_idx_tau(i))*vel_R(dir_idx(i))
+                                end do
+                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                    (s_M*(vel_R(dir_idx(1))*(E_R + pres_R) - flux_tau_R) &
+                                     - s_P*(vel_L(dir_idx(1))*(E_L + pres_L) - flux_tau_L) &
+                                     + s_M*s_P*(E_L - E_R))/(s_M - s_P)
+                            else
+                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                    (s_M*vel_R(dir_idx(1))*(E_R + pres_R) &
+                                     - s_P*vel_L(dir_idx(1))*(E_L + pres_L) &
+                                     + s_M*s_P*(E_L - E_R)) &
+                                    /(s_M - s_P) &
+                                    + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp
+                            end if
+
+                            ! Elastic Stresses
+                            if (hypoelasticity) then
+                                do i = 1, strxe - strxb + 1 !TODO: this indexing may be slow
+                                    flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
+                                        (s_M*(rho_R*vel_R(dir_idx(1)) &
+                                              *tau_e_R(i)) &
+                                         - s_P*(rho_L*vel_L(dir_idx(1)) &
+                                                *tau_e_L(i)) &
+                                         + s_M*s_P*(rho_L*tau_e_L(i) &
+                                                    - rho_R*tau_e_R(i))) &
+                                        /(s_M - s_P)
+                                end do
+                            end if
+
+                            ! Advection
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = advxb, advxe
+                                flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                    (qL_prim_rs${XYZ}$_vf(j, k, l, i) &
+                                     - qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)) &
+                                    *s_M*s_P/(s_M - s_P)
+                                flux_src_rs${XYZ}$_vf(j, k, l, i) = &
+                                    (s_M*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
+                                     - s_P*qL_prim_rs${XYZ}$_vf(j, k, l, i)) &
+                                    /(s_M - s_P)
+                            end do
+
+                            if (bubbles_euler) then
+                                ! From HLLC: Kills mass transport @ bubble gas density
+                                if (num_fluids > 1) then
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe) = 0._wp
+                                end if
+                            end if
+
+                            if (chemistry) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = chemxb, chemxe
+                                    Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                    Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = (s_M*Y_R*rho_R*vel_R(dir_idx(1)) &
+                                                                     - s_P*Y_L*rho_L*vel_L(dir_idx(1)) &
+                                                                     + s_M*s_P*(Y_L*rho_L - Y_R*rho_R)) &
+                                                                    /(s_M - s_P)
+                                    flux_src_rs${XYZ}$_vf(j, k, l, i) = 0._wp
+                                end do
+                            end if
+
+                            if (mhd) then
+                                if (n == 0) then ! 1D: d/dx flux only & Bx = Bx0 = const.
+                                    ! B_y flux = v_x * B_y - v_y * Bx0
+                                    ! B_z flux = v_x * B_z - v_z * Bx0
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 0, 1
+                                        flux_rsx_vf(j, k, l, B_idx%beg + i) = (s_M*(vel_R(1)*B%R(2 + i) - vel_R(2 + i)*Bx0) &
+                                                                               - s_P*(vel_L(1)*B%L(2 + i) - vel_L(2 + i)*Bx0) &
+                                                                               + s_M*s_P*(B%L(2 + i) - B%R(2 + i)))/(s_M - s_P)
+                                    end do
+                                else ! 2D/3D: Bx, By, Bz /= const. but zero flux component in the same direction
+                                    ! B_x d/d${XYZ}$ flux = (1 - delta(x,${XYZ}$)) * (v_${XYZ}$ * B_x - v_x * B_${XYZ}$)
+                                    ! B_y d/d${XYZ}$ flux = (1 - delta(y,${XYZ}$)) * (v_${XYZ}$ * B_y - v_y * B_${XYZ}$)
+                                    ! B_z d/d${XYZ}$ flux = (1 - delta(z,${XYZ}$)) * (v_${XYZ}$ * B_z - v_z * B_${XYZ}$)
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 0, 2
+                                        flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + i) = (1 - dir_flg(i + 1))*( &
+                                                                                    s_M*(vel_R(dir_idx(1))*B%R(i + 1) - vel_R(i + 1)*B%R(norm_dir)) - &
+                                                                                    s_P*(vel_L(dir_idx(1))*B%L(i + 1) - vel_L(i + 1)*B%L(norm_dir)) + &
+                                                                                    s_M*s_P*(B%L(i + 1) - B%R(i + 1)))/(s_M - s_P)
+                                    end do
+                                end if
+                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp
+                            end if
+
+                            #:if (NORM_DIR == 2)
+                                if (cyl_coord) then
+                                    !Substituting the advective flux into the inviscid geometrical source flux
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, E_idx
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
+                                    end do
+                                    ! Recalculating the radial momentum geometric source flux
+                                    flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) = &
+                                        flux_rs${XYZ}$_vf(j, k, l, contxe + 2) &
+                                        - (s_M*pres_R - s_P*pres_L)/(s_M - s_P)
+                                    ! Geometrical source of the void fraction(s) is zero
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = advxb, advxe
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
+                                    end do
+                                end if
+
+                                if (cyl_coord .and. hypoelasticity) then
+                                    ! += tau_sigmasigma using HLL
+                                    flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) = &
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) + &
+                                        (s_M*tau_e_R(4) - s_P*tau_e_L(4)) &
+                                        /(s_M - s_P)
+
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = strxb, strxe
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
+                                    end do
+                                end if
+                            #:endif
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+        #:endfor
+
+        if (viscous .or. dummy) then
+            $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l, idx_right_phys, vel_grad_L, vel_grad_R, alpha_L, alpha_R, vel_L, vel_R, Re_L, Re_R, Re_visc_per_phase_L, Re_visc_per_phase_R]', copyin='[norm_dir]')
+            do l = isz%beg, isz%end
+                do k = isy%beg, isy%end
+                    do j = isx%beg, isx%end
+                        idx_right_phys(1) = j
+                        idx_right_phys(2) = k
+                        idx_right_phys(3) = l
+                        idx_right_phys(norm_dir) = idx_right_phys(norm_dir) + 1
+
+                        if (norm_dir == 1) then
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                alpha_L(i) = qL_prim_rsx_vf(j, k, l, E_idx + i)
+                                alpha_R(i) = qR_prim_rsx_vf(j + 1, k, l, E_idx + i)
+                            end do
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_dims
+                                vel_L(i) = qL_prim_rsx_vf(j, k, l, momxb + i - 1)
+                                vel_R(i) = qR_prim_rsx_vf(j + 1, k, l, momxb + i - 1)
+                            end do
+                        else if (norm_dir == 2) then
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                alpha_L(i) = qL_prim_rsy_vf(k, j, l, E_idx + i)
+                                alpha_R(i) = qR_prim_rsy_vf(k + 1, j, l, E_idx + i)
+                            end do
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_dims
+                                vel_L(i) = qL_prim_rsy_vf(k, j, l, momxb + i - 1)
+                                vel_R(i) = qR_prim_rsy_vf(k + 1, j, l, momxb + i - 1)
+                            end do
+                        else
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                alpha_L(i) = qL_prim_rsz_vf(l, k, j, E_idx + i)
+                                alpha_R(i) = qR_prim_rsz_vf(l + 1, k, j, E_idx + i)
+                            end do
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_dims
+                                vel_L(i) = qL_prim_rsz_vf(l, k, j, momxb + i - 1)
+                                vel_R(i) = qR_prim_rsz_vf(l + 1, k, j, momxb + i - 1)
+                            end do
+                        end if
+
+                        call s_compute_re_visc(q_prim_vf, &
+                            alpha_L, j, k, l, Re_visc_per_phase_L)
+                        call s_compute_re_visc(q_prim_vf, &
+                            alpha_R, idx_right_phys(1), idx_right_phys(2), idx_right_phys(3), Re_visc_per_phase_R)
+                        call s_compute_mixture_re( &
+                            alpha_L, Re_visc_per_phase_L, Re_L)
+                        call s_compute_mixture_re( &
+                            alpha_R, Re_visc_per_phase_R, Re_R)
+
+                        if (shear_stress) then
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_dims
+                                vel_grad_L(i, 1) = (dqL_prim_dx_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
+                                vel_grad_R(i, 1) = (dqR_prim_dx_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                                    if (num_dims > 1) then
+                                        vel_grad_L(i, 2) = (dqL_prim_dy_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
+                                        vel_grad_R(i, 2) = (dqR_prim_dy_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
+                                    end if
+                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                        if (num_dims > 2) then
+                                            vel_grad_L(i, 3) = (dqL_prim_dz_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
+                                            vel_grad_R(i, 3) = (dqR_prim_dz_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
+                                        end if
+                                    #:endif
+                                #:endif
+                            end do
+
+                            if (norm_dir == 1) then
+                                flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
+                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(1) + vel_grad_R(1, 1)*vel_R(1))
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                                    if (num_dims > 1) then
+                                        flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
+                                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(1) + vel_grad_R(2, 2)*vel_R(1))
+
+                                        flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2) + vel_grad_R(1, 2)) - 0.5_wp*(vel_grad_L(2, 1) + vel_grad_R(2, 1))
+                                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2)*vel_L(2) + vel_grad_R(1, 2)*vel_R(2)) - 0.5_wp*(vel_grad_L(2, 1)*vel_L(2) + vel_grad_R(2, 1)*vel_R(2))
+                                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                            if (num_dims > 2) then
+                                                flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
+                                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(1) + vel_grad_R(3, 3)*vel_R(1))
+
+                                                flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3) + vel_grad_R(1, 3)) - 0.5_wp*(vel_grad_L(3, 1) + vel_grad_R(3, 1))
+                                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3)*vel_L(3) + vel_grad_R(1, 3)*vel_R(3)) - 0.5_wp*(vel_grad_L(3, 1)*vel_L(3) + vel_grad_R(3, 1)*vel_R(3))
+                                            end if
+                                        #:endif
+                                    end if
+                                #:endif
+
+                            else if (norm_dir == 2) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(2) + vel_grad_R(1, 1)*vel_R(2))
+
+                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(2) + vel_grad_R(2, 2)*vel_R(2))
+
+                                    flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2) + vel_grad_R(1, 2)) - 0.5_wp*(vel_grad_L(2, 1) + vel_grad_R(2, 1))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2)*vel_L(1) + vel_grad_R(1, 2)*vel_R(1)) - 0.5_wp*(vel_grad_L(2, 1)*vel_L(1) + vel_grad_R(2, 1)*vel_R(1))
+                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                        if (num_dims > 2) then
+                                            flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
+                                            flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(2) + vel_grad_R(3, 3)*vel_R(2))
+
+                                            flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3) + vel_grad_R(2, 3)) - 0.5_wp*(vel_grad_L(3, 2) + vel_grad_R(3, 2))
+                                            flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3)*vel_L(3) + vel_grad_R(2, 3)*vel_R(3)) - 0.5_wp*(vel_grad_L(3, 2)*vel_L(3) + vel_grad_R(3, 2)*vel_R(3))
+                                        end if
+                                    #:endif
+                                #:endif
+                            else
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(3) + vel_grad_R(1, 1)*vel_R(3))
+
+                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(3) + vel_grad_R(2, 2)*vel_R(3))
+
+                                    flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3) + vel_grad_R(1, 3)) - 0.5_wp*(vel_grad_L(3, 1) + vel_grad_R(3, 1))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3)*vel_L(1) + vel_grad_R(1, 3)*vel_R(1)) - 0.5_wp*(vel_grad_L(3, 1)*vel_L(1) + vel_grad_R(3, 1)*vel_R(1))
+
+                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(3) + vel_grad_R(3, 3)*vel_R(3))
+
+                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3) + vel_grad_R(2, 3)) - 0.5_wp*(vel_grad_L(3, 2) + vel_grad_R(3, 2))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3)*vel_L(2) + vel_grad_R(2, 3)*vel_R(2)) - 0.5_wp*(vel_grad_L(3, 2)*vel_L(2) + vel_grad_R(3, 2)*vel_R(2))
+                                #:endif
+                            end if
+                        end if
+
+                        if (bulk_stress) then
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_dims
+                                vel_grad_L(i, 1) = (dqL_prim_dx_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
+                                vel_grad_R(i, 1) = (dqR_prim_dx_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                                    if (num_dims > 1) then
+                                        vel_grad_L(i, 2) = (dqL_prim_dy_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
+                                        vel_grad_R(i, 2) = (dqR_prim_dy_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
+                                    end if
+                                #:endif
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                    if (num_dims > 2) then
+                                        vel_grad_L(i, 3) = (dqL_prim_dz_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
+                                        vel_grad_R(i, 3) = (dqR_prim_dz_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
+                                    end if
+                                #:endif
+                            end do
+
+                            if (norm_dir == 1) then
+                                flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
+                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(1) + vel_grad_R(1, 1)*vel_R(1))
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                                    if (num_dims > 1) then
+                                        flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
+                                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(1) + vel_grad_R(2, 2)*vel_R(1))
+
+                                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                            if (num_dims > 2) then
+                                                flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
+                                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(1) + vel_grad_R(3, 3)*vel_R(1))
+                                            end if
+                                        #:endif
+                                    end if
+                                #:endif
+
+                            else if (norm_dir == 2) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(2) + vel_grad_R(1, 1)*vel_R(2))
+
+                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(2) + vel_grad_R(2, 2)*vel_R(2))
+
+                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                        if (num_dims > 2) then
+                                            flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
+                                            flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(2) + vel_grad_R(3, 3)*vel_R(2))
+                                        end if
+                                    #:endif
+                                #:endif
+                            else
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(3) + vel_grad_R(1, 1)*vel_R(3))
+
+                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(3) + vel_grad_R(2, 2)*vel_R(3))
+
+                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(3) + vel_grad_R(3, 3)*vel_R(3))
+                                #:endif
+                            end if
+
+                        end if
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+        end if
+
+        call s_finalize_riemann_solver(flux_vf, flux_src_vf, &
+                                       flux_gsrc_vf, &
+                                       norm_dir)
+
+    end subroutine s_lf_riemann_solver
+
+    !> This procedure is the implementation of the Harten, Lax,
+        !!      van Leer, and contact (HLLC) approximate Riemann solver,
+        !!      see Toro (1999) and Johnsen (2007). The viscous and the
+        !!      surface tension effects have been included by modifying
+        !!      the exact Riemann solver of Perigaud and Saurel (2005).
+        !!  @param qL_prim_rsx_vf Left WENO-reconstructed cell-boundary values (x-dir)
+        !!  @param qL_prim_rsy_vf Left WENO-reconstructed cell-boundary values (y-dir)
+        !!  @param qL_prim_rsz_vf Left WENO-reconstructed cell-boundary values (z-dir)
+        !!  @param dqL_prim_dx_vf The left WENO-reconstructed cell-boundary values of the
+        !!      first-order x-dir spatial derivatives
+        !!  @param dqL_prim_dy_vf The left WENO-reconstructed cell-boundary values of the
+        !!      first-order y-dir spatial derivatives
+        !!  @param dqL_prim_dz_vf The left WENO-reconstructed cell-boundary values of the
+        !!      first-order z-dir spatial derivatives
+        !!  @param qL_prim_vf The left WENO-reconstructed cell-boundary values of the
+        !!      cell-average primitive variables
+        !!  @param qR_prim_rsx_vf Right WENO-reconstructed cell-boundary values (x-dir)
+        !!  @param qR_prim_rsy_vf Right WENO-reconstructed cell-boundary values (y-dir)
+        !!  @param qR_prim_rsz_vf Right WENO-reconstructed cell-boundary values (z-dir)
+        !!  @param dqR_prim_dx_vf The right WENO-reconstructed cell-boundary values of the
+        !!      first-order x-dir spatial derivatives
+        !!  @param dqR_prim_dy_vf The right WENO-reconstructed cell-boundary values of the
+        !!      first-order y-dir spatial derivatives
+        !!  @param dqR_prim_dz_vf The right WENO-reconstructed cell-boundary values of the
+        !!      first-order z-dir spatial derivatives
+        !!  @param qR_prim_vf The right WENO-reconstructed cell-boundary values of the
+        !!      cell-average primitive variables
+        !!  @param q_prim_vf Cell-averaged primitive variables
+        !!  @param flux_vf Intra-cell fluxes
+        !!  @param flux_src_vf Intra-cell fluxes sources
+        !!  @param flux_gsrc_vf Intra-cell geometric fluxes sources
+        !!  @param norm_dir Dir. splitting direction
+        !!  @param ix Index bounds in the x-dir
+        !!  @param iy Index bounds in the y-dir
+        !!  @param iz Index bounds in the z-dir
+    subroutine s_hllc_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
+                                     dqL_prim_dy_vf, &
+                                     dqL_prim_dz_vf, &
+                                     qL_prim_vf, &
+                                     qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
+                                     dqR_prim_dy_vf, &
+                                     dqR_prim_dz_vf, &
+                                     qR_prim_vf, &
+                                     q_prim_vf, &
+                                     flux_vf, flux_src_vf, &
+                                     flux_gsrc_vf, &
+                                     norm_dir, ix, iy, iz)
+
+        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
+        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
+        type(scalar_field), allocatable, dimension(:), intent(inout) :: qL_prim_vf, qR_prim_vf
+
+        type(scalar_field), &
+            allocatable, dimension(:), &
+            intent(inout) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
+                             dqL_prim_dy_vf, dqR_prim_dy_vf, &
+                             dqL_prim_dz_vf, dqR_prim_dz_vf
+
+        ! Intercell fluxes
+        type(scalar_field), &
+            dimension(sys_size), &
+            intent(inout) :: flux_vf, flux_src_vf, flux_gsrc_vf
+
+        integer, intent(in) :: norm_dir
+        type(int_bounds_info), intent(in) :: ix, iy, iz
+
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
+            real(wp), dimension(3) :: alpha_L, alpha_R
+            real(wp), dimension(3) :: vel_L, vel_R
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
+            real(wp), dimension(num_fluids) :: alpha_L, alpha_R
+            real(wp), dimension(num_dims) :: vel_L, vel_R
+        #:endif
+
+        real(wp) :: rho_L, rho_R
+        real(wp) :: pres_L, pres_R
+        real(wp) :: E_L, E_R
+        real(wp) :: H_L, H_R
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(10) :: Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR
+            real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+        #:else
+            real(wp), dimension(num_species) :: Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR
+            real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+        #:endif
+        real(wp) :: Cp_avg, Cv_avg, T_avg, c_sum_Yi_Phi, eps
+        real(wp) :: T_L, T_R
+        real(wp) :: MW_L, MW_R
+        real(wp) :: R_gas_L, R_gas_R
+        real(wp) :: Cp_L, Cp_R
+        real(wp) :: Cv_L, Cv_R
+        real(wp) :: Gamm_L, Gamm_R
+        real(wp) :: Y_L, Y_R
+        real(wp) :: gamma_L, gamma_R
+        real(wp) :: pi_inf_L, pi_inf_R
+        real(wp) :: qv_L, qv_R
+        real(wp) :: c_L, c_R
+        real(wp), dimension(2) :: Re_L, Re_R
+        ! Non-Newtonian per-phase Re arrays
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3, 2) :: Re_visc_per_phase_L, Re_visc_per_phase_R
+        #:else
+            real(wp), dimension(num_fluids, 2) :: Re_visc_per_phase_L, Re_visc_per_phase_R
+        #:endif
+
+        real(wp) :: rho_avg
+        real(wp) :: H_avg
+        real(wp) :: gamma_avg
+        real(wp) :: qv_avg
+        real(wp) :: c_avg
+
+        real(wp) :: s_L, s_R, s_M, s_P, s_S
+        real(wp) :: xi_L, xi_R !< Left and right wave speeds functions
+        real(wp) :: xi_M, xi_P
+        real(wp) :: xi_MP, xi_PP
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: R0_L, R0_R
+            real(wp), dimension(3) :: V0_L, V0_R
+            real(wp), dimension(3) :: P0_L, P0_R
+            real(wp), dimension(3) :: pbw_L, pbw_R
+        #:else
+            real(wp), dimension(nb) :: R0_L, R0_R
+            real(wp), dimension(nb) :: V0_L, V0_R
+            real(wp), dimension(nb) :: P0_L, P0_R
+            real(wp), dimension(nb) :: pbw_L, pbw_R
+        #:endif
+
+        real(wp) :: alpha_L_sum, alpha_R_sum, nbub_L, nbub_R
+        real(wp) :: ptilde_L, ptilde_R
+
+        real(wp) :: PbwR3Lbar, PbwR3Rbar
+        real(wp) :: R3Lbar, R3Rbar
+        real(wp) :: R3V2Lbar, R3V2Rbar
+
+        real(wp), dimension(6) :: tau_e_L, tau_e_R
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: xi_field_L, xi_field_R
+        #:else
+            real(wp), dimension(num_dims) :: xi_field_L, xi_field_R
+        #:endif
+        real(wp) :: G_L, G_R
+
+        real(wp) :: vel_L_rms, vel_R_rms, vel_avg_rms
+        real(wp) :: vel_L_tmp, vel_R_tmp
+        real(wp) :: rho_Star, E_Star, p_Star, p_K_Star, vel_K_star
+        real(wp) :: pres_SL, pres_SR, Ms_L, Ms_R
+        real(wp) :: flux_ene_e
+        real(wp) :: zcoef, pcorr !< low Mach number correction
+
+        integer :: Re_max, i, j, k, l, q !< Generic loop iterators
+
+        ! Populating the buffers of the left and right Riemann problem
+        ! states variables, based on the choice of boundary conditions
+
+        call s_populate_riemann_states_variables_buffers( &
+            qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
+            dqL_prim_dy_vf, &
+            dqL_prim_dz_vf, &
+            qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
+            dqR_prim_dy_vf, &
+            dqR_prim_dz_vf, &
+            norm_dir, ix, iy, iz)
+
+        ! Reshaping inputted data based on dimensional splitting direction
+
+        call s_initialize_riemann_solver( &
+            flux_src_vf, &
+            norm_dir)
+
+        #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
+
+            if (norm_dir == ${NORM_DIR}$) then
+
+                ! 6-EQUATION MODEL WITH HLLC
+                if (model_eqns == 3) then
+                    !ME3
+                    $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l, q, vel_L, vel_R, Re_L, Re_R, alpha_L, alpha_R, Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, tau_e_L, tau_e_R, flux_ene_e, xi_field_L, xi_field_R, pcorr, zcoef, rho_L, rho_R, pres_L, pres_R, E_L, E_R, H_L, H_R, Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi, T_L, T_R, Y_L, Y_R, MW_L, MW_R, R_gas_L, R_gas_R, Cp_L, Cp_R, Cv_L, Cv_R, Gamm_L, Gamm_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, qv_L, qv_R, qv_avg, c_L, c_R, G_L, G_R, rho_avg, H_avg, c_avg, gamma_avg, ptilde_L, ptilde_R, vel_L_rms, vel_R_rms, vel_avg_rms, vel_L_tmp, vel_R_tmp, Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, alpha_R_sum, rho_Star, E_Star, p_Star, p_K_Star, vel_K_star, s_L, s_R, s_M, s_P, s_S, xi_M, xi_P, xi_L, xi_R, xi_MP, xi_PP, Re_visc_per_phase_L, Re_visc_per_phase_R]')
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            do j = is1%beg, is1%end
+
+                                vel_L_rms = 0._wp; vel_R_rms = 0._wp
+                                rho_L = 0._wp; rho_R = 0._wp
+                                gamma_L = 0._wp; gamma_R = 0._wp
+                                pi_inf_L = 0._wp; pi_inf_R = 0._wp
+                                qv_L = 0._wp; qv_R = 0._wp
+                                alpha_L_sum = 0._wp; alpha_R_sum = 0._wp
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
+                                    vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
+                                    vel_L_rms = vel_L_rms + vel_L(i)**2._wp
+                                    vel_R_rms = vel_R_rms + vel_R(i)**2._wp
+                                end do
+
+                                pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
+                                pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
+
+                                rho_L = 0._wp
+                                gamma_L = 0._wp
+                                pi_inf_L = 0._wp
+                                qv_L = 0._wp
+
+                                rho_R = 0._wp
+                                gamma_R = 0._wp
+                                pi_inf_R = 0._wp
+                                qv_R = 0._wp
+
+                                alpha_L_sum = 0._wp
+                                alpha_R_sum = 0._wp
+
+                                if (mpp_lim) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i))
+                                        qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp)
+                                        alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
+                                    end do
+
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i))
+                                        qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp)
+                                        alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
+                                    end do
+
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps)
+                                        qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps)
+                                    end do
+                                end if
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                    gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
+                                    pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i)
+                                    qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i)
+
+                                    rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+                                    gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
+                                    pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i)
+                                    qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i)
+
+                                    alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, advxb + i - 1)
+                                    alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, advxb + i - 1)
+                                end do
+
+                                if (viscous) then
+                                    ! Map rotated (j,k,l) to physical (x,y,z) indices
+                                    #:if NORM_DIR == 1
+                                        call s_compute_re_visc(q_prim_vf, &
+                                            alpha_L, j, k, l, Re_visc_per_phase_L)
+                                        call s_compute_re_visc(q_prim_vf, &
+                                            alpha_R, j + 1, k, l, Re_visc_per_phase_R)
+                                    #:elif NORM_DIR == 2
+                                        call s_compute_re_visc(q_prim_vf, &
+                                            alpha_L, k, j, l, Re_visc_per_phase_L)
+                                        call s_compute_re_visc(q_prim_vf, &
+                                            alpha_R, k, j + 1, l, Re_visc_per_phase_R)
+                                    #:else
+                                        call s_compute_re_visc(q_prim_vf, &
+                                            alpha_L, l, k, j, Re_visc_per_phase_L)
+                                        call s_compute_re_visc(q_prim_vf, &
+                                            alpha_R, l, k, j + 1, Re_visc_per_phase_R)
+                                    #:endif
+                                    call s_compute_mixture_re( &
+                                        alpha_L, Re_visc_per_phase_L, Re_L)
+                                    call s_compute_mixture_re( &
+                                        alpha_R, Re_visc_per_phase_R, Re_R)
+                                end if
+
+                                E_L = gamma_L*pres_L + pi_inf_L + 5.e-1_wp*rho_L*vel_L_rms + qv_L
+                                E_R = gamma_R*pres_R + pi_inf_R + 5.e-1_wp*rho_R*vel_R_rms + qv_R
+
+                                ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY
+                                if (hypoelasticity) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, strxe - strxb + 1
+                                        tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
+                                        tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
+                                    end do
+                                    G_L = 0._wp; G_R = 0._wp
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        G_L = G_L + alpha_L(i)*Gs_rs(i)
+                                        G_R = G_R + alpha_R(i)*Gs_rs(i)
+                                    end do
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, strxe - strxb + 1
+                                        ! Elastic contribution to energy if G large enough
+                                        if ((G_L > verysmall) .and. (G_R > verysmall)) then
+                                            E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
+                                            E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
+                                            ! Additional terms in 2D and 3D
+                                            if ((i == 2) .or. (i == 4) .or. (i == 5)) then
+                                                E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
+                                                E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
+                                            end if
+                                        end if
+                                    end do
+                                end if
+
+                                ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY
+                                if (hyperelasticity) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_dims
+                                        xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
+                                        xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
+                                    end do
+                                    G_L = 0._wp; G_R = 0._wp; 
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        ! Mixture left and right shear modulus
+                                        G_L = G_L + alpha_L(i)*Gs_rs(i)
+                                        G_R = G_R + alpha_R(i)*Gs_rs(i)
+                                    end do
+                                    ! Elastic contribution to energy if G large enough
+                                    if (G_L > verysmall .and. G_R > verysmall) then
+                                        E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
+                                        E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
+                                    end if
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, b_size - 1
+                                        tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
+                                        tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
+                                    end do
+                                end if
+
+                                H_L = (E_L + pres_L)/rho_L
+                                H_R = (E_R + pres_R)/rho_R
+
+                                @:compute_average_state()
+
+                                call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, &
+                                                              vel_L_rms, 0._wp, c_L, qv_L)
+
+                                call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, &
+                                                              vel_R_rms, 0._wp, c_R, qv_R)
+
+                                !> The computation of c_avg does not require all the variables, and therefore the non '_avg'
+                                ! variables are placeholders to call the subroutine.
+                                call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, &
+                                                              vel_avg_rms, 0._wp, c_avg, qv_avg)
+
+                                if (viscous) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, 2
+                                        Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
+                                    end do
+                                end if
+
+                                ! Low Mach correction
+                                if (low_Mach == 2) then
+                                    @:compute_low_Mach_correction()
+                                end if
+
+                                ! COMPUTING THE DIRECT WAVE SPEEDS
+                                if (wave_speeds == 1) then
+                                    if (elasticity) then
+                                        s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + &
+                                                                           (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L), vel_R(dir_idx(1)) - sqrt(c_R*c_R + &
+                                                                                                                                                            (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R))
+                                        s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + &
+                                                                           (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R), vel_L(dir_idx(1)) + sqrt(c_L*c_L + &
+                                                                                                                                                            (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L))
+                                        s_S = (pres_R - tau_e_R(dir_idx_tau(1)) - pres_L + &
+                                               tau_e_L(dir_idx_tau(1)) + rho_L*vel_L(dir_idx(1))*(s_L - vel_L(dir_idx(1))) - &
+                                               rho_R*vel_R(dir_idx(1))*(s_R - vel_R(dir_idx(1))))/(rho_L*(s_L - vel_L(dir_idx(1))) - &
+                                                                                                   rho_R*(s_R - vel_R(dir_idx(1))))
+                                    else
+                                        s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R)
+                                        s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L)
+                                        s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* &
+                                               (s_L - vel_L(dir_idx(1))) - rho_R*vel_R(dir_idx(1))*(s_R - vel_R(dir_idx(1)))) &
+                                              /(rho_L*(s_L - vel_L(dir_idx(1))) - rho_R*(s_R - vel_R(dir_idx(1))))
+
+                                    end if
+                                elseif (wave_speeds == 2) then
+                                    pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* &
+                                                        (vel_L(dir_idx(1)) - &
+                                                         vel_R(dir_idx(1))))
+
+                                    pres_SR = pres_SL
+
+                                    Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* &
+                                                           (pres_SL/pres_L - 1._wp)*pres_L/ &
+                                                           ((pres_L + pi_inf_L/(1._wp + gamma_L)))))
+                                    Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* &
+                                                           (pres_SR/pres_R - 1._wp)*pres_R/ &
+                                                           ((pres_R + pi_inf_R/(1._wp + gamma_R)))))
+
+                                    s_L = vel_L(dir_idx(1)) - c_L*Ms_L
+                                    s_R = vel_R(dir_idx(1)) + c_R*Ms_R
+
+                                    s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + &
+                                                    (pres_L - pres_R)/ &
+                                                    (rho_avg*c_avg))
+                                end if
+
+                                ! follows Einfeldt et al.
+                                ! s_M/P = min/max(0.,s_L/R)
+                                s_M = min(0._wp, s_L); s_P = max(0._wp, s_R)
+
+                                ! goes with q_star_L/R = xi_L/R * (variable)
+                                ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) )
+                                xi_L = (s_L - vel_L(dir_idx(1)))/(s_L - s_S)
+                                xi_R = (s_R - vel_R(dir_idx(1)))/(s_R - s_S)
+
+                                ! goes with numerical star velocity in x/y/z directions
+                                ! xi_P/M = 0.5 +/m sgn(0.5,s_star)
+                                xi_M = (5.e-1_wp + sign(0.5_wp, s_S))
+                                xi_P = (5.e-1_wp - sign(0.5_wp, s_S))
+
+                                ! goes with the numerical velocity in x/y/z directions
+                                ! xi_P/M (pressure) = min/max(0. sgn(1,sL/sR))
+                                xi_MP = -min(0._wp, sign(1._wp, s_L))
+                                xi_PP = max(0._wp, sign(1._wp, s_R))
+
+                                E_star = xi_M*(E_L + xi_MP*(xi_L*(E_L + (s_S - vel_L(dir_idx(1)))* &
+                                                                  (rho_L*s_S + pres_L/(s_L - vel_L(dir_idx(1))))) - E_L)) + &
+                                         xi_P*(E_R + xi_PP*(xi_R*(E_R + (s_S - vel_R(dir_idx(1)))* &
+                                                                  (rho_R*s_S + pres_R/(s_R - vel_R(dir_idx(1))))) - E_R))
+                                p_Star = xi_M*(pres_L + xi_MP*(rho_L*(s_L - vel_L(dir_idx(1)))*(s_S - vel_L(dir_idx(1))))) + &
+                                         xi_P*(pres_R + xi_PP*(rho_R*(s_R - vel_R(dir_idx(1)))*(s_S - vel_R(dir_idx(1)))))
+
+                                rho_Star = xi_M*(rho_L*(xi_MP*xi_L + 1._wp - xi_MP)) + &
+                                           xi_P*(rho_R*(xi_PP*xi_R + 1._wp - xi_PP))
+
+                                vel_K_Star = vel_L(dir_idx(1))*(1._wp - xi_MP) + xi_MP*vel_R(dir_idx(1)) + &
+                                             xi_MP*xi_PP*(s_S - vel_R(dir_idx(1)))
+
+                                ! Low Mach correction
+                                if (low_Mach == 1) then
+                                    @:compute_low_Mach_correction()
+                                else
+                                    pcorr = 0._wp
+                                end if
+
+                                ! COMPUTING FLUXES
+                                ! MASS FLUX.
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, contxe
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) + &
+                                        xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                end do
+
+                                ! MOMENTUM FLUX.
+                                ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = rho_Star*vel_K_Star* &
+                                                                                      (dir_flg(dir_idx(i))*vel_K_Star + (1._wp - dir_flg(dir_idx(i)))*(xi_M*vel_L(dir_idx(i)) + xi_P*vel_R(dir_idx(i)))) + dir_flg(dir_idx(i))*p_Star &
+                                                                                      + (s_M/s_L)*(s_P/s_R)*dir_flg(dir_idx(i))*pcorr
+                                end do
+
+                                ! ENERGY FLUX.
+                                ! f = u*(E-\sigma), q = E, q_star = \xi*E+(s-u)(\rho s_star - \sigma/(s-u))
+                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = (E_star + p_Star)*vel_K_Star &
+                                                                    + (s_M/s_L)*(s_P/s_R)*pcorr*s_S
+
+                                ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux
+                                if (elasticity) then
+                                    flux_ene_e = 0._wp; 
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_dims
+                                        ! MOMENTUM ELASTIC FLUX.
+                                        flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
+                                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) &
+                                            - xi_M*tau_e_L(dir_idx_tau(i)) - xi_P*tau_e_R(dir_idx_tau(i))
+                                        ! ENERGY ELASTIC FLUX.
+                                        flux_ene_e = flux_ene_e - &
+                                                     xi_M*(vel_L(dir_idx(i))*tau_e_L(dir_idx_tau(i)) + &
+                                                           s_M*(xi_L*((s_S - vel_L(i))*(tau_e_L(dir_idx_tau(i))/(s_L - vel_L(i)))))) - &
+                                                     xi_P*(vel_R(dir_idx(i))*tau_e_R(dir_idx_tau(i)) + &
+                                                           s_P*(xi_R*((s_S - vel_R(i))*(tau_e_R(dir_idx_tau(i))/(s_R - vel_R(i))))))
+                                    end do
+                                    flux_rs${XYZ}$_vf(j, k, l, E_idx) = flux_rs${XYZ}$_vf(j, k, l, E_idx) + flux_ene_e
+                                end if
+
+                                ! VOLUME FRACTION FLUX.
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = advxb, advxe
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*s_S + &
+                                        xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*s_S
+                                end do
+
+                                ! SOURCE TERM FOR VOLUME FRACTION ADVECTION FLUX.
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = &
+                                        xi_M*(vel_L(dir_idx(i)) + dir_flg(dir_idx(i))*(s_S*(xi_MP*(xi_L - 1) + 1) - vel_L(dir_idx(i)))) + &
+                                        xi_P*(vel_R(dir_idx(i)) + dir_flg(dir_idx(i))*(s_S*(xi_PP*(xi_R - 1) + 1) - vel_R(dir_idx(i))))
+                                end do
+
+                                ! INTERNAL ENERGIES ADVECTION FLUX.
+                                ! K-th pressure and velocity in preparation for the internal energy flux
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    p_K_Star = xi_M*(xi_MP*((pres_L + pi_infs(i)/(1._wp + gammas(i)))* &
+                                                            xi_L**(1._wp/gammas(i) + 1._wp) - pi_infs(i)/(1._wp + gammas(i)) - pres_L) + pres_L) + &
+                                               xi_P*(xi_PP*((pres_R + pi_infs(i)/(1._wp + gammas(i)))* &
+                                                            xi_R**(1._wp/gammas(i) + 1._wp) - pi_infs(i)/(1._wp + gammas(i)) - pres_R) + pres_R)
+
+                                    flux_rs${XYZ}$_vf(j, k, l, i + intxb - 1) = &
+                                        ((xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i + advxb - 1) + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i + advxb - 1))* &
+                                         (gammas(i)*p_K_Star + pi_infs(i)) + &
+                                         (xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i + contxb - 1) + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i + contxb - 1))* &
+                                         qvs(i))*vel_K_Star &
+                                        + (s_M/s_L)*(s_P/s_R)*pcorr*s_S*(xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i + advxb - 1) + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i + advxb - 1))
+                                end do
+
+                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1))
+
+                                ! HYPOELASTIC STRESS EVOLUTION FLUX.
+                                if (hypoelasticity) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, strxe - strxb + 1
+                                        flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
+                                            xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(dir_idx(1))*tau_e_L(i)) + &
+                                            xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*tau_e_R(i) - rho_R*vel_R(dir_idx(1))*tau_e_R(i))
+                                    end do
+                                end if
+
+                                ! REFERENCE MAP FLUX.
+                                if (hyperelasticity) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_dims
+                                        flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = &
+                                            xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) &
+                                                                    - rho_L*vel_L(dir_idx(1))*xi_field_L(i)) + &
+                                            xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*xi_field_R(i) &
+                                                                    - rho_R*vel_R(dir_idx(1))*xi_field_R(i))
+                                    end do
+                                end if
+
+                                ! COLOR FUNCTION FLUX
+                                if (surface_tension) then
+                                    flux_rs${XYZ}$_vf(j, k, l, c_idx) = &
+                                        (xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, c_idx) + &
+                                         xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, c_idx))*s_S
+                                end if
+
+                                ! Geometrical source flux for cylindrical coordinates
+                                #:if (NORM_DIR == 2)
+                                    if (cyl_coord) then
+                                        !Substituting the advective flux into the inviscid geometrical source flux
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, E_idx
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
+                                        end do
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = intxb, intxe
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
+                                        end do
+                                        ! Recalculating the radial momentum geometric source flux
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) = &
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) - p_Star
+                                        ! Geometrical source of the void fraction(s) is zero
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = advxb, advxe
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
+                                        end do
+                                    end if
+                                #:endif
+                                #:if (NORM_DIR == 3)
+                                    if (grid_geometry == 3) then
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, sys_size
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
+                                        end do
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) = &
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) - p_Star
+
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1)
+                                    end if
+                                #:endif
+
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                elseif (model_eqns == 4) then
+                    !ME4
+                    $:GPU_PARALLEL_LOOP(collapse=3, private='[i, q, alpha_rho_L, alpha_rho_R, vel_L, vel_R, alpha_L, alpha_R, nbub_L, nbub_R, rho_L, rho_R, pres_L, pres_R, E_L, E_R, H_L, H_R, Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi, T_L, T_R, Y_L, Y_R, MW_L, MW_R, R_gas_L, R_gas_R, Cp_L, Cp_R, Gamm_L, Gamm_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, qv_L, qv_R, qv_avg,c_L, c_R, G_L, G_R, rho_avg, H_avg, c_avg, gamma_avg, ptilde_L, ptilde_R, vel_L_rms, vel_R_rms, vel_avg_rms, vel_L_tmp, vel_R_tmp, Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, alpha_R_sum, rho_Star, E_Star, p_Star, p_K_Star, vel_K_star, s_L, s_R, s_M, s_P, s_S, xi_M, xi_P, xi_L, xi_R, xi_MP, xi_PP, Re_visc_per_phase_L, Re_visc_per_phase_R]')
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            do j = is1%beg, is1%end
+
+                                vel_L_rms = 0._wp; vel_R_rms = 0._wp
+                                rho_L = 0._wp; rho_R = 0._wp
+                                gamma_L = 0._wp; gamma_R = 0._wp
+                                pi_inf_L = 0._wp; pi_inf_R = 0._wp
+                                qv_L = 0._wp; qv_R = 0._wp
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, contxe
+                                    alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                    alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+                                end do
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
+                                    vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
+                                    vel_L_rms = vel_L_rms + vel_L(i)**2._wp
+                                    vel_R_rms = vel_R_rms + vel_R(i)**2._wp
+                                end do
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
+                                    alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
+                                end do
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
+                                    alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
+                                end do
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    rho_L = rho_L + alpha_rho_L(i)
+                                    gamma_L = gamma_L + alpha_L(i)*gammas(i)
+                                    pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i)
+                                    qv_L = qv_L + alpha_rho_L(i)*qvs(i)
+
+                                    rho_R = rho_R + alpha_rho_R(i)
+                                    gamma_R = gamma_R + alpha_R(i)*gammas(i)
+                                    pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i)
+                                    qv_R = qv_R + alpha_rho_R(i)*qvs(i)
+                                end do
+
+                                pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
+                                pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
+
+                                E_L = gamma_L*pres_L + pi_inf_L + 5.e-1_wp*rho_L*vel_L_rms + qv_L
+                                E_R = gamma_R*pres_R + pi_inf_R + 5.e-1_wp*rho_R*vel_R_rms + qv_R
+
+                                H_L = (E_L + pres_L)/rho_L
+                                H_R = (E_R + pres_R)/rho_R
+
+                                @:compute_average_state()
+
+                                call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, &
+                                                              vel_L_rms, 0._wp, c_L, qv_L)
+
+                                call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, &
+                                                              vel_R_rms, 0._wp, c_R, qv_R)
+
+                                !> The computation of c_avg does not require all the variables, and therefore the non '_avg'
+                                ! variables are placeholders to call the subroutine.
+
+                                call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, &
+                                                              vel_avg_rms, 0._wp, c_avg, qv_avg)
+
+                                if (wave_speeds == 1) then
+                                    s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R)
+                                    s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L)
+
+                                    s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* &
+                                           (s_L - vel_L(dir_idx(1))) - &
+                                           rho_R*vel_R(dir_idx(1))* &
+                                           (s_R - vel_R(dir_idx(1)))) &
+                                          /(rho_L*(s_L - vel_L(dir_idx(1))) - &
+                                            rho_R*(s_R - vel_R(dir_idx(1))))
+                                elseif (wave_speeds == 2) then
+                                    pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* &
+                                                        (vel_L(dir_idx(1)) - &
+                                                         vel_R(dir_idx(1))))
+
+                                    pres_SR = pres_SL
+
+                                    Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* &
+                                                           (pres_SL/pres_L - 1._wp)*pres_L/ &
+                                                           ((pres_L + pi_inf_L/(1._wp + gamma_L)))))
+                                    Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* &
+                                                           (pres_SR/pres_R - 1._wp)*pres_R/ &
+                                                           ((pres_R + pi_inf_R/(1._wp + gamma_R)))))
+
+                                    s_L = vel_L(dir_idx(1)) - c_L*Ms_L
+                                    s_R = vel_R(dir_idx(1)) + c_R*Ms_R
+
+                                    s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + &
+                                                    (pres_L - pres_R)/ &
+                                                    (rho_avg*c_avg))
+                                end if
+
+                                ! follows Einfeldt et al.
+                                ! s_M/P = min/max(0.,s_L/R)
+                                s_M = min(0._wp, s_L); s_P = max(0._wp, s_R)
+
+                                ! goes with q_star_L/R = xi_L/R * (variable)
+                                ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) )
+                                xi_L = (s_L - vel_L(dir_idx(1)))/(s_L - s_S)
+                                xi_R = (s_R - vel_R(dir_idx(1)))/(s_R - s_S)
+
+                                ! goes with numerical velocity in x/y/z directions
+                                ! xi_P/M = 0.5 +/m sgn(0.5,s_star)
+                                xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S))
+                                xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S))
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, contxe
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        xi_M*alpha_rho_L(i) &
+                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
+                                        + xi_P*alpha_rho_R(i) &
+                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                end do
+
+                                ! Momentum flux.
+                                ! f = \rho u u + p I, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
+                                        xi_M*(rho_L*(vel_L(dir_idx(1))* &
+                                                     vel_L(dir_idx(i)) + &
+                                                     s_M*(xi_L*(dir_flg(dir_idx(i))*s_S + &
+                                                                (1._wp - dir_flg(dir_idx(i)))* &
+                                                                vel_L(dir_idx(i))) - vel_L(dir_idx(i)))) + &
+                                              dir_flg(dir_idx(i))*pres_L) &
+                                        + xi_P*(rho_R*(vel_R(dir_idx(1))* &
+                                                       vel_R(dir_idx(i)) + &
+                                                       s_P*(xi_R*(dir_flg(dir_idx(i))*s_S + &
+                                                                  (1._wp - dir_flg(dir_idx(i)))* &
+                                                                  vel_R(dir_idx(i))) - vel_R(dir_idx(i)))) + &
+                                                dir_flg(dir_idx(i))*pres_R)
+                                end do
+
+                                if (bubbles_euler) then
+                                    ! Put p_tilde in
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_dims
+                                        flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
+                                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) + &
+                                            xi_M*(dir_flg(dir_idx(i))*(-1._wp*ptilde_L)) &
+                                            + xi_P*(dir_flg(dir_idx(i))*(-1._wp*ptilde_R))
+                                    end do
+                                end if
+
+                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = 0._wp
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = alf_idx, alf_idx !only advect the void fraction
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
+                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
+                                        + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
+                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                end do
+
+                                ! Source for volume fraction advection equation
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+
+                                    vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = 0._wp
+                                    !IF ( (model_eqns == 4) .or. (num_fluids==1) ) vel_src_rs_vf(dir_idx(i))%sf(j,k,l) = 0._wp
+                                end do
+
+                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1))
+
+                                ! Add advection flux for bubble variables
+                                if (bubbles_euler) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = bubxb, bubxe
+                                        flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                            xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
+                                            *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
+                                            + xi_P*nbub_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
+                                            *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                    end do
+                                end if
+
+                                ! Geometrical source flux for cylindrical coordinates
+
+                                #:if (NORM_DIR == 2)
+                                    if (cyl_coord) then
+                                        ! Substituting the advective flux into the inviscid geometrical source flux
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, E_idx
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
+                                        end do
+                                        ! Recalculating the radial momentum geometric source flux
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = &
+                                            xi_M*(rho_L*(vel_L(dir_idx(1))* &
+                                                         vel_L(dir_idx(1)) + &
+                                                         s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + &
+                                                                    (1._wp - dir_flg(dir_idx(1)))* &
+                                                                    vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) &
+                                            + xi_P*(rho_R*(vel_R(dir_idx(1))* &
+                                                           vel_R(dir_idx(1)) + &
+                                                           s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + &
+                                                                      (1._wp - dir_flg(dir_idx(1)))* &
+                                                                      vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
+                                        ! Geometrical source of the void fraction(s) is zero
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = advxb, advxe
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
+                                        end do
+                                    end if
+                                #:endif
+                                #:if (NORM_DIR == 3)
+                                    if (grid_geometry == 3) then
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, sys_size
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
+                                        end do
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb + 1) = &
+                                            -xi_M*(rho_L*(vel_L(dir_idx(1))* &
+                                                          vel_L(dir_idx(1)) + &
+                                                          s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + &
+                                                                     (1._wp - dir_flg(dir_idx(1)))* &
+                                                                     vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) &
+                                            - xi_P*(rho_R*(vel_R(dir_idx(1))* &
+                                                           vel_R(dir_idx(1)) + &
+                                                           s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + &
+                                                                      (1._wp - dir_flg(dir_idx(1)))* &
+                                                                      vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1)
+                                    end if
+                                #:endif
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                elseif (model_eqns == 2 .and. bubbles_euler) then
+                    $:GPU_PARALLEL_LOOP(collapse=3, private='[i, q, R0_L, R0_R, V0_L, V0_R, P0_L, P0_R, pbw_L, pbw_R, vel_L, vel_R, rho_avg, alpha_L, alpha_R, h_avg, gamma_avg, Re_L, Re_R, pcorr, zcoef, rho_L, rho_R, pres_L, pres_R, E_L, E_R, H_L, H_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, qv_L, qv_R, qv_avg, c_L, c_R, c_avg, vel_L_rms, vel_R_rms, vel_avg_rms, vel_L_tmp, vel_R_tmp, Ms_L, Ms_R, pres_SL, pres_SR, alpha_L_sum, alpha_R_sum, s_L, s_R, s_M, s_P, s_S, xi_M, xi_P, xi_L, xi_R, xi_MP, xi_PP, nbub_L, nbub_R, PbwR3Lbar, PbwR3Rbar, R3Lbar, R3Rbar, R3V2Lbar, R3V2Rbar, Re_visc_per_phase_L, Re_visc_per_phase_R]')
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            do j = is1%beg, is1%end
+
+                                vel_L_rms = 0._wp; vel_R_rms = 0._wp
+                                rho_L = 0._wp; rho_R = 0._wp
+                                gamma_L = 0._wp; gamma_R = 0._wp
+                                pi_inf_L = 0._wp; pi_inf_R = 0._wp
+                                qv_L = 0._wp; qv_R = 0._wp
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
+                                    alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
+                                end do
+
+                                vel_L_rms = 0._wp; vel_R_rms = 0._wp
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
+                                    vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
+                                    vel_L_rms = vel_L_rms + vel_L(i)**2._wp
+                                    vel_R_rms = vel_R_rms + vel_R(i)**2._wp
+                                end do
+
+                                ! Retain this in the refactor
+                                if (mpp_lim .and. (num_fluids > 2)) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                        gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
+                                        pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i)
+                                        qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i)
+                                        rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+                                        gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
+                                        pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i)
+                                        qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i)
+                                    end do
+                                else if (num_fluids > 2) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids - 1
+                                        rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                        gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
+                                        pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i)
+                                        qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i)
+                                        rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+                                        gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
+                                        pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i)
+                                        qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i)
+                                    end do
+                                else
+                                    rho_L = qL_prim_rs${XYZ}$_vf(j, k, l, 1)
+                                    gamma_L = gammas(1)
+                                    pi_inf_L = pi_infs(1)
+                                    qv_L = qvs(1)
+                                    rho_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, 1)
+                                    gamma_R = gammas(1)
+                                    pi_inf_R = pi_infs(1)
+                                    qv_R = qvs(1)
+                                end if
+
+                                if (viscous) then
+                                    if (num_fluids == 1) then ! Need to consider case with num_fluids >= 2
+                                        ! Map rotated (j,k,l) to physical (x,y,z) indices
+                                        #:if NORM_DIR == 1
+                                            call s_compute_re_visc(q_prim_vf, &
+                                                alpha_L, j, k, l, Re_visc_per_phase_L)
+                                            call s_compute_re_visc(q_prim_vf, &
+                                                alpha_R, j + 1, k, l, Re_visc_per_phase_R)
+                                        #:elif NORM_DIR == 2
+                                            call s_compute_re_visc(q_prim_vf, &
+                                                alpha_L, k, j, l, Re_visc_per_phase_L)
+                                            call s_compute_re_visc(q_prim_vf, &
+                                                alpha_R, k, j + 1, l, Re_visc_per_phase_R)
+                                        #:else
+                                            call s_compute_re_visc(q_prim_vf, &
+                                                alpha_L, l, k, j, Re_visc_per_phase_L)
+                                            call s_compute_re_visc(q_prim_vf, &
+                                                alpha_R, l, k, j + 1, Re_visc_per_phase_R)
+                                        #:endif
+                                        call s_compute_mixture_re( &
+                                            alpha_L, Re_visc_per_phase_L, Re_L)
+                                        call s_compute_mixture_re( &
+                                            alpha_R, Re_visc_per_phase_R, Re_R)
+                                    end if
+                                end if
+
+                                pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
+                                pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
+
+                                E_L = gamma_L*pres_L + pi_inf_L + 5.e-1_wp*rho_L*vel_L_rms
+                                E_R = gamma_R*pres_R + pi_inf_R + 5.e-1_wp*rho_R*vel_R_rms
+
+                                H_L = (E_L + pres_L)/rho_L
+                                H_R = (E_R + pres_R)/rho_R
+
+                                if (avg_state == 2) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, nb
+                                        R0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, rs(i))
+                                        R0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, rs(i))
+
+                                        V0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, vs(i))
+                                        V0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, vs(i))
+                                        if (.not. polytropic .and. .not. qbmm) then
+                                            P0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, ps(i))
+                                            P0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, ps(i))
+                                        end if
+                                    end do
+
+                                    if (.not. qbmm) then
+                                        if (adv_n) then
+                                            nbub_L = qL_prim_rs${XYZ}$_vf(j, k, l, n_idx)
+                                            nbub_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, n_idx)
+                                        else
+                                            nbub_L = 0._wp
+                                            nbub_R = 0._wp
+                                            $:GPU_LOOP(parallelism='[seq]')
+                                            do i = 1, nb
+                                                nbub_L = nbub_L + (R0_L(i)**3._wp)*weight(i)
+                                                nbub_R = nbub_R + (R0_R(i)**3._wp)*weight(i)
+                                            end do
+
+                                            nbub_L = (3._wp/(4._wp*pi))*qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + num_fluids)/nbub_L
+                                            nbub_R = (3._wp/(4._wp*pi))*qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + num_fluids)/nbub_R
+                                        end if
+                                    else
+                                        !nb stored in 0th moment of first R0 bin in variable conversion module
+                                        nbub_L = qL_prim_rs${XYZ}$_vf(j, k, l, bubxb)
+                                        nbub_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, bubxb)
+                                    end if
+
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, nb
+                                        if (.not. qbmm) then
+                                            pbw_L(i) = f_cpbw_KM(R0(i), R0_L(i), V0_L(i), P0_L(i))
+                                            pbw_R(i) = f_cpbw_KM(R0(i), R0_R(i), V0_R(i), P0_R(i))
+                                        end if
+                                    end do
+
+                                    if (qbmm) then
+                                        PbwR3Lbar = mom_sp_rs${XYZ}$_vf(j, k, l, 4)
+                                        PbwR3Rbar = mom_sp_rs${XYZ}$_vf(j + 1, k, l, 4)
+
+                                        R3Lbar = mom_sp_rs${XYZ}$_vf(j, k, l, 1)
+                                        R3Rbar = mom_sp_rs${XYZ}$_vf(j + 1, k, l, 1)
+
+                                        R3V2Lbar = mom_sp_rs${XYZ}$_vf(j, k, l, 3)
+                                        R3V2Rbar = mom_sp_rs${XYZ}$_vf(j + 1, k, l, 3)
+                                    else
+
+                                        PbwR3Lbar = 0._wp
+                                        PbwR3Rbar = 0._wp
+
+                                        R3Lbar = 0._wp
+                                        R3Rbar = 0._wp
+
+                                        R3V2Lbar = 0._wp
+                                        R3V2Rbar = 0._wp
+
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, nb
+                                            PbwR3Lbar = PbwR3Lbar + pbw_L(i)*(R0_L(i)**3._wp)*weight(i)
+                                            PbwR3Rbar = PbwR3Rbar + pbw_R(i)*(R0_R(i)**3._wp)*weight(i)
+
+                                            R3Lbar = R3Lbar + (R0_L(i)**3._wp)*weight(i)
+                                            R3Rbar = R3Rbar + (R0_R(i)**3._wp)*weight(i)
+
+                                            R3V2Lbar = R3V2Lbar + (R0_L(i)**3._wp)*(V0_L(i)**2._wp)*weight(i)
+                                            R3V2Rbar = R3V2Rbar + (R0_R(i)**3._wp)*(V0_R(i)**2._wp)*weight(i)
+                                        end do
+                                    end if
+
+                                    rho_avg = 5.e-1_wp*(rho_L + rho_R)
+                                    H_avg = 5.e-1_wp*(H_L + H_R)
+                                    gamma_avg = 5.e-1_wp*(gamma_L + gamma_R)
+                                    qv_avg = 5.e-1_wp*(qv_L + qv_R)
+                                    vel_avg_rms = 0._wp
+
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_dims
+                                        vel_avg_rms = vel_avg_rms + (5.e-1_wp*(vel_L(i) + vel_R(i)))**2._wp
+                                    end do
+
+                                end if
+
+                                call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, &
+                                                              vel_L_rms, 0._wp, c_L, qv_L)
+
+                                call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, &
+                                                              vel_R_rms, 0._wp, c_R, qv_R)
+
+                                !> The computation of c_avg does not require all the variables, and therefore the non '_avg'
+                                ! variables are placeholders to call the subroutine.
+                                call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, &
+                                                              vel_avg_rms, 0._wp, c_avg, qv_avg)
+
+                                if (viscous) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, 2
+                                        Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
+                                    end do
+                                end if
+
+                                ! Low Mach correction
+                                if (low_Mach == 2) then
+                                    @:compute_low_Mach_correction()
+                                end if
+
+                                if (wave_speeds == 1) then
+                                    s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R)
+                                    s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L)
+
+                                    s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* &
+                                           (s_L - vel_L(dir_idx(1))) - &
+                                           rho_R*vel_R(dir_idx(1))* &
+                                           (s_R - vel_R(dir_idx(1)))) &
+                                          /(rho_L*(s_L - vel_L(dir_idx(1))) - &
+                                            rho_R*(s_R - vel_R(dir_idx(1))))
+                                elseif (wave_speeds == 2) then
+                                    pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* &
+                                                        (vel_L(dir_idx(1)) - &
+                                                         vel_R(dir_idx(1))))
+
+                                    pres_SR = pres_SL
+
+                                    Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* &
+                                                           (pres_SL/pres_L - 1._wp)*pres_L/ &
+                                                           ((pres_L + pi_inf_L/(1._wp + gamma_L)))))
+                                    Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* &
+                                                           (pres_SR/pres_R - 1._wp)*pres_R/ &
+                                                           ((pres_R + pi_inf_R/(1._wp + gamma_R)))))
+
+                                    s_L = vel_L(dir_idx(1)) - c_L*Ms_L
+                                    s_R = vel_R(dir_idx(1)) + c_R*Ms_R
+
+                                    s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + &
+                                                    (pres_L - pres_R)/ &
+                                                    (rho_avg*c_avg))
+                                end if
+
+                                ! follows Einfeldt et al.
+                                ! s_M/P = min/max(0.,s_L/R)
+                                s_M = min(0._wp, s_L); s_P = max(0._wp, s_R)
+
+                                ! goes with q_star_L/R = xi_L/R * (variable)
+                                ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) )
+                                xi_L = (s_L - vel_L(dir_idx(1)))/(s_L - s_S)
+                                xi_R = (s_R - vel_R(dir_idx(1)))/(s_R - s_S)
+
+                                ! goes with numerical velocity in x/y/z directions
+                                ! xi_P/M = 0.5 +/m sgn(0.5,s_star)
+                                xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S))
+                                xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S))
+
+                                ! Low Mach correction
+                                if (low_Mach == 1) then
+                                    @:compute_low_Mach_correction()
+                                else
+                                    pcorr = 0._wp
+                                end if
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, contxe
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
+                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
+                                        + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
+                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                end do
+
+                                if (bubbles_euler .and. (num_fluids > 1)) then
+                                    ! Kill mass transport @ gas density
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe) = 0._wp
+                                end if
+
+                                ! Momentum flux.
+                                ! f = \rho u u + p I, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
+
+                                ! Include p_tilde
+
+                                if (avg_state == 2) then
+                                    if (alpha_L(num_fluids) < small_alf .or. R3Lbar < small_alf) then
+                                        pres_L = pres_L - alpha_L(num_fluids)*pres_L
+                                    else
+                                        pres_L = pres_L - alpha_L(num_fluids)*(pres_L - PbwR3Lbar/R3Lbar - &
+                                                                               rho_L*R3V2Lbar/R3Lbar)
+                                    end if
+
+                                    if (alpha_R(num_fluids) < small_alf .or. R3Rbar < small_alf) then
+                                        pres_R = pres_R - alpha_R(num_fluids)*pres_R
+                                    else
+                                        pres_R = pres_R - alpha_R(num_fluids)*(pres_R - PbwR3Rbar/R3Rbar - &
+                                                                               rho_R*R3V2Rbar/R3Rbar)
+                                    end if
+                                end if
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
+                                        xi_M*(rho_L*(vel_L(dir_idx(1))* &
+                                                     vel_L(dir_idx(i)) + &
+                                                     s_M*(xi_L*(dir_flg(dir_idx(i))*s_S + &
+                                                                (1._wp - dir_flg(dir_idx(i)))* &
+                                                                vel_L(dir_idx(i))) - vel_L(dir_idx(i)))) + &
+                                              dir_flg(dir_idx(i))*(pres_L)) &
+                                        + xi_P*(rho_R*(vel_R(dir_idx(1))* &
+                                                       vel_R(dir_idx(i)) + &
+                                                       s_P*(xi_R*(dir_flg(dir_idx(i))*s_S + &
+                                                                  (1._wp - dir_flg(dir_idx(i)))* &
+                                                                  vel_R(dir_idx(i))) - vel_R(dir_idx(i)))) + &
+                                                dir_flg(dir_idx(i))*(pres_R)) &
+                                        + (s_M/s_L)*(s_P/s_R)*dir_flg(dir_idx(i))*pcorr
+                                end do
+
+                                ! Energy flux.
+                                ! f = u*(E+p), q = E, q_star = \xi*E+(s-u)(\rho s_star + p/(s-u))
+                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                    xi_M*(vel_L(dir_idx(1))*(E_L + pres_L) + &
+                                          s_M*(xi_L*(E_L + (s_S - vel_L(dir_idx(1)))* &
+                                                     (rho_L*s_S + (pres_L)/ &
+                                                      (s_L - vel_L(dir_idx(1))))) - E_L)) &
+                                    + xi_P*(vel_R(dir_idx(1))*(E_R + pres_R) + &
+                                            s_P*(xi_R*(E_R + (s_S - vel_R(dir_idx(1)))* &
+                                                       (rho_R*s_S + (pres_R)/ &
+                                                        (s_R - vel_R(dir_idx(1))))) - E_R)) &
+                                    + (s_M/s_L)*(s_P/s_R)*pcorr*s_S
+
+                                ! Volume fraction flux
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = advxb, advxe
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
+                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
+                                        + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
+                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                end do
+
+                                ! Source for volume fraction advection equation
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = &
+                                        xi_M*(vel_L(dir_idx(i)) + &
+                                              dir_flg(dir_idx(i))* &
+                                              s_M*(xi_L - 1._wp)) &
+                                        + xi_P*(vel_R(dir_idx(i)) + &
+                                                dir_flg(dir_idx(i))* &
+                                                s_P*(xi_R - 1._wp))
+
+                                    !IF ( (model_eqns == 4) .or. (num_fluids==1) ) vel_src_rs_vf(dir_idx(i))%sf(j,k,l) = 0._wp
+                                end do
+
+                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1))
+
+                                ! Add advection flux for bubble variables
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = bubxb, bubxe
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
+                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
+                                        + xi_P*nbub_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
+                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                end do
+
+                                if (qbmm) then
+                                    flux_rs${XYZ}$_vf(j, k, l, bubxb) = &
+                                        xi_M*nbub_L &
+                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
+                                        + xi_P*nbub_R &
+                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                end if
+
+                                if (adv_n) then
+                                    flux_rs${XYZ}$_vf(j, k, l, n_idx) = &
+                                        xi_M*nbub_L &
+                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
+                                        + xi_P*nbub_R &
+                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                end if
+
+                                ! Geometrical source flux for cylindrical coordinates
+                                #:if (NORM_DIR == 2)
+                                    if (cyl_coord) then
+                                        ! Substituting the advective flux into the inviscid geometrical source flux
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, E_idx
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
+                                        end do
+                                        ! Recalculating the radial momentum geometric source flux
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = &
+                                            xi_M*(rho_L*(vel_L(dir_idx(1))* &
+                                                         vel_L(dir_idx(1)) + &
+                                                         s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + &
+                                                                    (1._wp - dir_flg(dir_idx(1)))* &
+                                                                    vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) &
+                                            + xi_P*(rho_R*(vel_R(dir_idx(1))* &
+                                                           vel_R(dir_idx(1)) + &
+                                                           s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + &
+                                                                      (1._wp - dir_flg(dir_idx(1)))* &
+                                                                      vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
+                                        ! Geometrical source of the void fraction(s) is zero
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = advxb, advxe
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
+                                        end do
+                                    end if
+                                #:endif
+                                #:if (NORM_DIR == 3)
+                                    if (grid_geometry == 3) then
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, sys_size
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
+                                        end do
+
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb + 1) = &
+                                            -xi_M*(rho_L*(vel_L(dir_idx(1))* &
+                                                          vel_L(dir_idx(1)) + &
+                                                          s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + &
+                                                                     (1._wp - dir_flg(dir_idx(1)))* &
+                                                                     vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) &
+                                            - xi_P*(rho_R*(vel_R(dir_idx(1))* &
+                                                           vel_R(dir_idx(1)) + &
+                                                           s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + &
+                                                                      (1._wp - dir_flg(dir_idx(1)))* &
+                                                                      vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1)
+
+                                    end if
+                                #:endif
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                else
+                    ! 5-EQUATION MODEL WITH HLLC
+                    $:GPU_PARALLEL_LOOP(collapse=3, private='[Re_max, i, q, T_L, T_R, vel_L_rms, vel_R_rms, pres_L, pres_R, rho_L, gamma_L, pi_inf_L, qv_L, rho_R, gamma_R, pi_inf_R, qv_R, alpha_L_sum, alpha_R_sum, E_L, E_R, MW_L, MW_R, R_gas_L, R_gas_R, Cp_L, Cp_R, Cv_L, Cv_R, Gamm_L, Gamm_R, Y_L, Y_R, H_L, H_R, qv_avg, rho_avg, gamma_avg, H_avg, c_L, c_R, c_avg, s_P, s_M, xi_P, xi_M, xi_L, xi_R, Ms_L, Ms_R, pres_SL, pres_SR, vel_L, vel_R, Re_L, Re_R, alpha_L, alpha_R, s_L, s_R, s_S, vel_avg_rms, pcorr, zcoef, vel_L_tmp, vel_R_tmp, Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, tau_e_L, tau_e_R, xi_field_L, xi_field_R, Yi_avg,Phi_avg, h_iL, h_iR, h_avg_2, G_L, G_R, Re_visc_per_phase_L, Re_visc_per_phase_R]', copyin='[is1, is2, is3]')
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            do j = is1%beg, is1%end
+
+                                vel_L_rms = 0._wp; vel_R_rms = 0._wp
+                                rho_L = 0._wp; rho_R = 0._wp
+                                gamma_L = 0._wp; gamma_R = 0._wp
+                                pi_inf_L = 0._wp; pi_inf_R = 0._wp
+                                qv_L = 0._wp; qv_R = 0._wp
+                                alpha_L_sum = 0._wp; alpha_R_sum = 0._wp
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
+                                    alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
+                                end do
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i)
+                                    vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i)
+                                    vel_L_rms = vel_L_rms + vel_L(i)**2._wp
+                                    vel_R_rms = vel_R_rms + vel_R(i)**2._wp
+                                end do
+
+                                pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
+                                pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
+
+                                ! Change this by splitting it into the cases
+                                ! present in the bubbles_euler
+                                if (mpp_lim) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i))
+                                        qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp)
+                                        qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i))
+                                        qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp)
+                                        alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
+                                        alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
+                                    end do
+
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps)
+                                        qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps)
+                                    end do
+                                end if
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                    gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i)
+                                    pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i)
+                                    qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i)
+
+                                    rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+                                    gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i)
+                                    pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i)
+                                    qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i)
+                                end do
+
+                                Re_max = 0
+                                if (Re_size(1) > 0) Re_max = 1
+                                if (Re_size(2) > 0) Re_max = 2
+
+                                if (viscous) then
+                                    ! Map rotated (j,k,l) to physical (x,y,z) indices
+                                    #:if NORM_DIR == 1
+                                        call s_compute_re_visc(q_prim_vf, &
+                                            alpha_L, j, k, l, Re_visc_per_phase_L)
+                                        call s_compute_re_visc(q_prim_vf, &
+                                            alpha_R, j + 1, k, l, Re_visc_per_phase_R)
+                                    #:elif NORM_DIR == 2
+                                        call s_compute_re_visc(q_prim_vf, &
+                                            alpha_L, k, j, l, Re_visc_per_phase_L)
+                                        call s_compute_re_visc(q_prim_vf, &
+                                            alpha_R, k, j + 1, l, Re_visc_per_phase_R)
+                                    #:else
+                                        call s_compute_re_visc(q_prim_vf, &
+                                            alpha_L, l, k, j, Re_visc_per_phase_L)
+                                        call s_compute_re_visc(q_prim_vf, &
+                                            alpha_R, l, k, j + 1, Re_visc_per_phase_R)
+                                    #:endif
+                                    call s_compute_mixture_re( &
+                                        alpha_L, Re_visc_per_phase_L, Re_L)
+                                    call s_compute_mixture_re( &
+                                        alpha_R, Re_visc_per_phase_R, Re_R)
+                                end if
+
+                                if (chemistry) then
+                                    c_sum_Yi_Phi = 0.0_wp
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = chemxb, chemxe
+                                        Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                        Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+                                    end do
+
+                                    call get_mixture_molecular_weight(Ys_L, MW_L)
+                                    call get_mixture_molecular_weight(Ys_R, MW_R)
+
+                                    #:if USING_AMD
+                                        Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
+                                        Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
+                                    #:else
+                                        Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
+                                        Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:)
+                                    #:endif
+
+                                    R_gas_L = gas_constant/MW_L
+                                    R_gas_R = gas_constant/MW_R
+
+                                    T_L = pres_L/rho_L/R_gas_L
+                                    T_R = pres_R/rho_R/R_gas_R
+
+                                    call get_species_specific_heats_r(T_L, Cp_iL)
+                                    call get_species_specific_heats_r(T_R, Cp_iR)
+
+                                    if (chem_params%gamma_method == 1) then
+                                        !> gamma_method = 1: Ref. Section 2.3.1 Formulation of doi:10.7907/ZKW8-ES97.
+                                        Gamma_iL = Cp_iL/(Cp_iL - 1.0_wp)
+                                        Gamma_iR = Cp_iR/(Cp_iR - 1.0_wp)
+
+                                        gamma_L = sum(Xs_L(:)/(Gamma_iL(:) - 1.0_wp))
+                                        gamma_R = sum(Xs_R(:)/(Gamma_iR(:) - 1.0_wp))
+                                    else if (chem_params%gamma_method == 2) then
+                                        !> gamma_method = 2: c_p / c_v where c_p, c_v are specific heats.
+                                        call get_mixture_specific_heat_cp_mass(T_L, Ys_L, Cp_L)
+                                        call get_mixture_specific_heat_cp_mass(T_R, Ys_R, Cp_R)
+                                        call get_mixture_specific_heat_cv_mass(T_L, Ys_L, Cv_L)
+                                        call get_mixture_specific_heat_cv_mass(T_R, Ys_R, Cv_R)
+
+                                        Gamm_L = Cp_L/Cv_L; Gamm_R = Cp_R/Cv_R
+                                        gamma_L = 1.0_wp/(Gamm_L - 1.0_wp); gamma_R = 1.0_wp/(Gamm_R - 1.0_wp)
+                                    end if
+
+                                    call get_mixture_energy_mass(T_L, Ys_L, E_L)
+                                    call get_mixture_energy_mass(T_R, Ys_R, E_R)
+
+                                    E_L = rho_L*E_L + 5.e-1*rho_L*vel_L_rms
+                                    E_R = rho_R*E_R + 5.e-1*rho_R*vel_R_rms
+                                    H_L = (E_L + pres_L)/rho_L
+                                    H_R = (E_R + pres_R)/rho_R
+                                else
+                                    E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L
+                                    E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R
+
+                                    H_L = (E_L + pres_L)/rho_L
+                                    H_R = (E_R + pres_R)/rho_R
+                                end if
+
+                                ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY
+                                if (hypoelasticity) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, strxe - strxb + 1
+                                        tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
+                                        tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
+                                    end do
+                                    G_L = 0._wp
+                                    G_R = 0._wp
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        G_L = G_L + alpha_L(i)*Gs_rs(i)
+                                        G_R = G_R + alpha_R(i)*Gs_rs(i)
+                                    end do
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, strxe - strxb + 1
+                                        ! Elastic contribution to energy if G large enough
+                                        if ((G_L > verysmall) .and. (G_R > verysmall)) then
+                                            E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
+                                            E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
+                                            ! Additional terms in 2D and 3D
+                                            if ((i == 2) .or. (i == 4) .or. (i == 5)) then
+                                                E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L)
+                                                E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R)
+                                            end if
+                                        end if
+                                    end do
+                                end if
+
+                                ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY
+                                if (hyperelasticity) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_dims
+                                        xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i)
+                                        xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i)
+                                    end do
+                                    G_L = 0._wp
+                                    G_R = 0._wp
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        ! Mixture left and right shear modulus
+                                        G_L = G_L + alpha_L(i)*Gs_rs(i)
+                                        G_R = G_R + alpha_R(i)*Gs_rs(i)
+                                    end do
+                                    ! Elastic contribution to energy if G large enough
+                                    if (G_L > verysmall .and. G_R > verysmall) then
+                                        E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1)
+                                        E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1)
+                                    end if
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, b_size - 1
+                                        tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i)
+                                        tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i)
+                                    end do
+                                end if
+
+                                H_L = (E_L + pres_L)/rho_L
+                                H_R = (E_R + pres_R)/rho_R
+
+                                @:compute_average_state()
+
+                                call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, &
+                                                              vel_L_rms, 0._wp, c_L, qv_L)
+
+                                call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, &
+                                                              vel_R_rms, 0._wp, c_R, qv_R)
+
+                                !> The computation of c_avg does not require all the variables, and therefore the non '_avg'
+                                !  variables are placeholders to call the subroutine.
+                                call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, &
+                                                              vel_avg_rms, c_sum_Yi_Phi, c_avg, qv_avg)
+
+                                if (viscous) then
+                                    if (chemistry) then
+                                        call compute_viscosity_and_inversion(T_L, Ys_L, T_R, Ys_R, Re_L(1), Re_R(1))
+                                    end if
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, 2
+                                        Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i))
+                                    end do
+                                end if
+
+                                ! Low Mach correction
+                                if (low_Mach == 2) then
+                                    @:compute_low_Mach_correction()
+                                end if
+
+                                if (wave_speeds == 1) then
+                                    if (elasticity) then
+                                        s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + &
+                                                                           (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L), vel_R(dir_idx(1)) - sqrt(c_R*c_R + &
+                                                                                                                                                            (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R))
+                                        s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + &
+                                                                           (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R), vel_L(dir_idx(1)) + sqrt(c_L*c_L + &
+                                                                                                                                                            (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L))
+                                        s_S = (pres_R - tau_e_R(dir_idx_tau(1)) - pres_L + &
+                                               tau_e_L(dir_idx_tau(1)) + rho_L*vel_L(dir_idx(1))*(s_L - vel_L(dir_idx(1))) - &
+                                               rho_R*vel_R(dir_idx(1))*(s_R - vel_R(dir_idx(1))))/(rho_L*(s_L - vel_L(dir_idx(1))) - &
+                                                                                                   rho_R*(s_R - vel_R(dir_idx(1))))
+                                    else
+                                        s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R)
+                                        s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L)
+                                        s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* &
+                                               (s_L - vel_L(dir_idx(1))) - rho_R*vel_R(dir_idx(1))*(s_R - vel_R(dir_idx(1)))) &
+                                              /(rho_L*(s_L - vel_L(dir_idx(1))) - rho_R*(s_R - vel_R(dir_idx(1))))
+
+                                    end if
+                                elseif (wave_speeds == 2) then
+                                    pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* &
+                                                        (vel_L(dir_idx(1)) - &
+                                                         vel_R(dir_idx(1))))
+
+                                    pres_SR = pres_SL
+
+                                    Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* &
+                                                           (pres_SL/pres_L - 1._wp)*pres_L/ &
+                                                           ((pres_L + pi_inf_L/(1._wp + gamma_L)))))
+                                    Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* &
+                                                           (pres_SR/pres_R - 1._wp)*pres_R/ &
+                                                           ((pres_R + pi_inf_R/(1._wp + gamma_R)))))
+
+                                    s_L = vel_L(dir_idx(1)) - c_L*Ms_L
+                                    s_R = vel_R(dir_idx(1)) + c_R*Ms_R
+
+                                    s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + &
+                                                    (pres_L - pres_R)/ &
+                                                    (rho_avg*c_avg))
+                                end if
+
+                                ! follows Einfeldt et al.
+                                ! s_M/P = min/max(0.,s_L/R)
+                                s_M = min(0._wp, s_L); s_P = max(0._wp, s_R)
+
+                                ! goes with q_star_L/R = xi_L/R * (variable)
+                                ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) )
+                                xi_L = (s_L - vel_L(dir_idx(1)))/(s_L - s_S)
+                                xi_R = (s_R - vel_R(dir_idx(1)))/(s_R - s_S)
+
+                                ! goes with numerical velocity in x/y/z directions
+                                ! xi_P/M = 0.5 +/m sgn(0.5,s_star)
+                                xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S))
+                                xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S))
+
+                                ! Low Mach correction
+                                if (low_Mach == 1) then
+                                    @:compute_low_Mach_correction()
+                                else
+                                    pcorr = 0._wp
+                                end if
+
+                                ! COMPUTING THE HLLC FLUXES
+                                ! MASS FLUX.
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, contxe
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
+                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
+                                        + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
+                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                end do
+
+                                ! MOMENTUM FLUX.
+                                ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w)
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
+                                        xi_M*(rho_L*(vel_L(dir_idx(1))* &
+                                                     vel_L(dir_idx(i)) + &
+                                                     s_M*(xi_L*(dir_flg(dir_idx(i))*s_S + &
+                                                                (1._wp - dir_flg(dir_idx(i)))* &
+                                                                vel_L(dir_idx(i))) - vel_L(dir_idx(i)))) + &
+                                              dir_flg(dir_idx(i))*(pres_L)) &
+                                        + xi_P*(rho_R*(vel_R(dir_idx(1))* &
+                                                       vel_R(dir_idx(i)) + &
+                                                       s_P*(xi_R*(dir_flg(dir_idx(i))*s_S + &
+                                                                  (1._wp - dir_flg(dir_idx(i)))* &
+                                                                  vel_R(dir_idx(i))) - vel_R(dir_idx(i)))) + &
+                                                dir_flg(dir_idx(i))*(pres_R)) &
+                                        + (s_M/s_L)*(s_P/s_R)*dir_flg(dir_idx(i))*pcorr
+                                end do
+
+                                ! ENERGY FLUX.
+                                ! f = u*(E-\sigma), q = E, q_star = \xi*E+(s-u)(\rho s_star - \sigma/(s-u))
+                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                    xi_M*(vel_L(dir_idx(1))*(E_L + pres_L) + &
+                                          s_M*(xi_L*(E_L + (s_S - vel_L(dir_idx(1)))* &
+                                                     (rho_L*s_S + pres_L/ &
+                                                      (s_L - vel_L(dir_idx(1))))) - E_L)) &
+                                    + xi_P*(vel_R(dir_idx(1))*(E_R + pres_R) + &
+                                            s_P*(xi_R*(E_R + (s_S - vel_R(dir_idx(1)))* &
+                                                       (rho_R*s_S + pres_R/ &
+                                                        (s_R - vel_R(dir_idx(1))))) - E_R)) &
+                                    + (s_M/s_L)*(s_P/s_R)*pcorr*s_S
+
+                                ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux
+                                if (elasticity) then
+                                    flux_ene_e = 0._wp
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_dims
+                                        ! MOMENTUM ELASTIC FLUX.
+                                        flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = &
+                                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) &
+                                            - xi_M*tau_e_L(dir_idx_tau(i)) - xi_P*tau_e_R(dir_idx_tau(i))
+                                        ! ENERGY ELASTIC FLUX.
+                                        flux_ene_e = flux_ene_e - &
+                                                     xi_M*(vel_L(dir_idx(i))*tau_e_L(dir_idx_tau(i)) + &
+                                                           s_M*(xi_L*((s_S - vel_L(i))*(tau_e_L(dir_idx_tau(i))/(s_L - vel_L(i)))))) - &
+                                                     xi_P*(vel_R(dir_idx(i))*tau_e_R(dir_idx_tau(i)) + &
+                                                           s_P*(xi_R*((s_S - vel_R(i))*(tau_e_R(dir_idx_tau(i))/(s_R - vel_R(i))))))
+                                    end do
+                                    flux_rs${XYZ}$_vf(j, k, l, E_idx) = flux_rs${XYZ}$_vf(j, k, l, E_idx) + flux_ene_e
+                                end if
+
+                                ! HYPOELASTIC STRESS EVOLUTION FLUX.
+                                if (hypoelasticity) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, strxe - strxb + 1
+                                        flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = &
+                                            xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(dir_idx(1))*tau_e_L(i)) + &
+                                            xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*tau_e_R(i) - rho_R*vel_R(dir_idx(1))*tau_e_R(i))
+                                    end do
+                                end if
+
+                                ! VOLUME FRACTION FLUX.
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = advxb, advxe
+                                    flux_rs${XYZ}$_vf(j, k, l, i) = &
+                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) &
+                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
+                                        + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) &
+                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                end do
+
+                                ! VOLUME FRACTION SOURCE FLUX.
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = &
+                                        xi_M*(vel_L(dir_idx(i)) + &
+                                              dir_flg(dir_idx(i))* &
+                                              s_M*(xi_L - 1._wp)) &
+                                        + xi_P*(vel_R(dir_idx(i)) + &
+                                                dir_flg(dir_idx(i))* &
+                                                s_P*(xi_R - 1._wp))
+                                end do
+
+                                ! COLOR FUNCTION FLUX
+                                if (surface_tension) then
+                                    flux_rs${XYZ}$_vf(j, k, l, c_idx) = &
+                                        xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, c_idx) &
+                                        *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
+                                        + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, c_idx) &
+                                        *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                end if
+
+                                ! REFERENCE MAP FLUX.
+                                if (hyperelasticity) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_dims
+                                        flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = &
+                                            xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) &
+                                                                    - rho_L*vel_L(dir_idx(1))*xi_field_L(i)) + &
+                                            xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*xi_field_R(i) &
+                                                                    - rho_R*vel_R(dir_idx(1))*xi_field_R(i))
+                                    end do
+                                end if
+
+                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1))
+
+                                if (chemistry) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = chemxb, chemxe
+                                        Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                        Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+
+                                        flux_rs${XYZ}$_vf(j, k, l, i) = xi_M*rho_L*Y_L*(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) &
+                                                                        + xi_P*rho_R*Y_R*(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp))
+                                        flux_src_rs${XYZ}$_vf(j, k, l, i) = 0.0_wp
+                                    end do
+                                end if
+
+                                ! Geometrical source flux for cylindrical coordinates
+                                #:if (NORM_DIR == 2)
+                                    if (cyl_coord) then
+                                        !Substituting the advective flux into the inviscid geometrical source flux
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, E_idx
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i)
+                                        end do
+                                        ! Recalculating the radial momentum geometric source flux
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = &
+                                            xi_M*(rho_L*(vel_L(dir_idx(1))* &
+                                                         vel_L(dir_idx(1)) + &
+                                                         s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + &
+                                                                    (1._wp - dir_flg(dir_idx(1)))* &
+                                                                    vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) &
+                                            + xi_P*(rho_R*(vel_R(dir_idx(1))* &
+                                                           vel_R(dir_idx(1)) + &
+                                                           s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + &
+                                                                      (1._wp - dir_flg(dir_idx(1)))* &
+                                                                      vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
+                                        ! Geometrical source of the void fraction(s) is zero
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = advxb, advxe
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
+                                        end do
+                                    end if
+                                #:endif
+                                #:if (NORM_DIR == 3)
+                                    if (grid_geometry == 3) then
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, sys_size
+                                            flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp
+                                        end do
+
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb + 1) = &
+                                            -xi_M*(rho_L*(vel_L(dir_idx(1))* &
+                                                          vel_L(dir_idx(1)) + &
+                                                          s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + &
+                                                                     (1._wp - dir_flg(dir_idx(1)))* &
+                                                                     vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) &
+                                            - xi_P*(rho_R*(vel_R(dir_idx(1))* &
+                                                           vel_R(dir_idx(1)) + &
+                                                           s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + &
+                                                                      (1._wp - dir_flg(dir_idx(1)))* &
+                                                                      vel_R(dir_idx(1))) - vel_R(dir_idx(1)))))
+                                        flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1)
+
+                                    end if
+                                #:endif
+
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                end if
+            end if
+        #:endfor
+        ! Computing HLLC flux and source flux for Euler system of equations
+
+        if (viscous .or. dummy) then
+            if (weno_Re_flux) then
+                call s_compute_viscous_source_flux( &
+                    qL_prim_vf(momxb:momxe), &
+                    dqL_prim_dx_vf(momxb:momxe), &
+                    dqL_prim_dy_vf(momxb:momxe), &
+                    dqL_prim_dz_vf(momxb:momxe), &
+                    qR_prim_vf(momxb:momxe), &
+                    dqR_prim_dx_vf(momxb:momxe), &
+                    dqR_prim_dy_vf(momxb:momxe), &
+                    dqR_prim_dz_vf(momxb:momxe), &
+                    flux_src_vf, norm_dir, ix, iy, iz)
+            else
+                call s_compute_viscous_source_flux( &
+                    q_prim_vf(momxb:momxe), &
+                    dqL_prim_dx_vf(momxb:momxe), &
+                    dqL_prim_dy_vf(momxb:momxe), &
+                    dqL_prim_dz_vf(momxb:momxe), &
+                    q_prim_vf(momxb:momxe), &
+                    dqR_prim_dx_vf(momxb:momxe), &
+                    dqR_prim_dy_vf(momxb:momxe), &
+                    dqR_prim_dz_vf(momxb:momxe), &
+                    flux_src_vf, norm_dir, ix, iy, iz)
+            end if
+        end if
+
+        if (surface_tension) then
+            call s_compute_capillary_source_flux( &
+                vel_src_rsx_vf, &
+                vel_src_rsy_vf, &
+                vel_src_rsz_vf, &
+                flux_src_vf, &
+                norm_dir, isx, isy, isz)
+        end if
+
+        call s_finalize_riemann_solver(flux_vf, flux_src_vf, &
+                                       flux_gsrc_vf, &
+                                       norm_dir)
+
+    end subroutine s_hllc_riemann_solver
+
+    !> HLLD Riemann solver resolves 5 of the 7 waves of MHD equations:
+        !!      1 entropy wave, 2 Alfvén waves, 2 fast magnetosonic waves.
+    subroutine s_hlld_riemann_solver(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, &
+                                     dqL_prim_dx_vf, dqL_prim_dy_vf, dqL_prim_dz_vf, &
+                                     qL_prim_vf, &
+                                     qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, &
+                                     dqR_prim_dx_vf, dqR_prim_dy_vf, dqR_prim_dz_vf, &
+                                     qR_prim_vf, &
+                                     q_prim_vf, &
+                                     flux_vf, flux_src_vf, flux_gsrc_vf, &
+                                     norm_dir, ix, iy, iz)
+
+        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, &
+                                                                                                     qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
+
+        type(scalar_field), allocatable, dimension(:), intent(inout) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
+                                                                        dqL_prim_dy_vf, dqR_prim_dy_vf, &
+                                                                        dqL_prim_dz_vf, dqR_prim_dz_vf
+
+        type(scalar_field), allocatable, dimension(:), intent(inout) :: qL_prim_vf, qR_prim_vf
+
+        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
+        type(scalar_field), dimension(sys_size), intent(inout) :: flux_vf, flux_src_vf, flux_gsrc_vf
+
+        integer, intent(in) :: norm_dir
+        type(int_bounds_info), intent(in) :: ix, iy, iz
+
+        ! Local variables:
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_L, alpha_R, alpha_rho_L, alpha_rho_R
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_L, alpha_R, alpha_rho_L, alpha_rho_R
+        #:endif
+        type(riemann_states_vec3) :: vel
+        type(riemann_states) :: rho, pres, E, H_no_mag
+        type(riemann_states) :: gamma, pi_inf, qv
+        type(riemann_states) :: vel_rms
+
+        type(riemann_states_vec3) :: B
+        type(riemann_states) :: c, c_fast, pres_mag
+
+        ! HLLD speeds and intermediate state variables:
+        real(wp) :: s_L, s_R, s_M, s_starL, s_starR
+        real(wp) :: pTot_L, pTot_R, p_star, rhoL_star, rhoR_star, E_starL, E_starR
+
+        real(wp), dimension(7) :: U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR
+        real(wp), dimension(7) :: F_L, F_R, F_starL, F_starR, F_hlld
+
+        ! Indices for U and F: (rho, rho*vel(1), rho*vel(2), rho*vel(3), By, Bz, E)
+        !   Note: vel and B are permutated, so vel(1) is the normal velocity, and x is the normal direction
+        !   Note: Bx is omitted as the magnetic flux is always zero in the normal direction
+
+        real(wp) :: sqrt_rhoL_star, sqrt_rhoR_star, denom_ds, sign_Bx
+        real(wp) :: vL_star, vR_star, wL_star, wR_star
+        real(wp) :: v_double, w_double, By_double, Bz_double, E_doubleL, E_doubleR, E_double
+
+        integer :: i, j, k, l
+
+        call s_populate_riemann_states_variables_buffers( &
+            qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
+            dqL_prim_dy_vf, dqL_prim_dz_vf, &
+            qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
+            dqR_prim_dy_vf, dqR_prim_dz_vf, &
+            norm_dir, ix, iy, iz)
+
+        call s_initialize_riemann_solver( &
+            flux_src_vf, norm_dir)
+
+        #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
+            if (norm_dir == ${NORM_DIR}$) then
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, vel, alpha_L, alpha_R, rho, pres,E, H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR, F_L, F_R, F_starL, F_starR, F_hlld, s_L, s_R, s_M, s_starL, s_starR, pTot_L, pTot_R, p_star, rhoL_star, rhoR_star, E_starL, E_starR, sqrt_rhoL_star, sqrt_rhoR_star, denom_ds, sign_Bx, vL_star, vR_star, wL_star, wR_star, v_double, w_double, By_double, Bz_double, E_doubleL, E_doubleR, E_double]', copyin='[norm_dir]')
+                do l = is3%beg, is3%end
+                    do k = is2%beg, is2%end
+                        do j = is1%beg, is1%end
+
+                            ! (1) Extract the left/right primitive states
+                            do i = 1, contxe
+                                alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+                            end do
+
+                            ! NOTE: unlike HLL & HLLC, vel_L here is permutated by dir_idx for simpler logic
+                            do i = 1, num_vels
+                                vel%L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i))
+                                vel%R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + dir_idx(i))
+                            end do
+
+                            vel_rms%L = sum(vel%L**2._wp)
+                            vel_rms%R = sum(vel%R**2._wp)
+
+                            do i = 1, num_fluids
+                                alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
+                                alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
+                            end do
+
+                            pres%L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
+                            pres%R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
+
+                            ! NOTE: unlike HLL, Bx, By, Bz are permutated by dir_idx for simpler logic
+                            if (mhd) then
+                                if (n == 0) then ! 1D: constant Bx; By, Bz as variables; only in x so not permutated
+                                    B%L = [Bx0, qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg), qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)]
+                                    B%R = [Bx0, qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg), qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)]
+                                else ! 2D/3D: Bx, By, Bz as variables
+                                    B%L = [qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(1) - 1), &
+                                           qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(2) - 1), &
+                                           qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(3) - 1)]
+                                    B%R = [qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(1) - 1), &
+                                           qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(2) - 1), &
+                                           qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(3) - 1)]
+                                end if
+                            end if
+
+                            ! Sum properties of all fluid components
+                            rho%L = 0._wp; gamma%L = 0._wp; pi_inf%L = 0._wp; qv%L = 0._wp
+                            rho%R = 0._wp; gamma%R = 0._wp; pi_inf%R = 0._wp; qv%R = 0._wp
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                rho%L = rho%L + alpha_rho_L(i)
+                                gamma%L = gamma%L + alpha_L(i)*gammas(i)
+                                pi_inf%L = pi_inf%L + alpha_L(i)*pi_infs(i)
+                                qv%L = qv%L + alpha_rho_L(i)*qvs(i)
+
+                                rho%R = rho%R + alpha_rho_R(i)
+                                gamma%R = gamma%R + alpha_R(i)*gammas(i)
+                                pi_inf%R = pi_inf%R + alpha_R(i)*pi_infs(i)
+                                qv%R = qv%R + alpha_rho_R(i)*qvs(i)
+                            end do
+
+                            pres_mag%L = 0.5_wp*sum(B%L**2._wp)
+                            pres_mag%R = 0.5_wp*sum(B%R**2._wp)
+                            E%L = gamma%L*pres%L + pi_inf%L + 0.5_wp*rho%L*vel_rms%L + qv%L + pres_mag%L
+                            E%R = gamma%R*pres%R + pi_inf%R + 0.5_wp*rho%R*vel_rms%R + qv%R + pres_mag%R ! includes magnetic energy
+                            H_no_mag%L = (E%L + pres%L - pres_mag%L)/rho%L
+                            H_no_mag%R = (E%R + pres%R - pres_mag%R)/rho%R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
+
+                            ! (2) Compute fast wave speeds
+                            call s_compute_speed_of_sound(pres%L, rho%L, gamma%L, pi_inf%L, H_no_mag%L, alpha_L, vel_rms%L, 0._wp, c%L, qv%L)
+                            call s_compute_speed_of_sound(pres%R, rho%R, gamma%R, pi_inf%R, H_no_mag%R, alpha_R, vel_rms%R, 0._wp, c%R, qv%R)
+                            call s_compute_fast_magnetosonic_speed(rho%L, c%L, B%L, norm_dir, c_fast%L, H_no_mag%L)
+                            call s_compute_fast_magnetosonic_speed(rho%R, c%R, B%R, norm_dir, c_fast%R, H_no_mag%R)
+
+                            ! (3) Compute contact speed s_M [Miyoshi Equ. (38)]
+                            s_L = min(vel%L(1) - c_fast%L, vel%R(1) - c_fast%R)
+                            s_R = max(vel%R(1) + c_fast%R, vel%L(1) + c_fast%L)
+
+                            pTot_L = pres%L + pres_mag%L
+                            pTot_R = pres%R + pres_mag%R
+
+                            s_M = (((s_R - vel%R(1))*rho%R*vel%R(1) - &
+                                    (s_L - vel%L(1))*rho%L*vel%L(1) - pTot_R + pTot_L)/ &
+                                   ((s_R - vel%R(1))*rho%R - (s_L - vel%L(1))*rho%L))
+
+                            ! (4) Compute star state variables
+                            rhoL_star = rho%L*(s_L - vel%L(1))/(s_L - s_M)
+                            rhoR_star = rho%R*(s_R - vel%R(1))/(s_R - s_M)
+                            p_star = pTot_L + rho%L*(s_L - vel%L(1))*(s_M - vel%L(1))/(s_L - s_M)
+                            E_starL = ((s_L - vel%L(1))*E%L - pTot_L*vel%L(1) + p_star*s_M)/(s_L - s_M)
+                            E_starR = ((s_R - vel%R(1))*E%R - pTot_R*vel%R(1) + p_star*s_M)/(s_R - s_M)
+
+                            ! (5) Compute left/right state vectors and fluxes
+                            U_L = [rho%L, rho%L*vel%L(1:3), B%L(2:3), E%L]
+                            U_starL = [rhoL_star, rhoL_star*s_M, rhoL_star*vel%L(2:3), B%L(2:3), E_starL]
+                            U_R = [rho%R, rho%R*vel%R(1:3), B%R(2:3), E%R]
+                            U_starR = [rhoR_star, rhoR_star*s_M, rhoR_star*vel%R(2:3), B%R(2:3), E_starR]
+
+                            ! Compute the left/right fluxes
+                            F_L(1) = U_L(2)
+                            F_L(2) = U_L(2)*vel%L(1) - B%L(1)*B%L(1) + pTot_L
+                            F_L(3:4) = U_L(2)*vel%L(2:3) - B%L(1)*B%L(2:3)
+                            F_L(5:6) = vel%L(1)*B%L(2:3) - vel%L(2:3)*B%L(1)
+                            F_L(7) = (E%L + pTot_L)*vel%L(1) - B%L(1)*(vel%L(1)*B%L(1) + vel%L(2)*B%L(2) + vel%L(3)*B%L(3))
+
+                            F_R(1) = U_R(2)
+                            F_R(2) = U_R(2)*vel%R(1) - B%R(1)*B%R(1) + pTot_R
+                            F_R(3:4) = U_R(2)*vel%R(2:3) - B%R(1)*B%R(2:3)
+                            F_R(5:6) = vel%R(1)*B%R(2:3) - vel%R(2:3)*B%R(1)
+                            F_R(7) = (E%R + pTot_R)*vel%R(1) - B%R(1)*(vel%R(1)*B%R(1) + vel%R(2)*B%R(2) + vel%R(3)*B%R(3))
+                            ! Compute the star flux using HLL relation
+                            F_starL = F_L + s_L*(U_starL - U_L)
+                            F_starR = F_R + s_R*(U_starR - U_R)
+                            ! Compute the rotational (Alfvén) speeds
+                            s_starL = s_M - abs(B%L(1))/sqrt(rhoL_star)
+                            s_starR = s_M + abs(B%L(1))/sqrt(rhoR_star)
+                            ! Compute the double–star states [Miyoshi Eqns. (59)-(62)]
+                            sqrt_rhoL_star = sqrt(rhoL_star); sqrt_rhoR_star = sqrt(rhoR_star)
+                            vL_star = vel%L(2); wL_star = vel%L(3)
+                            vR_star = vel%R(2); wR_star = vel%R(3)
+
+                            ! (6) Compute the double–star states [Miyoshi Eqns. (59)-(62)]
+                            denom_ds = sqrt_rhoL_star + sqrt_rhoR_star
+                            sign_Bx = sign(1._wp, B%L(1))
+                            v_double = (sqrt_rhoL_star*vL_star + sqrt_rhoR_star*vR_star + (B%R(2) - B%L(2))*sign_Bx)/denom_ds
+                            w_double = (sqrt_rhoL_star*wL_star + sqrt_rhoR_star*wR_star + (B%R(3) - B%L(3))*sign_Bx)/denom_ds
+                            By_double = (sqrt_rhoL_star*B%R(2) + sqrt_rhoR_star*B%L(2) + sqrt_rhoL_star*sqrt_rhoR_star*(vR_star - vL_star)*sign_Bx)/denom_ds
+                            Bz_double = (sqrt_rhoL_star*B%R(3) + sqrt_rhoR_star*B%L(3) + sqrt_rhoL_star*sqrt_rhoR_star*(wR_star - wL_star)*sign_Bx)/denom_ds
+
+                            E_doubleL = E_starL - sqrt_rhoL_star*((vL_star*B%L(2) + wL_star*B%L(3)) - (v_double*By_double + w_double*Bz_double))*sign_Bx
+                            E_doubleR = E_starR + sqrt_rhoR_star*((vR_star*B%R(2) + wR_star*B%R(3)) - (v_double*By_double + w_double*Bz_double))*sign_Bx
+                            E_double = 0.5_wp*(E_doubleL + E_doubleR)
+
+                            U_doubleL = [rhoL_star, rhoL_star*s_M, rhoL_star*v_double, rhoL_star*w_double, By_double, Bz_double, E_double]
+                            U_doubleR = [rhoR_star, rhoR_star*s_M, rhoR_star*v_double, rhoR_star*w_double, By_double, Bz_double, E_double]
+
+                            ! (11) Choose HLLD flux based on wave-speed regions
+                            if (0.0_wp <= s_L) then
+                                F_hlld = F_L
+                            else if (0.0_wp <= s_starL) then
+                                F_hlld = F_L + s_L*(U_starL - U_L)
+                            else if (0.0_wp <= s_M) then
+                                F_hlld = F_starL + s_starL*(U_doubleL - U_starL)
+                            else if (0.0_wp <= s_starR) then
+                                F_hlld = F_starR + s_starR*(U_doubleR - U_starR)
+                            else if (0.0_wp <= s_R) then
+                                F_hlld = F_R + s_R*(U_starR - U_R)
+                            else
+                                F_hlld = F_R
+                            end if
+
+                            ! (12) Reorder and write temporary variables to the flux array
+                            ! Mass
+                            flux_rs${XYZ}$_vf(j, k, l, 1) = F_hlld(1) ! TODO multi-component
+                            ! Momentum
+                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = F_hlld(2)
+                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(2)) = F_hlld(3)
+                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(3)) = F_hlld(4)
+                            ! Magnetic field
+                            if (n == 0) then
+                                flux_rs${XYZ}$_vf(j, k, l, B_idx%beg) = F_hlld(5)
+                                flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1) = F_hlld(6)
+                            else
+                                flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(2) - 1) = F_hlld(5)
+                                flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(3) - 1) = F_hlld(6)
+                            end if
+                            ! Energy
+                            flux_rs${XYZ}$_vf(j, k, l, E_idx) = F_hlld(7)
+                            ! Partial fraction
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = advxb, advxe
+                                flux_rs${XYZ}$_vf(j, k, l, i) = 0._wp ! TODO multi-component (zero for now)
+                            end do
+
+                            flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+        #:endfor
+
+        call s_finalize_riemann_solver(flux_vf, flux_src_vf, flux_gsrc_vf, &
+                                       norm_dir)
+    end subroutine s_hlld_riemann_solver
+
+    !>  The computation of parameters, the allocation of memory,
+        !!      the association of pointers and/or the execution of any
+        !!      other procedures that are necessary to setup the module.
+    impure subroutine s_initialize_riemann_solvers_module
+
+        ! Allocating the variables that will be utilized to formulate the
+        ! left, right, and average states of the Riemann problem, as well
+        ! the Riemann problem solution
+        integer :: i, j
+
+        @:ALLOCATE(Gs_rs(1:num_fluids))
+
+        do i = 1, num_fluids
+            Gs_rs(i) = fluid_pp(i)%G
+        end do
+        $:GPU_UPDATE(device='[Gs_rs]')
+
+        if (viscous) then
+            $:GPU_UPDATE(device='[Re_idx,Re_size]')
+        end if
+
+        $:GPU_ENTER_DATA(copyin='[is1,is2,is3,isx,isy,isz]')
+
+        is1%beg = -1; is2%beg = 0; is3%beg = 0
+        is1%end = m; is2%end = n; is3%end = p
+
+        @:ALLOCATE(flux_rsx_vf(is1%beg:is1%end, &
+            is2%beg:is2%end, &
+            is3%beg:is3%end, 1:sys_size))
+        @:ALLOCATE(flux_gsrc_rsx_vf(is1%beg:is1%end, &
+            is2%beg:is2%end, &
+            is3%beg:is3%end, 1:sys_size))
+        @:ALLOCATE(flux_src_rsx_vf(is1%beg:is1%end, &
+            is2%beg:is2%end, &
+            is3%beg:is3%end, advxb:sys_size))
+        @:ALLOCATE(vel_src_rsx_vf(is1%beg:is1%end, &
+            is2%beg:is2%end, &
+            is3%beg:is3%end, 1:num_vels))
+        if (qbmm) then
+            @:ALLOCATE(mom_sp_rsx_vf(is1%beg:is1%end + 1, is2%beg:is2%end, is3%beg:is3%end, 1:4))
+        end if
+
+        if (viscous) then
+            @:ALLOCATE(Re_avg_rsx_vf(is1%beg:is1%end, &
+                is2%beg:is2%end, &
+                is3%beg:is3%end, 1:2))
+        end if
+
+        if (n == 0) return
+
+        is1%beg = -1; is2%beg = 0; is3%beg = 0
+        is1%end = n; is2%end = m; is3%end = p
+
+        @:ALLOCATE(flux_rsy_vf(is1%beg:is1%end, &
+            is2%beg:is2%end, &
+            is3%beg:is3%end, 1:sys_size))
+        @:ALLOCATE(flux_gsrc_rsy_vf(is1%beg:is1%end, &
+            is2%beg:is2%end, &
+            is3%beg:is3%end, 1:sys_size))
+        @:ALLOCATE(flux_src_rsy_vf(is1%beg:is1%end, &
+            is2%beg:is2%end, &
+            is3%beg:is3%end, advxb:sys_size))
+        @:ALLOCATE(vel_src_rsy_vf(is1%beg:is1%end, &
+            is2%beg:is2%end, &
+            is3%beg:is3%end, 1:num_vels))
+
+        if (qbmm) then
+            @:ALLOCATE(mom_sp_rsy_vf(is1%beg:is1%end + 1, is2%beg:is2%end, is3%beg:is3%end, 1:4))
+        end if
+
+        if (viscous) then
+            @:ALLOCATE(Re_avg_rsy_vf(is1%beg:is1%end, &
+                is2%beg:is2%end, &
+                is3%beg:is3%end, 1:2))
+        end if
+
+        if (p == 0) return
+
+        is1%beg = -1; is2%beg = 0; is3%beg = 0
+        is1%end = p; is2%end = n; is3%end = m
+
+        @:ALLOCATE(flux_rsz_vf(is1%beg:is1%end, &
+            is2%beg:is2%end, &
+            is3%beg:is3%end, 1:sys_size))
+        @:ALLOCATE(flux_gsrc_rsz_vf(is1%beg:is1%end, &
+            is2%beg:is2%end, &
+            is3%beg:is3%end, 1:sys_size))
+        @:ALLOCATE(flux_src_rsz_vf(is1%beg:is1%end, &
+            is2%beg:is2%end, &
+            is3%beg:is3%end, advxb:sys_size))
+        @:ALLOCATE(vel_src_rsz_vf(is1%beg:is1%end, &
+            is2%beg:is2%end, &
+            is3%beg:is3%end, 1:num_vels))
+
+        if (qbmm) then
+            @:ALLOCATE(mom_sp_rsz_vf(is1%beg:is1%end + 1, is2%beg:is2%end, is3%beg:is3%end, 1:4))
+        end if
+
+        if (viscous) then
+            @:ALLOCATE(Re_avg_rsz_vf(is1%beg:is1%end, &
+                is2%beg:is2%end, &
+                is3%beg:is3%end, 1:2))
+        end if
+
+    end subroutine s_initialize_riemann_solvers_module
+
+    !>  The purpose of this subroutine is to populate the buffers
+        !!      of the left and right Riemann states variables, depending
+        !!      on the boundary conditions.
+        !!  @param qL_prim_rsx_vf Left WENO-reconstructed cell-boundary values (x-dir)
+        !!  @param qL_prim_rsy_vf Left WENO-reconstructed cell-boundary values (y-dir)
+        !!  @param qL_prim_rsz_vf Left WENO-reconstructed cell-boundary values (z-dir)
+        !!  @param dqL_prim_dx_vf The  left WENO-reconstructed cell-boundary values of the
+        !!      first-order x-dir spatial derivatives
+        !!  @param dqL_prim_dy_vf The  left WENO-reconstructed cell-boundary values of the
+        !!      first-order y-dir spatial derivatives
+        !!  @param dqL_prim_dz_vf The  left WENO-reconstructed cell-boundary values of the
+        !!      first-order z-dir spatial derivatives
+        !!  @param qR_prim_rsx_vf Right WENO-reconstructed cell-boundary values (x-dir)
+        !!  @param qR_prim_rsy_vf Right WENO-reconstructed cell-boundary values (y-dir)
+        !!  @param qR_prim_rsz_vf Right WENO-reconstructed cell-boundary values (z-dir)
+        !!  @param dqR_prim_dx_vf The right WENO-reconstructed cell-boundary values of the
+        !!      first-order x-dir spatial derivatives
+        !!  @param dqR_prim_dy_vf The right WENO-reconstructed cell-boundary values of the
+        !!      first-order y-dir spatial derivatives
+        !!  @param dqR_prim_dz_vf The right WENO-reconstructed cell-boundary values of the
+        !!      first-order z-dir spatial derivatives
+        !!  @param norm_dir Dir. splitting direction
+        !!  @param ix Index bounds in the x-dir
+        !!  @param iy Index bounds in the y-dir
+        !!  @param iz Index bounds in the z-dir
+    subroutine s_populate_riemann_states_variables_buffers( &
+        qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, dqL_prim_dx_vf, &
+        dqL_prim_dy_vf, &
+        dqL_prim_dz_vf, &
+        qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, dqR_prim_dx_vf, &
+        dqR_prim_dy_vf, &
+        dqR_prim_dz_vf, &
+        norm_dir, ix, iy, iz)
+
+        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf
+
+        type(scalar_field), &
+            allocatable, dimension(:), &
+            intent(inout) :: dqL_prim_dx_vf, dqR_prim_dx_vf, &
+                             dqL_prim_dy_vf, dqR_prim_dy_vf, &
+                             dqL_prim_dz_vf, dqR_prim_dz_vf
+
+        integer, intent(in) :: norm_dir
+        type(int_bounds_info), intent(in) :: ix, iy, iz
+
+        integer :: i, j, k, l !< Generic loop iterator
+
+        if (norm_dir == 1) then
+            is1 = ix; is2 = iy; is3 = iz
+            dir_idx = (/1, 2, 3/); dir_flg = (/1._wp, 0._wp, 0._wp/)
+        elseif (norm_dir == 2) then
+            is1 = iy; is2 = ix; is3 = iz
+            dir_idx = (/2, 1, 3/); dir_flg = (/0._wp, 1._wp, 0._wp/)
+        else
+            is1 = iz; is2 = iy; is3 = ix
+            dir_idx = (/3, 1, 2/); dir_flg = (/0._wp, 0._wp, 1._wp/)
+        end if
+
+        $:GPU_UPDATE(device='[is1,is2,is3]')
+
+        if (elasticity) then
+            if (norm_dir == 1) then
+                dir_idx_tau = (/1, 2, 4/)
+            else if (norm_dir == 2) then
+                dir_idx_tau = (/3, 2, 5/)
+            else
+                dir_idx_tau = (/6, 4, 5/)
+            end if
+        end if
+
+        isx = ix; isy = iy; isz = iz
+        ! for stuff in the same module
+        $:GPU_UPDATE(device='[isx,isy,isz]')
+        ! for stuff in different modules
+        $:GPU_UPDATE(device='[dir_idx,dir_flg,dir_idx_tau]')
+
+        ! Population of Buffers in x-direction
+        if (norm_dir == 1) then
+
+            if (bc_x%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do i = 1, sys_size
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            qL_prim_rsx_vf(-1, k, l, i) = &
+                                qR_prim_rsx_vf(0, k, l, i)
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+
+                if (viscous .or. dummy) then
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do i = momxb, momxe
+                        do l = isz%beg, isz%end
+                            do k = isy%beg, isy%end
+
+                                dqL_prim_dx_vf(i)%sf(-1, k, l) = &
+                                    dqR_prim_dx_vf(i)%sf(0, k, l)
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    if (n > 0) then
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do i = momxb, momxe
+                            do l = isz%beg, isz%end
+                                do k = isy%beg, isy%end
+
+                                    dqL_prim_dy_vf(i)%sf(-1, k, l) = &
+                                        dqR_prim_dy_vf(i)%sf(0, k, l)
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+
+                        if (p > 0) then
+                            $:GPU_PARALLEL_LOOP(collapse=3)
+                            do i = momxb, momxe
+                                do l = isz%beg, isz%end
+                                    do k = isy%beg, isy%end
+
+                                        dqL_prim_dz_vf(i)%sf(-1, k, l) = &
+                                            dqR_prim_dz_vf(i)%sf(0, k, l)
+                                    end do
+                                end do
+                            end do
+                            $:END_GPU_PARALLEL_LOOP()
+                        end if
+
+                    end if
+
+                end if
+
+            end if
+
+            if (bc_x%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
+
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do i = 1, sys_size
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            qR_prim_rsx_vf(m + 1, k, l, i) = &
+                                qL_prim_rsx_vf(m, k, l, i)
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+
+                if (viscous .or. dummy) then
+
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do i = momxb, momxe
+                        do l = isz%beg, isz%end
+                            do k = isy%beg, isy%end
+
+                                dqR_prim_dx_vf(i)%sf(m + 1, k, l) = &
+                                    dqL_prim_dx_vf(i)%sf(m, k, l)
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    if (n > 0) then
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do i = momxb, momxe
+                            do l = isz%beg, isz%end
+                                do k = isy%beg, isy%end
+
+                                    dqR_prim_dy_vf(i)%sf(m + 1, k, l) = &
+                                        dqL_prim_dy_vf(i)%sf(m, k, l)
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+
+                        if (p > 0) then
+                            $:GPU_PARALLEL_LOOP(collapse=3)
+                            do i = momxb, momxe
+                                do l = isz%beg, isz%end
+                                    do k = isy%beg, isy%end
+
+                                        dqR_prim_dz_vf(i)%sf(m + 1, k, l) = &
+                                            dqL_prim_dz_vf(i)%sf(m, k, l)
+                                    end do
+                                end do
+                            end do
+                            $:END_GPU_PARALLEL_LOOP()
+                        end if
+
+                    end if
+
+                end if
+
+            end if
+            ! END: Population of Buffers in x-direction
+
+            ! Population of Buffers in y-direction
+        elseif (norm_dir == 2) then
+
+            if (bc_y%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do i = 1, sys_size
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            qL_prim_rsy_vf(-1, k, l, i) = &
+                                qR_prim_rsy_vf(0, k, l, i)
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+
+                if (viscous .or. dummy) then
+
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do i = momxb, momxe
+                        do l = isz%beg, isz%end
+                            do j = isx%beg, isx%end
+                                dqL_prim_dx_vf(i)%sf(j, -1, l) = &
+                                    dqR_prim_dx_vf(i)%sf(j, 0, l)
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do i = momxb, momxe
+                        do l = isz%beg, isz%end
+                            do j = isx%beg, isx%end
+                                dqL_prim_dy_vf(i)%sf(j, -1, l) = &
+                                    dqR_prim_dy_vf(i)%sf(j, 0, l)
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    if (p > 0) then
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do i = momxb, momxe
+                            do l = isz%beg, isz%end
+                                do j = isx%beg, isx%end
+                                    dqL_prim_dz_vf(i)%sf(j, -1, l) = &
+                                        dqR_prim_dz_vf(i)%sf(j, 0, l)
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+                    end if
+
+                end if
+
+            end if
+
+            if (bc_y%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
+
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do i = 1, sys_size
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            qR_prim_rsy_vf(n + 1, k, l, i) = &
+                                qL_prim_rsy_vf(n, k, l, i)
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+
+                if (viscous .or. dummy) then
+
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do i = momxb, momxe
+                        do l = isz%beg, isz%end
+                            do j = isx%beg, isx%end
+                                dqR_prim_dx_vf(i)%sf(j, n + 1, l) = &
+                                    dqL_prim_dx_vf(i)%sf(j, n, l)
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do i = momxb, momxe
+                        do l = isz%beg, isz%end
+                            do j = isx%beg, isx%end
+                                dqR_prim_dy_vf(i)%sf(j, n + 1, l) = &
+                                    dqL_prim_dy_vf(i)%sf(j, n, l)
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    if (p > 0) then
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do i = momxb, momxe
+                            do l = isz%beg, isz%end
+                                do j = isx%beg, isx%end
+                                    dqR_prim_dz_vf(i)%sf(j, n + 1, l) = &
+                                        dqL_prim_dz_vf(i)%sf(j, n, l)
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+                    end if
+
+                end if
+
+            end if
+            ! END: Population of Buffers in y-direction
+
+            ! Population of Buffers in z-direction
+        else
+
+            if (bc_z%beg == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at beginning
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do i = 1, sys_size
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            qL_prim_rsz_vf(-1, k, l, i) = &
+                                qR_prim_rsz_vf(0, k, l, i)
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+
+                if (viscous .or. dummy) then
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do i = momxb, momxe
+                        do k = isy%beg, isy%end
+                            do j = isx%beg, isx%end
+                                dqL_prim_dx_vf(i)%sf(j, k, -1) = &
+                                    dqR_prim_dx_vf(i)%sf(j, k, 0)
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do i = momxb, momxe
+                        do k = isy%beg, isy%end
+                            do j = isx%beg, isx%end
+                                dqL_prim_dy_vf(i)%sf(j, k, -1) = &
+                                    dqR_prim_dy_vf(i)%sf(j, k, 0)
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do i = momxb, momxe
+                        do k = isy%beg, isy%end
+                            do j = isx%beg, isx%end
+                                dqL_prim_dz_vf(i)%sf(j, k, -1) = &
+                                    dqR_prim_dz_vf(i)%sf(j, k, 0)
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                end if
+
+            end if
+
+            if (bc_z%end == BC_RIEMANN_EXTRAP) then    ! Riemann state extrap. BC at end
+
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do i = 1, sys_size
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            qR_prim_rsz_vf(p + 1, k, l, i) = &
+                                qL_prim_rsz_vf(p, k, l, i)
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+
+                if (viscous .or. dummy) then
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do i = momxb, momxe
+                        do k = isy%beg, isy%end
+                            do j = isx%beg, isx%end
+                                dqR_prim_dx_vf(i)%sf(j, k, p + 1) = &
+                                    dqL_prim_dx_vf(i)%sf(j, k, p)
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do i = momxb, momxe
+                        do k = isy%beg, isy%end
+                            do j = isx%beg, isx%end
+                                dqR_prim_dy_vf(i)%sf(j, k, p + 1) = &
+                                    dqL_prim_dy_vf(i)%sf(j, k, p)
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do i = momxb, momxe
+                        do k = isy%beg, isy%end
+                            do j = isx%beg, isx%end
+                                dqR_prim_dz_vf(i)%sf(j, k, p + 1) = &
+                                    dqL_prim_dz_vf(i)%sf(j, k, p)
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                end if
+
+            end if
+
+        end if
+        ! END: Population of Buffers in z-direction
+
+    end subroutine s_populate_riemann_states_variables_buffers
+
+    !>  The computation of parameters, the allocation of memory,
+        !!      the association of pointers and/or the execution of any
+        !!      other procedures needed to configure the chosen Riemann
+        !!      solver algorithm.
+        !!  @param flux_src_vf Intra-cell fluxes sources
+        !!  @param norm_dir Dir. splitting direction
+    subroutine s_initialize_riemann_solver( &
+        flux_src_vf, &
+        norm_dir)
+
+        type(scalar_field), &
+            dimension(sys_size), &
+            intent(inout) :: flux_src_vf
+
+        integer, intent(in) :: norm_dir
+
+        integer :: i, j, k, l ! Generic loop iterators
+
+        ! Reshaping Inputted Data in x-direction
+
+        if (norm_dir == 1) then
+
+            if (viscous .or. (surface_tension) .or. dummy) then
+
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = momxb, E_idx
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            do j = is1%beg, is1%end
+                                flux_src_vf(i)%sf(j, k, l) = 0._wp
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+            if (chem_params%diffusion) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = E_idx, chemxe
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            do j = is1%beg, is1%end
+                                if (i == E_idx .or. i >= chemxb) then
+                                    flux_src_vf(i)%sf(j, k, l) = 0._wp
+                                end if
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+            if (qbmm) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = 1, 4
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            do j = is1%beg, is1%end + 1
+                                mom_sp_rsx_vf(j, k, l, i) = mom_sp(i)%sf(j, k, l)
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+            ! Reshaping Inputted Data in y-direction
+        elseif (norm_dir == 2) then
+
+            if (viscous .or. (surface_tension) .or. dummy) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = momxb, E_idx
+                    do l = is3%beg, is3%end
+                        do j = is1%beg, is1%end
+                            do k = is2%beg, is2%end
+                                flux_src_vf(i)%sf(k, j, l) = 0._wp
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+            if (chem_params%diffusion) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = E_idx, chemxe
+                    do l = is3%beg, is3%end
+                        do j = is1%beg, is1%end
+                            do k = is2%beg, is2%end
+                                if (i == E_idx .or. i >= chemxb) then
+                                    flux_src_vf(i)%sf(k, j, l) = 0._wp
+                                end if
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+            if (qbmm) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = 1, 4
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            do j = is1%beg, is1%end + 1
+                                mom_sp_rsy_vf(j, k, l, i) = mom_sp(i)%sf(k, j, l)
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+            ! Reshaping Inputted Data in z-direction
+        else
+
+            if (viscous .or. (surface_tension) .or. dummy) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = momxb, E_idx
+                    do j = is1%beg, is1%end
+                        do k = is2%beg, is2%end
+                            do l = is3%beg, is3%end
+                                flux_src_vf(i)%sf(l, k, j) = 0._wp
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+            if (chem_params%diffusion) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = E_idx, chemxe
+                    do j = is1%beg, is1%end
+                        do k = is2%beg, is2%end
+                            do l = is3%beg, is3%end
+                                if (i == E_idx .or. i >= chemxb) then
+                                    flux_src_vf(i)%sf(l, k, j) = 0._wp
+                                end if
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+            if (qbmm) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = 1, 4
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            do j = is1%beg, is1%end + 1
+                                mom_sp_rsz_vf(j, k, l, i) = mom_sp(i)%sf(l, k, j)
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+        end if
+
+    end subroutine s_initialize_riemann_solver
+
+    !> @brief Computes cylindrical viscous source flux contributions for momentum and energy.
+        !! Calculates Cartesian components of the stress tensor using averaged velocity derivatives
+        !! and cylindrical geometric factors, then updates `flux_src_vf`.
+        !! Assumes x-dir is axial (z_cyl), y-dir is radial (r_cyl), z-dir is azimuthal (theta_cyl for derivatives).
+        !! @param[in] velL_vf Left boundary velocity (\f$v_x, v_y, v_z\f$) (num_dims scalar_field).
+        !! @param[in] dvelL_dx_vf Left boundary \f$\partial v_i/\partial x\f$ (num_dims scalar_field).
+        !! @param[in] dvelL_dy_vf Left boundary \f$\partial v_i/\partial y\f$ (num_dims scalar_field).
+        !! @param[in] dvelL_dz_vf Left boundary \f$\partial v_i/\partial z\f$ (num_dims scalar_field).
+        !! @param[in] velR_vf Right boundary velocity (\f$v_x, v_y, v_z\f$) (num_dims scalar_field).
+        !! @param[in] dvelR_dx_vf Right boundary \f$\partial v_i/\partial x\f$ (num_dims scalar_field).
+        !! @param[in] dvelR_dy_vf Right boundary \f$\partial v_i/\partial y\f$ (num_dims scalar_field).
+        !! @param[in] dvelR_dz_vf Right boundary \f$\partial v_i/\partial z\f$ (num_dims scalar_field).
+        !! @param[inout] flux_src_vf Intercell source flux array to update (sys_size scalar_field).
+        !! @param[in] norm_dir Interface normal direction (1=x-face, 2=y-face, 3=z-face).
+        !! @param[in] ix Global X-direction loop bounds (int_bounds_info).
+        !! @param[in] iy Global Y-direction loop bounds (int_bounds_info).
+        !! @param[in] iz Global Z-direction loop bounds (int_bounds_info).
+    subroutine s_compute_cylindrical_viscous_source_flux(velL_vf, &
+                                                         dvelL_dx_vf, dvelL_dy_vf, dvelL_dz_vf, &
+                                                         velR_vf, &
+                                                         dvelR_dx_vf, dvelR_dy_vf, dvelR_dz_vf, &
+                                                         flux_src_vf, norm_dir, ix, iy, iz)
+
+        type(scalar_field), dimension(num_dims), intent(in) :: velL_vf, velR_vf
+        type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dx_vf, dvelR_dx_vf
+        type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dy_vf, dvelR_dy_vf
+        type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dz_vf, dvelR_dz_vf
+        type(scalar_field), dimension(sys_size), intent(inout) :: flux_src_vf
+        integer, intent(in) :: norm_dir
+        type(int_bounds_info), intent(in) :: ix, iy, iz
+
+        ! Local variables
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: avg_v_int       !!< Averaged interface velocity (\f$v_x, v_y, v_z\f$) (grid directions).
+            real(wp), dimension(3) :: avg_dvdx_int    !!< Averaged interface \f$\partial v_i/\partial x\f$ (grid dir 1).
+            real(wp), dimension(3) :: avg_dvdy_int    !!< Averaged interface \f$\partial v_i/\partial y\f$ (grid dir 2).
+            real(wp), dimension(3) :: avg_dvdz_int    !!< Averaged interface \f$\partial v_i/\partial z\f$ (grid dir 3).
+            real(wp), dimension(3) :: vel_src_int !!< Interface velocity (\f$v_1,v_2,v_3\f$) (grid directions) for viscous work.
+            real(wp), dimension(3) :: stress_vector_shear !!< Shear stress vector (\f$\sigma_{N1}, \sigma_{N2}, \sigma_{N3}\f$) on N-face (grid directions).
+        #:else
+            real(wp), dimension(num_dims) :: avg_v_int       !!< Averaged interface velocity (\f$v_x, v_y, v_z\f$) (grid directions).
+            real(wp), dimension(num_dims) :: avg_dvdx_int    !!< Averaged interface \f$\partial v_i/\partial x\f$ (grid dir 1).
+            real(wp), dimension(num_dims) :: avg_dvdy_int    !!< Averaged interface \f$\partial v_i/\partial y\f$ (grid dir 2).
+            real(wp), dimension(num_dims) :: avg_dvdz_int    !!< Averaged interface \f$\partial v_i/\partial z\f$ (grid dir 3).
+            real(wp), dimension(num_dims) :: vel_src_int !!< Interface velocity (\f$v_1,v_2,v_3\f$) (grid directions) for viscous work.
+            real(wp), dimension(num_dims) :: stress_vector_shear !!< Shear stress vector (\f$\sigma_{N1}, \sigma_{N2}, \sigma_{N3}\f$) on N-face (grid directions).
+        #:endif
+        real(wp) :: stress_normal_bulk  !!< Normal bulk stress component \f$\sigma_{NN}\f$ on N-face.
+
+        real(wp) :: Re_s, Re_b        !!< Effective interface shear and bulk Reynolds numbers.
+        real(wp) :: r_eff             !!< Effective radius at interface for cylindrical terms.
+        real(wp) :: div_v_term_const  !!< Common term \f$-(2/3)(\nabla \cdot \mathbf{v}) / \text{Re}_s\f$ for shear stress diagonal.
+        real(wp) :: divergence_cyl    !!< Full divergence \f$\nabla \cdot \mathbf{v}\f$ in cylindrical coordinates.
+
+        integer :: j, k, l           !!< Loop iterators for \f$x, y, z\f$ grid directions.
+        integer :: i_vel             !!< Loop iterator for velocity components.
+        integer :: idx_rp(3)         !!< Indices \f$(j,k,l)\f$ of 'right' point for averaging.
+
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[idx_rp, avg_v_int, avg_dvdx_int, avg_dvdy_int, avg_dvdz_int, Re_s, Re_b, vel_src_int, r_eff, divergence_cyl, stress_vector_shear, stress_normal_bulk, div_v_term_const]')
+        do l = iz%beg, iz%end
+            do k = iy%beg, iy%end
+                do j = ix%beg, ix%end
+
+                    ! Determine indices for the 'right' state for averaging across the interface
+                    idx_rp = [j, k, l]
+                    idx_rp(norm_dir) = idx_rp(norm_dir) + 1
+
+                    ! Average velocities and their derivatives at the interface
+                    ! For cylindrical: x-dir ~ axial (z_cyl), y-dir ~ radial (r_cyl), z-dir ~ azimuthal (theta_cyl)
+                    $:GPU_LOOP(parallelism='[seq]')
+                    do i_vel = 1, num_dims
+                        avg_v_int(i_vel) = 0.5_wp*(velL_vf(i_vel)%sf(j, k, l) + velR_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3)))
+
+                        avg_dvdx_int(i_vel) = 0.5_wp*(dvelL_dx_vf(i_vel)%sf(j, k, l) + &
+                                                      dvelR_dx_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3)))
+                        if (num_dims > 1) then
+                            avg_dvdy_int(i_vel) = 0.5_wp*(dvelL_dy_vf(i_vel)%sf(j, k, l) + &
+                                                          dvelR_dy_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3)))
+                        else
+                            avg_dvdy_int(i_vel) = 0.0_wp
+                        end if
+                        if (num_dims > 2) then
+                            avg_dvdz_int(i_vel) = 0.5_wp*(dvelL_dz_vf(i_vel)%sf(j, k, l) + &
+                                                          dvelR_dz_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3)))
+                        else
+                            avg_dvdz_int(i_vel) = 0.0_wp
+                        end if
+                    end do
+
+                    ! Get Re numbers and interface velocity for viscous work
+                    select case (norm_dir)
+                    case (1) ! x-face (axial face in z_cyl direction)
+                        Re_s = Re_avg_rsx_vf(j, k, l, 1)
+                        Re_b = Re_avg_rsx_vf(j, k, l, 2)
+                        vel_src_int = vel_src_rsx_vf(j, k, l, 1:num_dims)
+                        r_eff = y_cc(k)
+                    case (2) ! y-face (radial face in r_cyl direction)
+                        Re_s = Re_avg_rsy_vf(k, j, l, 1)
+                        Re_b = Re_avg_rsy_vf(k, j, l, 2)
+                        vel_src_int = vel_src_rsy_vf(k, j, l, 1:num_dims)
+                        r_eff = y_cb(k)
+                    case (3) ! z-face (azimuthal face in theta_cyl direction)
+                        Re_s = Re_avg_rsz_vf(l, k, j, 1)
+                        Re_b = Re_avg_rsz_vf(l, k, j, 2)
+                        vel_src_int = vel_src_rsz_vf(l, k, j, 1:num_dims)
+                        r_eff = y_cc(k)
+                    end select
+
+                    ! Divergence in cylindrical coordinates (vx=vz_cyl, vy=vr_cyl, vz=vtheta_cyl)
+                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                        divergence_cyl = avg_dvdx_int(1) + avg_dvdy_int(2) + avg_v_int(2)/r_eff
+                        if (num_dims > 2) then
+                            #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                divergence_cyl = divergence_cyl + avg_dvdz_int(3)/r_eff
+                            #:endif
+                        end if
+                    #:endif
+
+                    stress_vector_shear = 0.0_wp
+                    stress_normal_bulk = 0.0_wp
+
+                    if (shear_stress) then
+                        div_v_term_const = -(2.0_wp/3.0_wp)*divergence_cyl/Re_s
+
+                        select case (norm_dir)
+                        case (1) ! X-face (axial normal, z_cyl)
+                            stress_vector_shear(1) = (2.0_wp*avg_dvdx_int(1))/Re_s + div_v_term_const
+                            if (num_dims > 1) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                                    stress_vector_shear(2) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s
+                                #:endif
+                            end if
+                            if (num_dims > 2) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                    stress_vector_shear(3) = (avg_dvdz_int(1)/r_eff + avg_dvdx_int(3))/Re_s
+                                #:endif
+                            end if
+                        case (2) ! Y-face (radial normal, r_cyl)
+                            if (num_dims > 1) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                                    stress_vector_shear(1) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s
+                                    stress_vector_shear(2) = (2.0_wp*avg_dvdy_int(2))/Re_s + div_v_term_const
+                                    if (num_dims > 2) then
+                                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                            stress_vector_shear(3) = (avg_dvdz_int(2)/r_eff - avg_v_int(3)/r_eff + avg_dvdy_int(3))/Re_s
+                                        #:endif
+                                    end if
+                                #:endif
+                            else
+                                stress_vector_shear(1) = (2.0_wp*avg_dvdx_int(1))/Re_s + div_v_term_const
+                            end if
+                        case (3) ! Z-face (azimuthal normal, theta_cyl)
+                            if (num_dims > 2) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                    stress_vector_shear(1) = (avg_dvdz_int(1)/r_eff + avg_dvdx_int(3))/Re_s
+                                    stress_vector_shear(2) = (avg_dvdz_int(2)/r_eff - avg_v_int(3)/r_eff + avg_dvdy_int(3))/Re_s
+                                    stress_vector_shear(3) = (2.0_wp*(avg_dvdz_int(3)/r_eff + avg_v_int(2)/r_eff))/Re_s + div_v_term_const
+                                #:endif
+                            end if
+                        end select
+
+                        $:GPU_LOOP(parallelism='[seq]')
+                        do i_vel = 1, num_dims
+                            flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) = flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) - stress_vector_shear(i_vel)
+                            flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - vel_src_int(i_vel)*stress_vector_shear(i_vel)
+                        end do
+                    end if
+
+                    if (bulk_stress) then
+                        stress_normal_bulk = divergence_cyl/Re_b
+
+                        flux_src_vf(momxb + norm_dir - 1)%sf(j, k, l) = flux_src_vf(momxb + norm_dir - 1)%sf(j, k, l) - stress_normal_bulk
+                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - vel_src_int(norm_dir)*stress_normal_bulk
+                    end if
+
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+    end subroutine s_compute_cylindrical_viscous_source_flux
+
+    !> @brief Computes Cartesian viscous source flux contributions for momentum and energy.
+    !! Calculates averaged velocity gradients, gets Re and interface velocities,
+    !! calls helpers for shear/bulk stress, then updates `flux_src_vf`.
+    !! @param[in] dvelL_dx_vf Left boundary d(vel)/dx (num_dims scalar_field).
+    !! @param[in] dvelL_dy_vf Left boundary d(vel)/dy (num_dims scalar_field).
+    !! @param[in] dvelL_dz_vf Left boundary d(vel)/dz (num_dims scalar_field).
+    !! @param[in] dvelR_dx_vf Right boundary d(vel)/dx (num_dims scalar_field).
+    !! @param[in] dvelR_dy_vf Right boundary d(vel)/dy (num_dims scalar_field).
+    !! @param[in] dvelR_dz_vf Right boundary d(vel)/dz (num_dims scalar_field).
+    !! @param[inout] flux_src_vf Intercell source flux array to update (sys_size scalar_field).
+    !! @param[in] norm_dir Interface normal direction (1=x, 2=y, 3=z).
+    subroutine s_compute_cartesian_viscous_source_flux(dvelL_dx_vf, &
+                                                       dvelL_dy_vf, &
+                                                       dvelL_dz_vf, &
+                                                       dvelR_dx_vf, &
+                                                       dvelR_dy_vf, &
+                                                       dvelR_dz_vf, &
+                                                       flux_src_vf, &
+                                                       norm_dir)
+
+        ! Arguments
+        type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dx_vf, dvelR_dx_vf
+        type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dy_vf, dvelR_dy_vf
+        type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dz_vf, dvelR_dz_vf
+        type(scalar_field), dimension(sys_size), intent(inout) :: flux_src_vf
+        integer, intent(in) :: norm_dir
+
+        ! Local variables
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3, 3) :: vel_grad_avg        !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
+            real(wp), dimension(3, 3) :: current_tau_shear   !< Current shear stress tensor.
+            real(wp), dimension(3, 3) :: current_tau_bulk    !< Current bulk stress tensor.
+            real(wp), dimension(3) :: vel_src_at_interface         !< Interface velocities (u,v,w) for viscous work.
+        #:else
+            real(wp), dimension(num_dims, num_dims) :: vel_grad_avg        !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
+            real(wp), dimension(num_dims, num_dims) :: current_tau_shear   !< Current shear stress tensor.
+            real(wp), dimension(num_dims, num_dims) :: current_tau_bulk    !< Current bulk stress tensor.
+            real(wp), dimension(num_dims) :: vel_src_at_interface         !< Interface velocities (u,v,w) for viscous work.
+        #:endif
+        integer, dimension(3) :: idx_right_phys                     !< Physical (j,k,l) indices for right state.
+
+        real(wp) :: Re_shear !< Interface shear Reynolds number.
+        real(wp) :: Re_bulk  !< Interface bulk Reynolds number.
+
+        integer :: j_loop         !< Physical x-index loop iterator.
+        integer :: k_loop         !< Physical y-index loop iterator.
+        integer :: l_loop         !< Physical z-index loop iterator.
+        integer :: i_dim          !< Generic dimension/component iterator.
+        integer :: vel_comp_idx   !< Velocity component iterator (1=u, 2=v, 3=w).
+
+        real(wp) :: divergence_v   !< Velocity divergence at interface.
+
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[idx_right_phys, vel_grad_avg, current_tau_shear, current_tau_bulk, vel_src_at_interface, Re_shear, Re_bulk, divergence_v, i_dim, vel_comp_idx]')
+        do l_loop = isz%beg, isz%end
+            do k_loop = isy%beg, isy%end
+                do j_loop = isx%beg, isx%end
+
+                    idx_right_phys(1) = j_loop
+                    idx_right_phys(2) = k_loop
+                    idx_right_phys(3) = l_loop
+                    idx_right_phys(norm_dir) = idx_right_phys(norm_dir) + 1
+
+                    vel_grad_avg = 0.0_wp
+                    do vel_comp_idx = 1, num_dims
+                        vel_grad_avg(vel_comp_idx, 1) = 0.5_wp*(dvelL_dx_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + &
+                                                                dvelR_dx_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3)))
+                        if (num_dims > 1) then
+                            #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                                vel_grad_avg(vel_comp_idx, 2) = 0.5_wp*(dvelL_dy_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + &
+                                                                        dvelR_dy_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3)))
+                            #:endif
+                        end if
+                        if (num_dims > 2) then
+                            #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                vel_grad_avg(vel_comp_idx, 3) = 0.5_wp*(dvelL_dz_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + &
+                                                                        dvelR_dz_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3)))
+                            #:endif
+                        end if
+                    end do
+
+                    divergence_v = 0.0_wp
+                    do i_dim = 1, num_dims
+                        divergence_v = divergence_v + vel_grad_avg(i_dim, i_dim)
+                    end do
+
+                    vel_src_at_interface = 0.0_wp
+                    if (norm_dir == 1) then
+                        Re_shear = Re_avg_rsx_vf(j_loop, k_loop, l_loop, 1)
+                        Re_bulk = Re_avg_rsx_vf(j_loop, k_loop, l_loop, 2)
+                        do i_dim = 1, num_dims
+                            vel_src_at_interface(i_dim) = vel_src_rsx_vf(j_loop, k_loop, l_loop, i_dim)
+                        end do
+                    else if (norm_dir == 2) then
+                        Re_shear = Re_avg_rsy_vf(k_loop, j_loop, l_loop, 1)
+                        Re_bulk = Re_avg_rsy_vf(k_loop, j_loop, l_loop, 2)
+                        do i_dim = 1, num_dims
+                            vel_src_at_interface(i_dim) = vel_src_rsy_vf(k_loop, j_loop, l_loop, i_dim)
+                        end do
+                    else
+                        Re_shear = Re_avg_rsz_vf(l_loop, k_loop, j_loop, 1)
+                        Re_bulk = Re_avg_rsz_vf(l_loop, k_loop, j_loop, 2)
+                        do i_dim = 1, num_dims
+                            vel_src_at_interface(i_dim) = vel_src_rsz_vf(l_loop, k_loop, j_loop, i_dim)
+                        end do
+                    end if
+
+                    if (shear_stress) then
+                        ! current_tau_shear = 0.0_wp
+                        call s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, current_tau_shear)
+
+                        do i_dim = 1, num_dims
+                            flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) = &
+                                flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) - current_tau_shear(norm_dir, i_dim)
+
+                            flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) = &
+                                flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) - &
+                                vel_src_at_interface(i_dim)*current_tau_shear(norm_dir, i_dim)
+                        end do
+                    end if
+
+                    if (bulk_stress) then
+                        ! current_tau_bulk = 0.0_wp
+                        call s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, current_tau_bulk)
+
+                        do i_dim = 1, num_dims
+                            flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) = &
+                                flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) - current_tau_bulk(norm_dir, i_dim)
+
+                            flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) = &
+                                flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) - &
+                                vel_src_at_interface(i_dim)*current_tau_bulk(norm_dir, i_dim)
+                        end do
+                    end if
+
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+    end subroutine s_compute_cartesian_viscous_source_flux
+
+    !> @brief Calculates shear stress tensor components.
+    !! tau_ij_shear = ( (dui/dxj + duj/dxi) - (2/3)*(div_v)*delta_ij ) / Re_shear
+    !! @param[in] vel_grad_avg Averaged velocity gradient tensor (d(vel_i)/d(coord_j)).
+    !! @param[in] Re_shear Shear Reynolds number.
+    !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz).
+    !! @param[out] tau_shear_out Calculated shear stress tensor (stress on i-face, j-direction).
+    subroutine s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, tau_shear_out)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        ! Arguments
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3, 3), intent(in) :: vel_grad_avg
+            real(wp), dimension(3, 3), intent(out) :: tau_shear_out
+        #:else
+            real(wp), dimension(num_dims, num_dims), intent(in) :: vel_grad_avg
+            real(wp), dimension(num_dims, num_dims), intent(out) :: tau_shear_out
+        #:endif
+        real(wp), intent(in) :: Re_shear
+        real(wp), intent(in) :: divergence_v
+
+        ! Local variables
+        integer :: i_dim !< Loop iterator for face normal.
+        integer :: j_dim !< Loop iterator for force component direction.
+
+        tau_shear_out = 0.0_wp
+
+        do i_dim = 1, num_dims
+            do j_dim = 1, num_dims
+                tau_shear_out(i_dim, j_dim) = (vel_grad_avg(j_dim, i_dim) + vel_grad_avg(i_dim, j_dim))/Re_shear
+                if (i_dim == j_dim) then
+                    tau_shear_out(i_dim, j_dim) = tau_shear_out(i_dim, j_dim) - &
+                                                  (2.0_wp/3.0_wp)*divergence_v/Re_shear
+                end if
+            end do
+        end do
+
+    end subroutine s_calculate_shear_stress_tensor
+
+    !> @brief Calculates bulk stress tensor components (diagonal only).
+    !! tau_ii_bulk = (div_v) / Re_bulk. Off-diagonals are zero.
+    !! @param[in] Re_bulk Bulk Reynolds number.
+    !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz).
+    !! @param[out] tau_bulk_out Calculated bulk stress tensor (stress on i-face, i-direction).
+    subroutine s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, tau_bulk_out)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        ! Arguments
+        real(wp), intent(in) :: Re_bulk
+        real(wp), intent(in) :: divergence_v
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3, 3), intent(out) :: tau_bulk_out
+        #:else
+            real(wp), dimension(num_dims, num_dims), intent(out) :: tau_bulk_out
+        #:endif
+
+        ! Local variables
+        integer :: i_dim !< Loop iterator for diagonal components.
+
+        tau_bulk_out = 0.0_wp
+
+        do i_dim = 1, num_dims
+            tau_bulk_out(i_dim, i_dim) = divergence_v/Re_bulk
+        end do
+
+    end subroutine s_calculate_bulk_stress_tensor
+
+    !>  Deallocation and/or disassociation procedures that are
+        !!      needed to finalize the selected Riemann problem solver
+        !!  @param flux_vf       Intercell fluxes
+        !!  @param flux_src_vf   Intercell source fluxes
+        !!  @param flux_gsrc_vf  Intercell geometric source fluxes
+        !!  @param norm_dir Dimensional splitting coordinate direction
+    subroutine s_finalize_riemann_solver(flux_vf, flux_src_vf, &
+                                         flux_gsrc_vf, &
+                                         norm_dir)
+
+        type(scalar_field), &
+            dimension(sys_size), &
+            intent(inout) :: flux_vf, flux_src_vf, flux_gsrc_vf
+
+        integer, intent(in) :: norm_dir
+
+        integer :: i, j, k, l !< Generic loop iterators
+
+        ! Reshaping Outputted Data in y-direction
+        if (norm_dir == 2) then
+            $:GPU_PARALLEL_LOOP(collapse=4)
+            do i = 1, sys_size
+                do l = is3%beg, is3%end
+                    do j = is1%beg, is1%end
+                        do k = is2%beg, is2%end
+                            flux_vf(i)%sf(k, j, l) = &
+                                flux_rsy_vf(j, k, l, i)
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            if (cyl_coord) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = 1, sys_size
+                    do l = is3%beg, is3%end
+                        do j = is1%beg, is1%end
+                            do k = is2%beg, is2%end
+                                flux_gsrc_vf(i)%sf(k, j, l) = &
+                                    flux_gsrc_rsy_vf(j, k, l, i)
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do l = is3%beg, is3%end
+                do j = is1%beg, is1%end
+                    do k = is2%beg, is2%end
+                        flux_src_vf(advxb)%sf(k, j, l) = &
+                            flux_src_rsy_vf(j, k, l, advxb)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            if (riemann_solver == 1 .or. riemann_solver == 4) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = advxb + 1, advxe
+                    do l = is3%beg, is3%end
+                        do j = is1%beg, is1%end
+                            do k = is2%beg, is2%end
+                                flux_src_vf(i)%sf(k, j, l) = &
+                                    flux_src_rsy_vf(j, k, l, i)
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+
+            end if
+            ! Reshaping Outputted Data in z-direction
+        elseif (norm_dir == 3) then
+            $:GPU_PARALLEL_LOOP(collapse=4)
+            do i = 1, sys_size
+                do j = is1%beg, is1%end
+                    do k = is2%beg, is2%end
+                        do l = is3%beg, is3%end
+
+                            flux_vf(i)%sf(l, k, j) = &
+                                flux_rsz_vf(j, k, l, i)
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+            if (grid_geometry == 3) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = 1, sys_size
+                    do j = is1%beg, is1%end
+                        do k = is2%beg, is2%end
+                            do l = is3%beg, is3%end
+
+                                flux_gsrc_vf(i)%sf(l, k, j) = &
+                                    flux_gsrc_rsz_vf(j, k, l, i)
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do j = is1%beg, is1%end
+                do k = is2%beg, is2%end
+                    do l = is3%beg, is3%end
+                        flux_src_vf(advxb)%sf(l, k, j) = &
+                            flux_src_rsz_vf(j, k, l, advxb)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            if (riemann_solver == 1 .or. riemann_solver == 4) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = advxb + 1, advxe
+                    do j = is1%beg, is1%end
+                        do k = is2%beg, is2%end
+                            do l = is3%beg, is3%end
+                                flux_src_vf(i)%sf(l, k, j) = &
+                                    flux_src_rsz_vf(j, k, l, i)
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+
+            end if
+        elseif (norm_dir == 1) then
+            $:GPU_PARALLEL_LOOP(collapse=4)
+            do i = 1, sys_size
+                do l = is3%beg, is3%end
+                    do k = is2%beg, is2%end
+                        do j = is1%beg, is1%end
+                            flux_vf(i)%sf(j, k, l) = &
+                                flux_rsx_vf(j, k, l, i)
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do l = is3%beg, is3%end
+                do k = is2%beg, is2%end
+                    do j = is1%beg, is1%end
+                        flux_src_vf(advxb)%sf(j, k, l) = &
+                            flux_src_rsx_vf(j, k, l, advxb)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            if (riemann_solver == 1 .or. riemann_solver == 4) then
+                $:GPU_PARALLEL_LOOP(collapse=4)
+                do i = advxb + 1, advxe
+                    do l = is3%beg, is3%end
+                        do k = is2%beg, is2%end
+                            do j = is1%beg, is1%end
+                                flux_src_vf(i)%sf(j, k, l) = &
+                                    flux_src_rsx_vf(j, k, l, i)
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+        end if
+
+    end subroutine s_finalize_riemann_solver
+
+    !> Module deallocation and/or disassociation procedures
+    impure subroutine s_finalize_riemann_solvers_module
+
+        if (viscous) then
+            @:DEALLOCATE(Re_avg_rsx_vf)
+        end if
+        @:DEALLOCATE(vel_src_rsx_vf)
+        @:DEALLOCATE(flux_rsx_vf)
+        @:DEALLOCATE(flux_src_rsx_vf)
+        @:DEALLOCATE(flux_gsrc_rsx_vf)
+        if (qbmm) then
+            @:DEALLOCATE(mom_sp_rsx_vf)
+        end if
+
+        if (n == 0) return
+
+        if (viscous) then
+            @:DEALLOCATE(Re_avg_rsy_vf)
+        end if
+        @:DEALLOCATE(vel_src_rsy_vf)
+        @:DEALLOCATE(flux_rsy_vf)
+        @:DEALLOCATE(flux_src_rsy_vf)
+        @:DEALLOCATE(flux_gsrc_rsy_vf)
+        if (qbmm) then
+            @:DEALLOCATE(mom_sp_rsy_vf)
+        end if
+
+        if (p == 0) return
+
+        if (viscous) then
+            @:DEALLOCATE(Re_avg_rsz_vf)
+        end if
+        @:DEALLOCATE(vel_src_rsz_vf)
+        @:DEALLOCATE(flux_rsz_vf)
+        @:DEALLOCATE(flux_src_rsz_vf)
+        @:DEALLOCATE(flux_gsrc_rsz_vf)
+        if (qbmm) then
+            @:DEALLOCATE(mom_sp_rsz_vf)
+        end if
+
+    end subroutine s_finalize_riemann_solvers_module
+
+end module m_riemann_solvers
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 2835055667..c57338f9fd 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -1,1061 +1,1072 @@
-!>
-!! @file
-!! @brief Contains module m_time_steppers
-
-#:include 'macros.fpp'
-#:include 'case.fpp'
-
-!> @brief Total-variation-diminishing (TVD) Runge--Kutta time integrators (1st-, 2nd-, and 3rd-order SSP)
-module m_time_steppers
-
-    use m_derived_types        !< Definitions of the derived types
-
-    use m_global_parameters    !< Definitions of the global parameters
-
-    use m_rhs                  !< Right-hane-side (RHS) evaluation procedures
-
-    use m_pressure_relaxation  !< Pressure relaxation procedures
-
-    use m_data_output          !< Run-time info & solution data output procedures
-
-    use m_bubbles_EE           !< Ensemble-averaged bubble dynamics routines
-
-    use m_bubbles_EL           !< Lagrange bubble dynamics routines
-
-    use m_ibm
-
-    use m_hyperelastic
-
-    use m_mpi_proxy            !< Message passing interface (MPI) module proxy
-
-    use m_boundary_common
-
-    use m_helper
-
-    use m_sim_helpers
-
-    use m_fftw
-
-    use m_nvtx
-
-    use m_thermochem, only: num_species
-
-    use m_body_forces
-
-    use m_derived_variables
-
-    implicit none
-
-    type(vector_field), allocatable, dimension(:) :: q_cons_ts !<
-    !! Cell-average conservative variables at each time-stage (TS)
-
-    type(scalar_field), allocatable, dimension(:) :: q_prim_vf !<
-    !! Cell-average primitive variables at the current time-stage
-
-    type(scalar_field), allocatable, dimension(:) :: rhs_vf !<
-    !! Cell-average RHS variables at the current time-stage
-
-    type(integer_field), allocatable, dimension(:, :) :: bc_type !<
-    !! Boundary condition identifiers
-
-    type(vector_field), allocatable, dimension(:) :: q_prim_ts1, q_prim_ts2 !<
-    !! Cell-average primitive variables at consecutive TIMESTEPS
-
-    real(wp), allocatable, dimension(:, :, :, :, :) :: rhs_pb
-
-    type(scalar_field) :: q_T_sf !<
-    !! Cell-average temperature variables at the current time-stage
-
-    real(wp), allocatable, dimension(:, :, :, :, :) :: rhs_mv
-
-    real(wp), allocatable, dimension(:, :, :) :: max_dt
-
-    integer, private :: num_ts !<
-    !! Number of time stages in the time-stepping scheme
-
-    integer :: stor !< storage index
-    real(wp), allocatable, dimension(:, :) :: rk_coef
-    integer, private :: num_probe_ts
-
-    $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts1,q_prim_ts2,rhs_mv,rhs_pb,max_dt,rk_coef,stor,bc_type]')
-
-!> @cond
-#if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
-    real(stp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host
-#elif defined(FRONTIER_UNIFIED)
-    real(stp), pointer, contiguous, dimension(:, :, :, :) :: q_cons_ts_pool_host, q_cons_ts_pool_device
-    integer(kind=8) :: pool_dims(4), pool_starts(4)
-    integer(kind=8) :: pool_size
-    type(c_ptr) :: cptr_host, cptr_device
-#endif
-!> @endcond
-
-contains
-
-    !> The computation of parameters, the allocation of memory,
-        !!      the association of pointers and/or the execution of any
-        !!      other procedures that are necessary to setup the module.
-    impure subroutine s_initialize_time_steppers_module
-#ifdef FRONTIER_UNIFIED
-        use hipfort
-        use hipfort_hipmalloc
-        use hipfort_check
-#if defined(MFC_OpenACC)
-        use openacc
-#endif
-#endif
-        integer :: i, j !< Generic loop iterators
-
-        ! Setting number of time-stages for selected time-stepping scheme
-        if (time_stepper == 1) then
-            num_ts = 1
-        elseif (any(time_stepper == (/2, 3/))) then
-            num_ts = 2
-        end if
-
-        if (probe_wrt) then
-            num_probe_ts = 2
-        end if
-
-        ! Allocating the cell-average conservative variables
-        @:ALLOCATE(q_cons_ts(1:num_ts))
-        @:PREFER_GPU(q_cons_ts)
-
-        do i = 1, num_ts
-            @:ALLOCATE(q_cons_ts(i)%vf(1:sys_size))
-            @:PREFER_GPU(q_cons_ts(i)%vf)
-        end do
-
-!> @cond
-#if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
-        if (num_ts == 2 .and. nv_uvm_out_of_core) then
-            ! host allocation for q_cons_ts(2)%vf(j)%sf for all j
-            allocate (q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
-                                          idwbuff(2)%beg:idwbuff(2)%end, &
-                                          idwbuff(3)%beg:idwbuff(3)%end, &
-                                          1:sys_size))
-        end if
-
-        do j = 1, sys_size
-            ! q_cons_ts(1) lives on the device
-            @:ALLOCATE(q_cons_ts(1)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end))
-            @:PREFER_GPU(q_cons_ts(1)%vf(j)%sf)
-            if (num_ts == 2) then
-                if (nv_uvm_out_of_core) then
-                    ! q_cons_ts(2) lives on the host
-                    q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                                          idwbuff(2)%beg:idwbuff(2)%end, &
-                                          idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:, :, :, j)
-                else
-                    @:ALLOCATE(q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                    @:PREFER_GPU(q_cons_ts(2)%vf(j)%sf)
-                end if
-            end if
-        end do
-
-        do i = 1, num_ts
-            @:ACC_SETUP_VFs(q_cons_ts(i))
-        end do
-#elif defined(FRONTIER_UNIFIED)
-        ! Allocate to memory regions using hip calls
-        ! that we will attach pointers to
-        do i = 1, 3
-            pool_dims(i) = idwbuff(i)%end - idwbuff(i)%beg + 1
-            pool_starts(i) = idwbuff(i)%beg
-        end do
-        pool_dims(4) = sys_size
-        pool_starts(4) = 1
-#ifdef MFC_MIXED_PRECISION
-        pool_size = 1_8*(idwbuff(1)%end - idwbuff(1)%beg + 1)*(idwbuff(2)%end - idwbuff(2)%beg + 1)*(idwbuff(3)%end - idwbuff(3)%beg + 1)*sys_size
-        call hipCheck(hipMalloc_(cptr_device, pool_size*2_8))
-        call c_f_pointer(cptr_device, q_cons_ts_pool_device, shape=pool_dims)
-        q_cons_ts_pool_device(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:) => q_cons_ts_pool_device
-
-        call hipCheck(hipMallocManaged_(cptr_host, pool_size*2_8, hipMemAttachGlobal))
-        call c_f_pointer(cptr_host, q_cons_ts_pool_host, shape=pool_dims)
-        q_cons_ts_pool_host(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:) => q_cons_ts_pool_host
-#else
-        ! Doing hipMalloc then mapping should be most performant
-        call hipCheck(hipMalloc(q_cons_ts_pool_device, dims8=pool_dims, lbounds8=pool_starts))
-        ! Without this map CCE will still create a device copy, because it's silly like that
-#if defined(MFC_OpenACC)
-        call acc_map_data(q_cons_ts_pool_device, c_loc(q_cons_ts_pool_device), c_sizeof(q_cons_ts_pool_device))
-#endif
-        ! CCE see it can access this and will leave it on the host. It will stay on the host so long as HSA_XNACK=1
-        ! NOTE: WE CANNOT DO ATOMICS INTO THIS MEMORY. We have to change a property to use atomics here
-        ! Otherwise leaving this as fine-grained will actually help performance since it can't be cached in GPU L2
-        if (num_ts == 2) then
-            call hipCheck(hipMallocManaged(q_cons_ts_pool_host, dims8=pool_dims, lbounds8=pool_starts, flags=hipMemAttachGlobal))
-#if defined(MFC_OpenMP)
-            call hipCheck(hipMemAdvise(c_loc(q_cons_ts_pool_host), c_sizeof(q_cons_ts_pool_host), hipMemAdviseSetPreferredLocation, -1))
-#endif
-        end if
-#endif
-
-        do j = 1, sys_size
-            ! q_cons_ts(1) lives on the device
-            q_cons_ts(1)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                                  idwbuff(2)%beg:idwbuff(2)%end, &
-                                  idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_device(:, :, :, j)
-            if (num_ts == 2) then
-                ! q_cons_ts(2) lives on the host
-                q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                                      idwbuff(2)%beg:idwbuff(2)%end, &
-                                      idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:, :, :, j)
-            end if
-        end do
-
-        do i = 1, num_ts
-            @:ACC_SETUP_VFs(q_cons_ts(i))
-            do j = 1, sys_size
-                $:GPU_UPDATE(device='[q_cons_ts(i)%vf(j)]')
-            end do
-        end do
-#else
-!> @endcond
-        do i = 1, num_ts
-            do j = 1, sys_size
-                @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                    idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end))
-            end do
-            @:ACC_SETUP_VFs(q_cons_ts(i))
-        end do
-!> @cond
-#endif
-!> @endcond
-
-        ! Allocating the cell-average primitive ts variables
-        if (probe_wrt) then
-            @:ALLOCATE(q_prim_ts1(1:num_probe_ts))
-
-            do i = 1, num_probe_ts
-                @:ALLOCATE(q_prim_ts1(i)%vf(1:sys_size))
-            end do
-
-            do i = 1, num_probe_ts
-                do j = 1, sys_size
-                    @:ALLOCATE(q_prim_ts1(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                end do
-                @:ACC_SETUP_VFs(q_prim_ts1(i))
-            end do
-
-            @:ALLOCATE(q_prim_ts2(1:num_probe_ts))
-
-            do i = 1, num_probe_ts
-                @:ALLOCATE(q_prim_ts2(i)%vf(1:sys_size))
-            end do
-
-            do i = 1, num_probe_ts
-                do j = 1, sys_size
-                    @:ALLOCATE(q_prim_ts2(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                end do
-                @:ACC_SETUP_VFs(q_prim_ts2(i))
-            end do
-        end if
-
-        ! Allocating the cell-average primitive variables
-        @:ALLOCATE(q_prim_vf(1:sys_size))
-
-        if (.not. igr) then
-            do i = 1, adv_idx%end
-                @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                    idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end))
-                @:ACC_SETUP_SFs(q_prim_vf(i))
-            end do
-
-            if (bubbles_euler) then
-                do i = bub_idx%beg, bub_idx%end
-                    @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                    @:ACC_SETUP_SFs(q_prim_vf(i))
-                end do
-                if (adv_n) then
-                    @:ALLOCATE(q_prim_vf(n_idx)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                    @:ACC_SETUP_SFs(q_prim_vf(n_idx))
-                end if
-            end if
-
-            if (mhd) then
-                do i = B_idx%beg, B_idx%end
-                    @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                    @:ACC_SETUP_SFs(q_prim_vf(i))
-                end do
-            end if
-
-            if (elasticity) then
-                do i = stress_idx%beg, stress_idx%end
-                    @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                    @:ACC_SETUP_SFs(q_prim_vf(i))
-                end do
-            end if
-
-            if (hyperelasticity) then
-                do i = xibeg, xiend + 1
-                    @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                    @:ACC_SETUP_SFs(q_prim_vf(i))
-                end do
-            end if
-
-            if (cont_damage) then
-                @:ALLOCATE(q_prim_vf(damage_idx)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                    idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end))
-                @:ACC_SETUP_SFs(q_prim_vf(damage_idx))
-            end if
-
-            if (hyper_cleaning) then
-                @:ALLOCATE(q_prim_vf(psi_idx)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                    idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end))
-                @:ACC_SETUP_SFs(q_prim_vf(psi_idx))
-            end if
-
-            if (model_eqns == 3) then
-                do i = internalEnergies_idx%beg, internalEnergies_idx%end
-                    @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                    @:ACC_SETUP_SFs(q_prim_vf(i))
-                end do
-            end if
-
-            if (surface_tension) then
-                @:ALLOCATE(q_prim_vf(c_idx)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                    idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end))
-                @:ACC_SETUP_SFs(q_prim_vf(c_idx))
-            end if
-
-            if (chemistry) then
-                do i = chemxb, chemxe
-                    @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                    @:ACC_SETUP_SFs(q_prim_vf(i))
-                end do
-
-                @:ALLOCATE(q_T_sf%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                    idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end))
-                @:ACC_SETUP_SFs(q_T_sf)
-            end if
-        end if
-
-        @:ALLOCATE(pb_ts(1:2))
-        !Initialize bubble variables pb and mv at all quadrature nodes for all R0 bins
-        if (qbmm .and. (.not. polytropic)) then
-            @:ALLOCATE(pb_ts(1)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end, 1:nnode, 1:nb))
-            @:ACC_SETUP_SFs(pb_ts(1))
-
-            @:ALLOCATE(pb_ts(2)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end, 1:nnode, 1:nb))
-            @:ACC_SETUP_SFs(pb_ts(2))
-
-            @:ALLOCATE(rhs_pb(idwbuff(1)%beg:idwbuff(1)%end, &
-                idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end, 1:nnode, 1:nb))
-        else if (qbmm .and. polytropic) then
-            @:ALLOCATE(pb_ts(1)%sf(idwbuff(1)%beg:idwbuff(1)%beg + 1, &
-                idwbuff(2)%beg:idwbuff(2)%beg + 1, &
-                idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb))
-            @:ACC_SETUP_SFs(pb_ts(1))
-
-            @:ALLOCATE(pb_ts(2)%sf(idwbuff(1)%beg:idwbuff(1)%beg + 1, &
-                idwbuff(2)%beg:idwbuff(2)%beg + 1, &
-                idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb))
-            @:ACC_SETUP_SFs(pb_ts(2))
-
-            @:ALLOCATE(rhs_pb(idwbuff(1)%beg:idwbuff(1)%beg + 1, &
-                idwbuff(2)%beg:idwbuff(2)%beg + 1, &
-                idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb))
-        else
-            @:ALLOCATE(pb_ts(1)%sf(0,0,0,0,0))
-            @:ACC_SETUP_SFs(pb_ts(1))
-
-            @:ALLOCATE(pb_ts(2)%sf(0,0,0,0,0))
-            @:ACC_SETUP_SFs(pb_ts(2))
-
-            @:ALLOCATE(rhs_pb(0,0,0,0,0))
-        end if
-
-        @:ALLOCATE(mv_ts(1:2))
-
-        if (qbmm .and. (.not. polytropic)) then
-            @:ALLOCATE(mv_ts(1)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end, 1:nnode, 1:nb))
-            @:ACC_SETUP_SFs(mv_ts(1))
-
-            @:ALLOCATE(mv_ts(2)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end, 1:nnode, 1:nb))
-            @:ACC_SETUP_SFs(mv_ts(2))
-
-            @:ALLOCATE(rhs_mv(idwbuff(1)%beg:idwbuff(1)%end, &
-                idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end, 1:nnode, 1:nb))
-
-        else if (qbmm .and. polytropic) then
-            @:ALLOCATE(mv_ts(1)%sf(idwbuff(1)%beg:idwbuff(1)%beg + 1, &
-                idwbuff(2)%beg:idwbuff(2)%beg + 1, &
-                idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb))
-            @:ACC_SETUP_SFs(mv_ts(1))
-
-            @:ALLOCATE(mv_ts(2)%sf(idwbuff(1)%beg:idwbuff(1)%beg + 1, &
-                idwbuff(2)%beg:idwbuff(2)%beg + 1, &
-                idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb))
-            @:ACC_SETUP_SFs(mv_ts(2))
-
-            @:ALLOCATE(rhs_mv(idwbuff(1)%beg:idwbuff(1)%beg + 1, &
-                idwbuff(2)%beg:idwbuff(2)%beg + 1, &
-                idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb))
-        else
-            @:ALLOCATE(mv_ts(1)%sf(0,0,0,0,0))
-            @:ACC_SETUP_SFs(mv_ts(1))
-
-            @:ALLOCATE(mv_ts(2)%sf(0,0,0,0,0))
-            @:ACC_SETUP_SFs(mv_ts(2))
-
-            @:ALLOCATE(rhs_mv(0,0,0,0,0))
-        end if
-
-        ! Allocating the cell-average RHS variables
-        @:ALLOCATE(rhs_vf(1:sys_size))
-        @:PREFER_GPU(rhs_vf)
-
-        if (igr) then
-            do i = 1, sys_size
-                @:ALLOCATE(rhs_vf(i)%sf(-1:m+1,-1:n+1,-1:p+1))
-                @:ACC_SETUP_SFs(rhs_vf(i))
-                @:PREFER_GPU(rhs_vf(i)%sf)
-            end do
-        else
-            do i = 1, sys_size
-                @:ALLOCATE(rhs_vf(i)%sf(0:m, 0:n, 0:p))
-                @:ACC_SETUP_SFs(rhs_vf(i))
-            end do
-        end if
-
-        ! Opening and writing the header of the run-time information file
-        if (proc_rank == 0 .and. run_time_info) then
-            call s_open_run_time_information_file()
-        end if
-
-        if (cfl_dt) then
-            @:ALLOCATE(max_dt(0:m, 0:n, 0:p))
-        end if
-
-        ! Allocating arrays to store the bc types
-        @:ALLOCATE(bc_type(1:num_dims,1:2))
-
-        @:ALLOCATE(bc_type(1,1)%sf(0:0,0:n,0:p))
-        @:ALLOCATE(bc_type(1,2)%sf(0:0,0:n,0:p))
-        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-            if (n > 0) then
-                @:ALLOCATE(bc_type(2,1)%sf(-buff_size:m+buff_size,0:0,0:p))
-                @:ALLOCATE(bc_type(2,2)%sf(-buff_size:m+buff_size,0:0,0:p))
-                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                    if (p > 0) then
-                        @:ALLOCATE(bc_type(3,1)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,0:0))
-                        @:ALLOCATE(bc_type(3,2)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,0:0))
-                    end if
-                #:endif
-            end if
-        #:endif
-
-        do i = 1, num_dims
-            do j = 1, 2
-                @:ACC_SETUP_SFs(bc_type(i,j))
-            end do
-        end do
-
-        if (any(time_stepper == (/1, 2, 3/))) then
-            ! temporary array index for TVD RK
-            if (time_stepper == 1) then
-                stor = 1
-            else
-                stor = 2
-            end if
-
-            ! TVD RK coefficients
-            @:ALLOCATE (rk_coef(time_stepper, 4))
-            if (time_stepper == 1) then
-                rk_coef(1, :) = (/1._wp, 0._wp, 1._wp, 1._wp/)
-            else if (time_stepper == 2) then
-                rk_coef(1, :) = (/1._wp, 0._wp, 1._wp, 1._wp/)
-                rk_coef(2, :) = (/1._wp, 1._wp, 1._wp, 2._wp/)
-            else if (time_stepper == 3) then
-                rk_coef(1, :) = (/1._wp, 0._wp, 1._wp, 1._wp/)
-                rk_coef(2, :) = (/1._wp, 3._wp, 1._wp, 4._wp/)
-                rk_coef(3, :) = (/2._wp, 1._wp, 2._wp, 3._wp/)
-            end if
-            $:GPU_UPDATE(device='[rk_coef, stor]')
-        end if
-
-    end subroutine s_initialize_time_steppers_module
-
-    !> @brief Advances the solution one full step using a TVD Runge-Kutta time integrator.
-    impure subroutine s_tvd_rk(t_step, time_avg, nstage)
-#ifdef _CRAYFTN
-        !DIR$ OPTIMIZE (-haggress)
-#endif
-        integer, intent(in) :: t_step
-        real(wp), intent(inout) :: time_avg
-        integer, intent(in) :: nstage
-
-        integer :: i, j, k, l, q, s !< Generic loop iterator
-        real(wp) :: start, finish
-        integer :: dest
-
-        call cpu_time(start)
-        call nvtxStartRange("TIMESTEP")
-
-        ! Adaptive dt: initial stage
-        if (adap_dt) call s_adaptive_dt_bubble(1)
-
-        do s = 1, nstage
-            call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg, s)
-
-            if (s == 1) then
-                if (run_time_info) then
-                    if (igr .or. dummy) then
-                        call s_write_run_time_information(q_cons_ts(1)%vf, t_step)
-                    end if
-                    if (.not. igr .or. dummy) then
-                        call s_write_run_time_information(q_prim_vf, t_step)
-                    end if
-                end if
-
-                if (probe_wrt) then
-                    call s_time_step_cycling(t_step)
-                    call s_compute_derived_variables(t_step, q_cons_ts(1)%vf, q_prim_ts1, q_prim_ts2)
-                end if
-
-                if (cfl_dt) then
-                    if (mytime >= t_stop) return
-                else
-                    if (t_step == t_step_stop) return
-                end if
-            end if
-
-            if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=s)
-            $:GPU_PARALLEL_LOOP(collapse=4)
-            do i = 1, sys_size
-                do l = 0, p
-                    do k = 0, n
-                        do j = 0, m
-                            if (s == 1 .and. nstage > 1) then
-                                q_cons_ts(stor)%vf(i)%sf(j, k, l) = &
-                                    q_cons_ts(1)%vf(i)%sf(j, k, l)
-                            end if
-                            if (igr) then
-                                q_cons_ts(1)%vf(i)%sf(j, k, l) = &
-                                    (rk_coef(s, 1)*q_cons_ts(1)%vf(i)%sf(j, k, l) &
-                                     + rk_coef(s, 2)*q_cons_ts(stor)%vf(i)%sf(j, k, l) &
-                                     + rk_coef(s, 3)*rhs_vf(i)%sf(j, k, l))/rk_coef(s, 4)
-                            else
-                                q_cons_ts(1)%vf(i)%sf(j, k, l) = &
-                                    (rk_coef(s, 1)*q_cons_ts(1)%vf(i)%sf(j, k, l) &
-                                     + rk_coef(s, 2)*q_cons_ts(stor)%vf(i)%sf(j, k, l) &
-                                     + rk_coef(s, 3)*dt*rhs_vf(i)%sf(j, k, l))/rk_coef(s, 4)
-                            end if
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-            !Evolve pb and mv for non-polytropic qbmm
-            if (qbmm .and. (.not. polytropic)) then
-                $:GPU_PARALLEL_LOOP(collapse=5)
-                do i = 1, nb
-                    do l = 0, p
-                        do k = 0, n
-                            do j = 0, m
-                                do q = 1, nnode
-                                    if (s == 1 .and. nstage > 1) then
-                                        pb_ts(stor)%sf(j, k, l, q, i) = &
-                                            pb_ts(1)%sf(j, k, l, q, i)
-                                        mv_ts(stor)%sf(j, k, l, q, i) = &
-                                            mv_ts(1)%sf(j, k, l, q, i)
-                                    end if
-                                    pb_ts(1)%sf(j, k, l, q, i) = &
-                                        (rk_coef(s, 1)*pb_ts(1)%sf(j, k, l, q, i) &
-                                         + rk_coef(s, 2)*pb_ts(stor)%sf(j, k, l, q, i) &
-                                         + rk_coef(s, 3)*dt*rhs_pb(j, k, l, q, i))/rk_coef(s, 4)
-                                    mv_ts(1)%sf(j, k, l, q, i) = &
-                                        (rk_coef(s, 1)*mv_ts(1)%sf(j, k, l, q, i) &
-                                         + rk_coef(s, 2)*mv_ts(stor)%sf(j, k, l, q, i) &
-                                         + rk_coef(s, 3)*dt*rhs_mv(j, k, l, q, i))/rk_coef(s, 4)
-                                end do
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-            if (bodyForces) call s_apply_bodyforces(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, rk_coef(s, 3)*dt/rk_coef(s, 4))
-
-            if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(1)%vf)
-
-            if (model_eqns == 3 .and. (.not. relax)) then
-                call s_pressure_relaxation_procedure(q_cons_ts(1)%vf)
-            end if
-
-            if (adv_n) call s_comp_alpha_from_n(q_cons_ts(1)%vf)
-
-            if (ib) then
-                ! check if any IBMS are moving, and if so, update the markers, ghost points, levelsets, and levelset norms
-                if (moving_immersed_boundary_flag) then
-                    call s_propagate_immersed_boundaries(s)
-                end if
-
-                ! update the ghost fluid properties point values based on IB state
-                if (qbmm .and. .not. polytropic) then
-                    call s_ibm_correct_state(q_cons_ts(1)%vf, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf)
-                else
-                    call s_ibm_correct_state(q_cons_ts(1)%vf, q_prim_vf)
-                end if
-            end if
-
-        end do
-
-        if (moving_immersed_boundary_flag) call s_wrap_periodic_ibs()
-
-        ! Adaptive dt: final stage
-        if (adap_dt) call s_adaptive_dt_bubble(3)
-
-        call nvtxEndRange
-        call cpu_time(finish)
-
-        wall_time = abs(finish - start)
-
-        if (t_step >= 2) then
-            wall_time_avg = (wall_time + (t_step - 2)*wall_time_avg)/(t_step - 1)
-        else
-            wall_time_avg = 0._wp
-        end if
-
-    end subroutine s_tvd_rk
-
-    !> Bubble source part in Strang operator splitting scheme
-        !! @param stage Current time-stage
-    impure subroutine s_adaptive_dt_bubble(stage)
-
-        integer, intent(in) :: stage
-
-        type(vector_field) :: gm_alpha_qp
-
-        call s_convert_conservative_to_primitive_variables( &
-            q_cons_ts(1)%vf, &
-            q_T_sf, &
-            q_prim_vf, &
-            idwint)
-
-        if (bubbles_euler) then
-
-            call s_compute_bubble_EE_source(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, divu)
-            call s_comp_alpha_from_n(q_cons_ts(1)%vf)
-
-        elseif (bubbles_lagrange) then
-
-            call s_populate_variables_buffers(bc_type, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf)
-            call s_compute_bubble_EL_dynamics(q_prim_vf, stage)
-            call s_transfer_data_to_tmp()
-            call s_smear_voidfraction()
-            if (stage == 3) then
-                if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
-                if (lag_params%write_bubbles) then
-                    $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]')
-                    call s_write_lag_particles(mytime)
-                end if
-                call s_write_void_evol(mytime)
-            end if
-
-        end if
-
-    end subroutine s_adaptive_dt_bubble
-
-    !> @brief Computes the global time step size from CFL stability constraints across all cells.
-    impure subroutine s_compute_dt()
-
-        real(wp) :: rho        !< Cell-avg. density
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3) :: vel        !< Cell-avg. velocity
-            real(wp), dimension(3) :: alpha      !< Cell-avg. volume fraction
-        #:else
-            real(wp), dimension(num_vels) :: vel        !< Cell-avg. velocity
-            real(wp), dimension(num_fluids) :: alpha      !< Cell-avg. volume fraction
-        #:endif
-        real(wp) :: vel_sum    !< Cell-avg. velocity sum
-        real(wp) :: pres       !< Cell-avg. pressure
-        real(wp) :: gamma      !< Cell-avg. sp. heat ratio
-        real(wp) :: pi_inf     !< Cell-avg. liquid stiffness function
-        real(wp) :: qv         !< Cell-avg. fluid reference energy
-        real(wp) :: c          !< Cell-avg. sound speed
-        real(wp) :: H          !< Cell-avg. enthalpy
-        real(wp), dimension(2) :: Re         !< Cell-avg. Reynolds numbers
-        type(vector_field) :: gm_alpha_qp
-
-        real(wp) :: dt_local
-        integer :: j, k, l !< Generic loop iterators
-
-        if (.not. igr .or. dummy) then
-            call s_convert_conservative_to_primitive_variables( &
-                q_cons_ts(1)%vf, &
-                q_T_sf, &
-                q_prim_vf, &
-                idwint)
-        end if
-
-        $:GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re, rho, vel_sum, pres, gamma, pi_inf, c, H, qv]')
-        do l = 0, p
-            do k = 0, n
-                do j = 0, m
-                    if (igr) then
-                        call s_compute_enthalpy(q_cons_ts(1)%vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, qv, j, k, l)
-                    else
-                        call s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, qv, j, k, l)
-                    end if
-
-                    ! Compute mixture sound speed
-                    call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, alpha, vel_sum, 0._wp, c, qv)
-
-                    call s_compute_dt_from_cfl(vel, c, max_dt, rho, Re, j, k, l)
-                end do
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        #:call GPU_PARALLEL(copyout='[dt_local]', copyin='[max_dt]')
-            dt_local = minval(max_dt)
-        #:endcall GPU_PARALLEL
-
-        if (num_procs == 1) then
-            dt = dt_local
-        else
-            call s_mpi_allreduce_min(dt_local, dt)
-        end if
-
-        $:GPU_UPDATE(device='[dt]')
-
-    end subroutine s_compute_dt
-
-    !> This subroutine applies the body forces source term at each
-        !! Runge-Kutta stage
-        !! @param q_cons_vf Conservative variables
-        !! @param q_prim_vf_in Primitive variables
-        !! @param rhs_vf_in Right-hand side variables
-    subroutine s_apply_bodyforces(q_cons_vf, q_prim_vf_in, rhs_vf_in, ldt)
-
-        type(scalar_field), dimension(1:sys_size), intent(inout) :: q_cons_vf
-        type(scalar_field), dimension(1:sys_size), intent(in) :: q_prim_vf_in
-        type(scalar_field), dimension(1:sys_size), intent(inout) :: rhs_vf_in
-
-        real(wp), intent(in) :: ldt !< local dt
-
-        integer :: i, j, k, l
-
-        call nvtxStartRange("RHS-BODYFORCES")
-        call s_compute_body_forces_rhs(q_prim_vf_in, q_cons_vf, rhs_vf_in)
-
-        $:GPU_PARALLEL_LOOP(collapse=4)
-        do i = momxb, E_idx
-            do l = 0, p
-                do k = 0, n
-                    do j = 0, m
-                        q_cons_vf(i)%sf(j, k, l) = q_cons_vf(i)%sf(j, k, l) + &
-                                                   ldt*rhs_vf_in(i)%sf(j, k, l)
-                    end do
-                end do
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        call nvtxEndRange
-
-    end subroutine s_apply_bodyforces
-
-    !> @brief Updates immersed boundary positions and velocities at the current Runge-Kutta stage.
-    subroutine s_propagate_immersed_boundaries(s)
-
-        integer, intent(in) :: s
-        integer :: i
-        logical :: forces_computed
-
-        call nvtxStartRange("PROPAGATE-IMMERSED-BOUNDARIES")
-
-        forces_computed = .false.
-
-        do i = 1, num_ibs
-            if (s == 1) then
-                patch_ib(i)%step_vel = patch_ib(i)%vel
-                patch_ib(i)%step_angular_vel = patch_ib(i)%angular_vel
-                patch_ib(i)%step_angles = patch_ib(i)%angles
-                patch_ib(i)%step_x_centroid = patch_ib(i)%x_centroid
-                patch_ib(i)%step_y_centroid = patch_ib(i)%y_centroid
-                patch_ib(i)%step_z_centroid = patch_ib(i)%z_centroid
-            end if
-
-            if (patch_ib(i)%moving_ibm > 0) then
-                patch_ib(i)%vel = (rk_coef(s, 1)*patch_ib(i)%step_vel + rk_coef(s, 2)*patch_ib(i)%vel)/rk_coef(s, 4)
-                patch_ib(i)%angular_vel = (rk_coef(s, 1)*patch_ib(i)%step_angular_vel + rk_coef(s, 2)*patch_ib(i)%angular_vel)/rk_coef(s, 4)
-
-                if (patch_ib(i)%moving_ibm == 1) then
-                    ! plug in analytic velocities for 1-way coupling, if it exists
-                    @:mib_analytical()
-                else if (patch_ib(i)%moving_ibm == 2) then ! if we are using two-way coupling, apply force and torque
-                    ! compute the force and torque on the IB from the fluid
-                    if (.not. forces_computed) then
-                        call s_compute_ib_forces(q_prim_vf, fluid_pp)
-                        forces_computed = .true.
-                    end if
-
-                    ! update the velocity from the force value
-                    patch_ib(i)%vel = patch_ib(i)%vel + rk_coef(s, 3)*dt*(patch_ib(i)%force/patch_ib(i)%mass)/rk_coef(s, 4)
-
-                    ! update the angular velocity with the torque value
-                    patch_ib(i)%angular_vel = (patch_ib(i)%angular_vel*patch_ib(i)%moment) + (rk_coef(s, 3)*dt*patch_ib(i)%torque/rk_coef(s, 4)) ! add the torque to the angular momentum
-                    call s_compute_moment_of_inertia(i, patch_ib(i)%angular_vel) ! update the moment of inertia to be based on the direction of the angular momentum
-                    patch_ib(i)%angular_vel = patch_ib(i)%angular_vel/patch_ib(i)%moment ! convert back to angular velocity with the new moment of inertia
-                end if
-
-                ! Update the angle of the IB
-                patch_ib(i)%angles = (rk_coef(s, 1)*patch_ib(i)%step_angles + rk_coef(s, 2)*patch_ib(i)%angles + rk_coef(s, 3)*patch_ib(i)%angular_vel*dt)/rk_coef(s, 4)
-
-                ! Update the position of the IB
-                patch_ib(i)%x_centroid = (rk_coef(s, 1)*patch_ib(i)%step_x_centroid + rk_coef(s, 2)*patch_ib(i)%x_centroid + rk_coef(s, 3)*patch_ib(i)%vel(1)*dt)/rk_coef(s, 4)
-                patch_ib(i)%y_centroid = (rk_coef(s, 1)*patch_ib(i)%step_y_centroid + rk_coef(s, 2)*patch_ib(i)%y_centroid + rk_coef(s, 3)*patch_ib(i)%vel(2)*dt)/rk_coef(s, 4)
-                patch_ib(i)%z_centroid = (rk_coef(s, 1)*patch_ib(i)%step_z_centroid + rk_coef(s, 2)*patch_ib(i)%z_centroid + rk_coef(s, 3)*patch_ib(i)%vel(3)*dt)/rk_coef(s, 4)
-            end if
-        end do
-
-        call s_update_mib(num_ibs)
-
-        call nvtxEndRange
-
-    end subroutine s_propagate_immersed_boundaries
-
-    !> This subroutine saves the temporary q_prim_vf vector
-        !!      into the q_prim_ts vector that is then used in p_main
-        !! @param t_step current time-step
-    subroutine s_time_step_cycling(t_step)
-
-        integer, intent(in) :: t_step
-
-        integer :: i, j, k, l !< Generic loop iterator
-
-        if (t_step == t_step_start) then
-            $:GPU_PARALLEL_LOOP(collapse=4)
-            do i = 1, sys_size
-                do l = 0, p
-                    do k = 0, n
-                        do j = 0, m
-                            q_prim_ts2(2)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l)
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        elseif (t_step == t_step_start + 1) then
-            $:GPU_PARALLEL_LOOP(collapse=4)
-            do i = 1, sys_size
-                do l = 0, p
-                    do k = 0, n
-                        do j = 0, m
-                            q_prim_ts2(1)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l)
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        elseif (t_step == t_step_start + 2) then
-            $:GPU_PARALLEL_LOOP(collapse=4)
-            do i = 1, sys_size
-                do l = 0, p
-                    do k = 0, n
-                        do j = 0, m
-                            q_prim_ts1(2)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l)
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        elseif (t_step == t_step_start + 3) then
-            $:GPU_PARALLEL_LOOP(collapse=4)
-            do i = 1, sys_size
-                do l = 0, p
-                    do k = 0, n
-                        do j = 0, m
-                            q_prim_ts1(1)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l)
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        else ! All other timesteps
-            $:GPU_PARALLEL_LOOP(collapse=4)
-            do i = 1, sys_size
-                do l = 0, p
-                    do k = 0, n
-                        do j = 0, m
-                            q_prim_ts2(2)%vf(i)%sf(j, k, l) = q_prim_ts2(1)%vf(i)%sf(j, k, l)
-                            q_prim_ts2(1)%vf(i)%sf(j, k, l) = q_prim_ts1(2)%vf(i)%sf(j, k, l)
-                            q_prim_ts1(2)%vf(i)%sf(j, k, l) = q_prim_ts1(1)%vf(i)%sf(j, k, l)
-                            q_prim_ts1(1)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l)
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end if
-
-    end subroutine s_time_step_cycling
-
-    !> Module deallocation and/or disassociation procedures
-    impure subroutine s_finalize_time_steppers_module
-#ifdef FRONTIER_UNIFIED
-        use hipfort
-        use hipfort_hipmalloc
-        use hipfort_check
-#endif
-        integer :: i, j !< Generic loop iterators
-
-        ! Deallocating the cell-average conservative variables
-#if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
-        do j = 1, sys_size
-            @:DEALLOCATE(q_cons_ts(1)%vf(j)%sf)
-            if (num_ts == 2) then
-                if (nv_uvm_out_of_core) then
-                    nullify (q_cons_ts(2)%vf(j)%sf)
-                else
-                    @:DEALLOCATE(q_cons_ts(2)%vf(j)%sf)
-                end if
-            end if
-        end do
-        if (num_ts == 2 .and. nv_uvm_out_of_core) then
-            deallocate (q_cons_ts_pool_host)
-        end if
-#elif defined(FRONTIER_UNIFIED)
-        do i = 1, num_ts
-            do j = 1, sys_size
-                nullify (q_cons_ts(i)%vf(j)%sf)
-            end do
-        end do
-#ifdef MFC_MIXED_PRECISION
-        call hipCheck(hipHostFree_(c_loc(q_cons_ts_pool_host)))
-        nullify (q_cons_ts_pool_host)
-        call hipCheck(hipFree_(c_loc(q_cons_ts_pool_device)))
-        nullify (q_cons_ts_pool_device)
-#else
-        call hipCheck(hipHostFree(q_cons_ts_pool_host))
-        call hipCheck(hipFree(q_cons_ts_pool_device))
-#endif
-#else
-        do i = 1, num_ts
-            do j = 1, sys_size
-                @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf)
-            end do
-        end do
-#endif
-        do i = 1, num_ts
-            @:DEALLOCATE(q_cons_ts(i)%vf)
-        end do
-
-        @:DEALLOCATE(q_cons_ts)
-
-        ! Deallocating the cell-average primitive ts variables
-        if (probe_wrt) then
-            do i = 1, num_probe_ts
-                do j = 1, sys_size
-                    @:DEALLOCATE(q_prim_ts1(i)%vf(j)%sf,q_prim_ts2(i)%vf(j)%sf )
-                end do
-                @:DEALLOCATE(q_prim_ts1(i)%vf, q_prim_ts2(i)%vf)
-            end do
-            @:DEALLOCATE(q_prim_ts1, q_prim_ts2)
-        end if
-
-        if (.not. igr) then
-            ! Deallocating the cell-average primitive variables
-            do i = 1, adv_idx%end
-                @:DEALLOCATE(q_prim_vf(i)%sf)
-            end do
-
-            if (mhd) then
-                do i = B_idx%beg, B_idx%end
-                    @:DEALLOCATE(q_prim_vf(i)%sf)
-                end do
-            end if
-
-            if (elasticity) then
-                do i = stress_idx%beg, stress_idx%end
-                    @:DEALLOCATE(q_prim_vf(i)%sf)
-                end do
-            end if
-
-            if (hyperelasticity) then
-                do i = xibeg, xiend + 1
-                    @:DEALLOCATE(q_prim_vf(i)%sf)
-                end do
-            end if
-
-            if (cont_damage) then
-                @:DEALLOCATE(q_prim_vf(damage_idx)%sf)
-            end if
-
-            if (hyper_cleaning) then
-                @:DEALLOCATE(q_prim_vf(psi_idx)%sf)
-            end if
-
-            if (bubbles_euler) then
-                do i = bub_idx%beg, bub_idx%end
-                    @:DEALLOCATE(q_prim_vf(i)%sf)
-                end do
-            end if
-
-            if (model_eqns == 3) then
-                do i = internalEnergies_idx%beg, internalEnergies_idx%end
-                    @:DEALLOCATE(q_prim_vf(i)%sf)
-                end do
-            end if
-        end if
-
-        @:DEALLOCATE(q_prim_vf)
-
-        ! Deallocating the cell-average RHS variables
-        do i = 1, sys_size
-            @:DEALLOCATE(rhs_vf(i)%sf)
-        end do
-
-        @:DEALLOCATE(rhs_vf)
-
-        ! Writing the footer of and closing the run-time information file
-        if (proc_rank == 0 .and. run_time_info) then
-            call s_close_run_time_information_file()
-        end if
-
-    end subroutine s_finalize_time_steppers_module
-
-end module m_time_steppers
+!>
+!! @file
+!! @brief Contains module m_time_steppers
+
+#:include 'macros.fpp'
+#:include 'case.fpp'
+
+!> @brief Total-variation-diminishing (TVD) Runge--Kutta time integrators (1st-, 2nd-, and 3rd-order SSP)
+module m_time_steppers
+
+    use m_derived_types        !< Definitions of the derived types
+
+    use m_global_parameters    !< Definitions of the global parameters
+
+    use m_rhs                  !< Right-hane-side (RHS) evaluation procedures
+
+    use m_pressure_relaxation  !< Pressure relaxation procedures
+
+    use m_data_output          !< Run-time info & solution data output procedures
+
+    use m_bubbles_EE           !< Ensemble-averaged bubble dynamics routines
+
+    use m_bubbles_EL           !< Lagrange bubble dynamics routines
+
+    use m_ibm
+
+    use m_hyperelastic
+
+    use m_mpi_proxy            !< Message passing interface (MPI) module proxy
+
+    use m_boundary_common
+
+    use m_helper
+
+    use m_sim_helpers
+
+    use m_fftw
+
+    use m_nvtx
+
+    use m_thermochem, only: num_species
+
+    use m_body_forces
+
+    use m_derived_variables
+
+    use m_re_visc              !< Non-Newtonian viscosity computations
+
+    implicit none
+
+    type(vector_field), allocatable, dimension(:) :: q_cons_ts !<
+    !! Cell-average conservative variables at each time-stage (TS)
+
+    type(scalar_field), allocatable, dimension(:) :: q_prim_vf !<
+    !! Cell-average primitive variables at the current time-stage
+
+    type(scalar_field), allocatable, dimension(:) :: rhs_vf !<
+    !! Cell-average RHS variables at the current time-stage
+
+    type(integer_field), allocatable, dimension(:, :) :: bc_type !<
+    !! Boundary condition identifiers
+
+    type(vector_field), allocatable, dimension(:) :: q_prim_ts1, q_prim_ts2 !<
+    !! Cell-average primitive variables at consecutive TIMESTEPS
+
+    real(wp), allocatable, dimension(:, :, :, :, :) :: rhs_pb
+
+    type(scalar_field) :: q_T_sf !<
+    !! Cell-average temperature variables at the current time-stage
+
+    real(wp), allocatable, dimension(:, :, :, :, :) :: rhs_mv
+
+    real(wp), allocatable, dimension(:, :, :) :: max_dt
+
+    integer, private :: num_ts !<
+    !! Number of time stages in the time-stepping scheme
+
+    integer :: stor !< storage index
+    real(wp), allocatable, dimension(:, :) :: rk_coef
+    integer, private :: num_probe_ts
+
+    $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts1,q_prim_ts2,rhs_mv,rhs_pb,max_dt,rk_coef,stor,bc_type]')
+
+!> @cond
+#if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
+    real(stp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host
+#elif defined(FRONTIER_UNIFIED)
+    real(stp), pointer, contiguous, dimension(:, :, :, :) :: q_cons_ts_pool_host, q_cons_ts_pool_device
+    integer(kind=8) :: pool_dims(4), pool_starts(4)
+    integer(kind=8) :: pool_size
+    type(c_ptr) :: cptr_host, cptr_device
+#endif
+!> @endcond
+
+contains
+
+    !> The computation of parameters, the allocation of memory,
+        !!      the association of pointers and/or the execution of any
+        !!      other procedures that are necessary to setup the module.
+    impure subroutine s_initialize_time_steppers_module
+#ifdef FRONTIER_UNIFIED
+        use hipfort
+        use hipfort_hipmalloc
+        use hipfort_check
+#if defined(MFC_OpenACC)
+        use openacc
+#endif
+#endif
+        integer :: i, j !< Generic loop iterators
+
+        ! Setting number of time-stages for selected time-stepping scheme
+        if (time_stepper == 1) then
+            num_ts = 1
+        elseif (any(time_stepper == (/2, 3/))) then
+            num_ts = 2
+        end if
+
+        if (probe_wrt) then
+            num_probe_ts = 2
+        end if
+
+        ! Allocating the cell-average conservative variables
+        @:ALLOCATE(q_cons_ts(1:num_ts))
+        @:PREFER_GPU(q_cons_ts)
+
+        do i = 1, num_ts
+            @:ALLOCATE(q_cons_ts(i)%vf(1:sys_size))
+            @:PREFER_GPU(q_cons_ts(i)%vf)
+        end do
+
+!> @cond
+#if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
+        if (num_ts == 2 .and. nv_uvm_out_of_core) then
+            ! host allocation for q_cons_ts(2)%vf(j)%sf for all j
+            allocate (q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
+                                          idwbuff(2)%beg:idwbuff(2)%end, &
+                                          idwbuff(3)%beg:idwbuff(3)%end, &
+                                          1:sys_size))
+        end if
+
+        do j = 1, sys_size
+            ! q_cons_ts(1) lives on the device
+            @:ALLOCATE(q_cons_ts(1)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end))
+            @:PREFER_GPU(q_cons_ts(1)%vf(j)%sf)
+            if (num_ts == 2) then
+                if (nv_uvm_out_of_core) then
+                    ! q_cons_ts(2) lives on the host
+                    q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                                          idwbuff(2)%beg:idwbuff(2)%end, &
+                                          idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:, :, :, j)
+                else
+                    @:ALLOCATE(q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                    @:PREFER_GPU(q_cons_ts(2)%vf(j)%sf)
+                end if
+            end if
+        end do
+
+        do i = 1, num_ts
+            @:ACC_SETUP_VFs(q_cons_ts(i))
+        end do
+#elif defined(FRONTIER_UNIFIED)
+        ! Allocate to memory regions using hip calls
+        ! that we will attach pointers to
+        do i = 1, 3
+            pool_dims(i) = idwbuff(i)%end - idwbuff(i)%beg + 1
+            pool_starts(i) = idwbuff(i)%beg
+        end do
+        pool_dims(4) = sys_size
+        pool_starts(4) = 1
+#ifdef MFC_MIXED_PRECISION
+        pool_size = 1_8*(idwbuff(1)%end - idwbuff(1)%beg + 1)*(idwbuff(2)%end - idwbuff(2)%beg + 1)*(idwbuff(3)%end - idwbuff(3)%beg + 1)*sys_size
+        call hipCheck(hipMalloc_(cptr_device, pool_size*2_8))
+        call c_f_pointer(cptr_device, q_cons_ts_pool_device, shape=pool_dims)
+        q_cons_ts_pool_device(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:) => q_cons_ts_pool_device
+
+        call hipCheck(hipMallocManaged_(cptr_host, pool_size*2_8, hipMemAttachGlobal))
+        call c_f_pointer(cptr_host, q_cons_ts_pool_host, shape=pool_dims)
+        q_cons_ts_pool_host(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:) => q_cons_ts_pool_host
+#else
+        ! Doing hipMalloc then mapping should be most performant
+        call hipCheck(hipMalloc(q_cons_ts_pool_device, dims8=pool_dims, lbounds8=pool_starts))
+        ! Without this map CCE will still create a device copy, because it's silly like that
+#if defined(MFC_OpenACC)
+        call acc_map_data(q_cons_ts_pool_device, c_loc(q_cons_ts_pool_device), c_sizeof(q_cons_ts_pool_device))
+#endif
+        ! CCE see it can access this and will leave it on the host. It will stay on the host so long as HSA_XNACK=1
+        ! NOTE: WE CANNOT DO ATOMICS INTO THIS MEMORY. We have to change a property to use atomics here
+        ! Otherwise leaving this as fine-grained will actually help performance since it can't be cached in GPU L2
+        if (num_ts == 2) then
+            call hipCheck(hipMallocManaged(q_cons_ts_pool_host, dims8=pool_dims, lbounds8=pool_starts, flags=hipMemAttachGlobal))
+#if defined(MFC_OpenMP)
+            call hipCheck(hipMemAdvise(c_loc(q_cons_ts_pool_host), c_sizeof(q_cons_ts_pool_host), hipMemAdviseSetPreferredLocation, -1))
+#endif
+        end if
+#endif
+
+        do j = 1, sys_size
+            ! q_cons_ts(1) lives on the device
+            q_cons_ts(1)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                                  idwbuff(2)%beg:idwbuff(2)%end, &
+                                  idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_device(:, :, :, j)
+            if (num_ts == 2) then
+                ! q_cons_ts(2) lives on the host
+                q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                                      idwbuff(2)%beg:idwbuff(2)%end, &
+                                      idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:, :, :, j)
+            end if
+        end do
+
+        do i = 1, num_ts
+            @:ACC_SETUP_VFs(q_cons_ts(i))
+            do j = 1, sys_size
+                $:GPU_UPDATE(device='[q_cons_ts(i)%vf(j)]')
+            end do
+        end do
+#else
+!> @endcond
+        do i = 1, num_ts
+            do j = 1, sys_size
+                @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+            end do
+            @:ACC_SETUP_VFs(q_cons_ts(i))
+        end do
+!> @cond
+#endif
+!> @endcond
+
+        ! Allocating the cell-average primitive ts variables
+        if (probe_wrt) then
+            @:ALLOCATE(q_prim_ts1(1:num_probe_ts))
+
+            do i = 1, num_probe_ts
+                @:ALLOCATE(q_prim_ts1(i)%vf(1:sys_size))
+            end do
+
+            do i = 1, num_probe_ts
+                do j = 1, sys_size
+                    @:ALLOCATE(q_prim_ts1(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                end do
+                @:ACC_SETUP_VFs(q_prim_ts1(i))
+            end do
+
+            @:ALLOCATE(q_prim_ts2(1:num_probe_ts))
+
+            do i = 1, num_probe_ts
+                @:ALLOCATE(q_prim_ts2(i)%vf(1:sys_size))
+            end do
+
+            do i = 1, num_probe_ts
+                do j = 1, sys_size
+                    @:ALLOCATE(q_prim_ts2(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                end do
+                @:ACC_SETUP_VFs(q_prim_ts2(i))
+            end do
+        end if
+
+        ! Allocating the cell-average primitive variables
+        @:ALLOCATE(q_prim_vf(1:sys_size))
+
+        if (.not. igr) then
+            do i = 1, adv_idx%end
+                @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+                @:ACC_SETUP_SFs(q_prim_vf(i))
+            end do
+
+            if (bubbles_euler) then
+                do i = bub_idx%beg, bub_idx%end
+                    @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                    @:ACC_SETUP_SFs(q_prim_vf(i))
+                end do
+                if (adv_n) then
+                    @:ALLOCATE(q_prim_vf(n_idx)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                    @:ACC_SETUP_SFs(q_prim_vf(n_idx))
+                end if
+            end if
+
+            if (mhd) then
+                do i = B_idx%beg, B_idx%end
+                    @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                    @:ACC_SETUP_SFs(q_prim_vf(i))
+                end do
+            end if
+
+            if (elasticity) then
+                do i = stress_idx%beg, stress_idx%end
+                    @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                    @:ACC_SETUP_SFs(q_prim_vf(i))
+                end do
+            end if
+
+            if (hyperelasticity) then
+                do i = xibeg, xiend + 1
+                    @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                    @:ACC_SETUP_SFs(q_prim_vf(i))
+                end do
+            end if
+
+            if (cont_damage) then
+                @:ALLOCATE(q_prim_vf(damage_idx)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+                @:ACC_SETUP_SFs(q_prim_vf(damage_idx))
+            end if
+
+            if (hyper_cleaning) then
+                @:ALLOCATE(q_prim_vf(psi_idx)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+                @:ACC_SETUP_SFs(q_prim_vf(psi_idx))
+            end if
+
+            if (model_eqns == 3) then
+                do i = internalEnergies_idx%beg, internalEnergies_idx%end
+                    @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                    @:ACC_SETUP_SFs(q_prim_vf(i))
+                end do
+            end if
+
+            if (surface_tension) then
+                @:ALLOCATE(q_prim_vf(c_idx)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+                @:ACC_SETUP_SFs(q_prim_vf(c_idx))
+            end if
+
+            if (chemistry) then
+                do i = chemxb, chemxe
+                    @:ALLOCATE(q_prim_vf(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                    @:ACC_SETUP_SFs(q_prim_vf(i))
+                end do
+
+                @:ALLOCATE(q_T_sf%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+                @:ACC_SETUP_SFs(q_T_sf)
+            end if
+        end if
+
+        @:ALLOCATE(pb_ts(1:2))
+        !Initialize bubble variables pb and mv at all quadrature nodes for all R0 bins
+        if (qbmm .and. (.not. polytropic)) then
+            @:ALLOCATE(pb_ts(1)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end, 1:nnode, 1:nb))
+            @:ACC_SETUP_SFs(pb_ts(1))
+
+            @:ALLOCATE(pb_ts(2)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end, 1:nnode, 1:nb))
+            @:ACC_SETUP_SFs(pb_ts(2))
+
+            @:ALLOCATE(rhs_pb(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end, 1:nnode, 1:nb))
+        else if (qbmm .and. polytropic) then
+            @:ALLOCATE(pb_ts(1)%sf(idwbuff(1)%beg:idwbuff(1)%beg + 1, &
+                idwbuff(2)%beg:idwbuff(2)%beg + 1, &
+                idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb))
+            @:ACC_SETUP_SFs(pb_ts(1))
+
+            @:ALLOCATE(pb_ts(2)%sf(idwbuff(1)%beg:idwbuff(1)%beg + 1, &
+                idwbuff(2)%beg:idwbuff(2)%beg + 1, &
+                idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb))
+            @:ACC_SETUP_SFs(pb_ts(2))
+
+            @:ALLOCATE(rhs_pb(idwbuff(1)%beg:idwbuff(1)%beg + 1, &
+                idwbuff(2)%beg:idwbuff(2)%beg + 1, &
+                idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb))
+        else
+            @:ALLOCATE(pb_ts(1)%sf(0,0,0,0,0))
+            @:ACC_SETUP_SFs(pb_ts(1))
+
+            @:ALLOCATE(pb_ts(2)%sf(0,0,0,0,0))
+            @:ACC_SETUP_SFs(pb_ts(2))
+
+            @:ALLOCATE(rhs_pb(0,0,0,0,0))
+        end if
+
+        @:ALLOCATE(mv_ts(1:2))
+
+        if (qbmm .and. (.not. polytropic)) then
+            @:ALLOCATE(mv_ts(1)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end, 1:nnode, 1:nb))
+            @:ACC_SETUP_SFs(mv_ts(1))
+
+            @:ALLOCATE(mv_ts(2)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end, 1:nnode, 1:nb))
+            @:ACC_SETUP_SFs(mv_ts(2))
+
+            @:ALLOCATE(rhs_mv(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end, 1:nnode, 1:nb))
+
+        else if (qbmm .and. polytropic) then
+            @:ALLOCATE(mv_ts(1)%sf(idwbuff(1)%beg:idwbuff(1)%beg + 1, &
+                idwbuff(2)%beg:idwbuff(2)%beg + 1, &
+                idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb))
+            @:ACC_SETUP_SFs(mv_ts(1))
+
+            @:ALLOCATE(mv_ts(2)%sf(idwbuff(1)%beg:idwbuff(1)%beg + 1, &
+                idwbuff(2)%beg:idwbuff(2)%beg + 1, &
+                idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb))
+            @:ACC_SETUP_SFs(mv_ts(2))
+
+            @:ALLOCATE(rhs_mv(idwbuff(1)%beg:idwbuff(1)%beg + 1, &
+                idwbuff(2)%beg:idwbuff(2)%beg + 1, &
+                idwbuff(3)%beg:idwbuff(3)%beg + 1, 1:nnode, 1:nb))
+        else
+            @:ALLOCATE(mv_ts(1)%sf(0,0,0,0,0))
+            @:ACC_SETUP_SFs(mv_ts(1))
+
+            @:ALLOCATE(mv_ts(2)%sf(0,0,0,0,0))
+            @:ACC_SETUP_SFs(mv_ts(2))
+
+            @:ALLOCATE(rhs_mv(0,0,0,0,0))
+        end if
+
+        ! Allocating the cell-average RHS variables
+        @:ALLOCATE(rhs_vf(1:sys_size))
+        @:PREFER_GPU(rhs_vf)
+
+        if (igr) then
+            do i = 1, sys_size
+                @:ALLOCATE(rhs_vf(i)%sf(-1:m+1,-1:n+1,-1:p+1))
+                @:ACC_SETUP_SFs(rhs_vf(i))
+                @:PREFER_GPU(rhs_vf(i)%sf)
+            end do
+        else
+            do i = 1, sys_size
+                @:ALLOCATE(rhs_vf(i)%sf(0:m, 0:n, 0:p))
+                @:ACC_SETUP_SFs(rhs_vf(i))
+            end do
+        end if
+
+        ! Opening and writing the header of the run-time information file
+        if (proc_rank == 0 .and. run_time_info) then
+            call s_open_run_time_information_file()
+        end if
+
+        if (cfl_dt) then
+            @:ALLOCATE(max_dt(0:m, 0:n, 0:p))
+        end if
+
+        ! Allocating arrays to store the bc types
+        @:ALLOCATE(bc_type(1:num_dims,1:2))
+
+        @:ALLOCATE(bc_type(1,1)%sf(0:0,0:n,0:p))
+        @:ALLOCATE(bc_type(1,2)%sf(0:0,0:n,0:p))
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+            if (n > 0) then
+                @:ALLOCATE(bc_type(2,1)%sf(-buff_size:m+buff_size,0:0,0:p))
+                @:ALLOCATE(bc_type(2,2)%sf(-buff_size:m+buff_size,0:0,0:p))
+                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                    if (p > 0) then
+                        @:ALLOCATE(bc_type(3,1)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,0:0))
+                        @:ALLOCATE(bc_type(3,2)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,0:0))
+                    end if
+                #:endif
+            end if
+        #:endif
+
+        do i = 1, num_dims
+            do j = 1, 2
+                @:ACC_SETUP_SFs(bc_type(i,j))
+            end do
+        end do
+
+        if (any(time_stepper == (/1, 2, 3/))) then
+            ! temporary array index for TVD RK
+            if (time_stepper == 1) then
+                stor = 1
+            else
+                stor = 2
+            end if
+
+            ! TVD RK coefficients
+            @:ALLOCATE (rk_coef(time_stepper, 4))
+            if (time_stepper == 1) then
+                rk_coef(1, :) = (/1._wp, 0._wp, 1._wp, 1._wp/)
+            else if (time_stepper == 2) then
+                rk_coef(1, :) = (/1._wp, 0._wp, 1._wp, 1._wp/)
+                rk_coef(2, :) = (/1._wp, 1._wp, 1._wp, 2._wp/)
+            else if (time_stepper == 3) then
+                rk_coef(1, :) = (/1._wp, 0._wp, 1._wp, 1._wp/)
+                rk_coef(2, :) = (/1._wp, 3._wp, 1._wp, 4._wp/)
+                rk_coef(3, :) = (/2._wp, 1._wp, 2._wp, 3._wp/)
+            end if
+            $:GPU_UPDATE(device='[rk_coef, stor]')
+        end if
+
+    end subroutine s_initialize_time_steppers_module
+
+    !> @brief Advances the solution one full step using a TVD Runge-Kutta time integrator.
+    impure subroutine s_tvd_rk(t_step, time_avg, nstage)
+#ifdef _CRAYFTN
+        !DIR$ OPTIMIZE (-haggress)
+#endif
+        integer, intent(in) :: t_step
+        real(wp), intent(inout) :: time_avg
+        integer, intent(in) :: nstage
+
+        integer :: i, j, k, l, q, s !< Generic loop iterator
+        real(wp) :: start, finish
+        integer :: dest
+
+        call cpu_time(start)
+        call nvtxStartRange("TIMESTEP")
+
+        ! Adaptive dt: initial stage
+        if (adap_dt) call s_adaptive_dt_bubble(1)
+
+        do s = 1, nstage
+            call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg, s)
+
+            if (s == 1) then
+                if (run_time_info) then
+                    if (igr .or. dummy) then
+                        call s_write_run_time_information(q_cons_ts(1)%vf, t_step)
+                    end if
+                    if (.not. igr .or. dummy) then
+                        call s_write_run_time_information(q_prim_vf, t_step)
+                    end if
+                end if
+
+                if (probe_wrt) then
+                    call s_time_step_cycling(t_step)
+                    call s_compute_derived_variables(t_step, q_cons_ts(1)%vf, q_prim_ts1, q_prim_ts2)
+                end if
+
+                if (cfl_dt) then
+                    if (mytime >= t_stop) return
+                else
+                    if (t_step == t_step_stop) return
+                end if
+            end if
+
+            if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=s)
+            $:GPU_PARALLEL_LOOP(collapse=4)
+            do i = 1, sys_size
+                do l = 0, p
+                    do k = 0, n
+                        do j = 0, m
+                            if (s == 1 .and. nstage > 1) then
+                                q_cons_ts(stor)%vf(i)%sf(j, k, l) = &
+                                    q_cons_ts(1)%vf(i)%sf(j, k, l)
+                            end if
+                            if (igr) then
+                                q_cons_ts(1)%vf(i)%sf(j, k, l) = &
+                                    (rk_coef(s, 1)*q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                                     + rk_coef(s, 2)*q_cons_ts(stor)%vf(i)%sf(j, k, l) &
+                                     + rk_coef(s, 3)*rhs_vf(i)%sf(j, k, l))/rk_coef(s, 4)
+                            else
+                                q_cons_ts(1)%vf(i)%sf(j, k, l) = &
+                                    (rk_coef(s, 1)*q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                                     + rk_coef(s, 2)*q_cons_ts(stor)%vf(i)%sf(j, k, l) &
+                                     + rk_coef(s, 3)*dt*rhs_vf(i)%sf(j, k, l))/rk_coef(s, 4)
+                            end if
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+            !Evolve pb and mv for non-polytropic qbmm
+            if (qbmm .and. (.not. polytropic)) then
+                $:GPU_PARALLEL_LOOP(collapse=5)
+                do i = 1, nb
+                    do l = 0, p
+                        do k = 0, n
+                            do j = 0, m
+                                do q = 1, nnode
+                                    if (s == 1 .and. nstage > 1) then
+                                        pb_ts(stor)%sf(j, k, l, q, i) = &
+                                            pb_ts(1)%sf(j, k, l, q, i)
+                                        mv_ts(stor)%sf(j, k, l, q, i) = &
+                                            mv_ts(1)%sf(j, k, l, q, i)
+                                    end if
+                                    pb_ts(1)%sf(j, k, l, q, i) = &
+                                        (rk_coef(s, 1)*pb_ts(1)%sf(j, k, l, q, i) &
+                                         + rk_coef(s, 2)*pb_ts(stor)%sf(j, k, l, q, i) &
+                                         + rk_coef(s, 3)*dt*rhs_pb(j, k, l, q, i))/rk_coef(s, 4)
+                                    mv_ts(1)%sf(j, k, l, q, i) = &
+                                        (rk_coef(s, 1)*mv_ts(1)%sf(j, k, l, q, i) &
+                                         + rk_coef(s, 2)*mv_ts(stor)%sf(j, k, l, q, i) &
+                                         + rk_coef(s, 3)*dt*rhs_mv(j, k, l, q, i))/rk_coef(s, 4)
+                                end do
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+            if (bodyForces) call s_apply_bodyforces(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, rk_coef(s, 3)*dt/rk_coef(s, 4))
+
+            if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(1)%vf)
+
+            if (model_eqns == 3 .and. (.not. relax)) then
+                call s_pressure_relaxation_procedure(q_cons_ts(1)%vf)
+            end if
+
+            if (adv_n) call s_comp_alpha_from_n(q_cons_ts(1)%vf)
+
+            if (ib) then
+                ! check if any IBMS are moving, and if so, update the markers, ghost points, levelsets, and levelset norms
+                if (moving_immersed_boundary_flag) then
+                    call s_propagate_immersed_boundaries(s)
+                end if
+
+                ! update the ghost fluid properties point values based on IB state
+                if (qbmm .and. .not. polytropic) then
+                    call s_ibm_correct_state(q_cons_ts(1)%vf, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf)
+                else
+                    call s_ibm_correct_state(q_cons_ts(1)%vf, q_prim_vf)
+                end if
+            end if
+
+        end do
+
+        if (moving_immersed_boundary_flag) call s_wrap_periodic_ibs()
+
+        ! Adaptive dt: final stage
+        if (adap_dt) call s_adaptive_dt_bubble(3)
+
+        call nvtxEndRange
+        call cpu_time(finish)
+
+        wall_time = abs(finish - start)
+
+        if (t_step >= 2) then
+            wall_time_avg = (wall_time + (t_step - 2)*wall_time_avg)/(t_step - 1)
+        else
+            wall_time_avg = 0._wp
+        end if
+
+    end subroutine s_tvd_rk
+
+    !> Bubble source part in Strang operator splitting scheme
+        !! @param stage Current time-stage
+    impure subroutine s_adaptive_dt_bubble(stage)
+
+        integer, intent(in) :: stage
+
+        type(vector_field) :: gm_alpha_qp
+
+        call s_convert_conservative_to_primitive_variables( &
+            q_cons_ts(1)%vf, &
+            q_T_sf, &
+            q_prim_vf, &
+            idwint)
+
+        if (bubbles_euler) then
+
+            call s_compute_bubble_EE_source(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, divu)
+            call s_comp_alpha_from_n(q_cons_ts(1)%vf)
+
+        elseif (bubbles_lagrange) then
+
+            call s_populate_variables_buffers(bc_type, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf)
+            call s_compute_bubble_EL_dynamics(q_prim_vf, stage)
+            call s_transfer_data_to_tmp()
+            call s_smear_voidfraction()
+            if (stage == 3) then
+                if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats()
+                if (lag_params%write_bubbles) then
+                    $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]')
+                    call s_write_lag_particles(mytime)
+                end if
+                call s_write_void_evol(mytime)
+            end if
+
+        end if
+
+    end subroutine s_adaptive_dt_bubble
+
+    !> @brief Computes the global time step size from CFL stability constraints across all cells.
+    impure subroutine s_compute_dt()
+
+        real(wp) :: rho        !< Cell-avg. density
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: vel        !< Cell-avg. velocity
+            real(wp), dimension(3) :: alpha      !< Cell-avg. volume fraction
+            real(wp), dimension(3, 2) :: Re_visc_per_phase  !< Per-phase Re_visc
+        #:else
+            real(wp), dimension(num_vels) :: vel        !< Cell-avg. velocity
+            real(wp), dimension(num_fluids) :: alpha      !< Cell-avg. volume fraction
+            real(wp), dimension(num_fluids, 2) :: Re_visc_per_phase  !< Per-phase Re_visc
+        #:endif
+        real(wp) :: vel_sum    !< Cell-avg. velocity sum
+        real(wp) :: pres       !< Cell-avg. pressure
+        real(wp) :: gamma      !< Cell-avg. sp. heat ratio
+        real(wp) :: pi_inf     !< Cell-avg. liquid stiffness function
+        real(wp) :: qv         !< Cell-avg. fluid reference energy
+        real(wp) :: c          !< Cell-avg. sound speed
+        real(wp) :: H          !< Cell-avg. enthalpy
+        real(wp), dimension(2) :: Re         !< Cell-avg. Reynolds numbers
+        type(vector_field) :: gm_alpha_qp
+
+        real(wp) :: dt_local
+        integer :: j, k, l !< Generic loop iterators
+
+        if (.not. igr .or. dummy .or. any_non_newtonian) then
+            call s_convert_conservative_to_primitive_variables( &
+                q_cons_ts(1)%vf, &
+                q_T_sf, &
+                q_prim_vf, &
+                idwint)
+        end if
+
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re, Re_visc_per_phase, rho, vel_sum, pres, gamma, pi_inf, c, H, qv]')
+        do l = 0, p
+            do k = 0, n
+                do j = 0, m
+                    if (igr) then
+                        call s_compute_enthalpy(q_cons_ts(1)%vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, qv, j, k, l)
+                    else
+                        call s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, qv, j, k, l)
+                    end if
+
+                    ! For non-Newtonian fluids, compute variable Re based on shear rate
+                    ! Always use q_prim_vf (velocities), not q_cons (momenta)
+                    if (any_non_newtonian) then
+                        call s_compute_re_visc(q_prim_vf, alpha, j, k, l, Re_visc_per_phase)
+                        call s_compute_mixture_re(alpha, Re_visc_per_phase, Re)
+                    end if
+
+                    ! Compute mixture sound speed
+                    call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, alpha, vel_sum, 0._wp, c, qv)
+
+                    call s_compute_dt_from_cfl(vel, c, max_dt, rho, Re, j, k, l)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        #:call GPU_PARALLEL(copyout='[dt_local]', copyin='[max_dt]')
+            dt_local = minval(max_dt)
+        #:endcall GPU_PARALLEL
+
+        if (num_procs == 1) then
+            dt = dt_local
+        else
+            call s_mpi_allreduce_min(dt_local, dt)
+        end if
+
+        $:GPU_UPDATE(device='[dt]')
+
+    end subroutine s_compute_dt
+
+    !> This subroutine applies the body forces source term at each
+        !! Runge-Kutta stage
+        !! @param q_cons_vf Conservative variables
+        !! @param q_prim_vf_in Primitive variables
+        !! @param rhs_vf_in Right-hand side variables
+    subroutine s_apply_bodyforces(q_cons_vf, q_prim_vf_in, rhs_vf_in, ldt)
+
+        type(scalar_field), dimension(1:sys_size), intent(inout) :: q_cons_vf
+        type(scalar_field), dimension(1:sys_size), intent(in) :: q_prim_vf_in
+        type(scalar_field), dimension(1:sys_size), intent(inout) :: rhs_vf_in
+
+        real(wp), intent(in) :: ldt !< local dt
+
+        integer :: i, j, k, l
+
+        call nvtxStartRange("RHS-BODYFORCES")
+        call s_compute_body_forces_rhs(q_prim_vf_in, q_cons_vf, rhs_vf_in)
+
+        $:GPU_PARALLEL_LOOP(collapse=4)
+        do i = momxb, E_idx
+            do l = 0, p
+                do k = 0, n
+                    do j = 0, m
+                        q_cons_vf(i)%sf(j, k, l) = q_cons_vf(i)%sf(j, k, l) + &
+                                                   ldt*rhs_vf_in(i)%sf(j, k, l)
+                    end do
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        call nvtxEndRange
+
+    end subroutine s_apply_bodyforces
+
+    !> @brief Updates immersed boundary positions and velocities at the current Runge-Kutta stage.
+    subroutine s_propagate_immersed_boundaries(s)
+
+        integer, intent(in) :: s
+        integer :: i
+        logical :: forces_computed
+
+        call nvtxStartRange("PROPAGATE-IMMERSED-BOUNDARIES")
+
+        forces_computed = .false.
+
+        do i = 1, num_ibs
+            if (s == 1) then
+                patch_ib(i)%step_vel = patch_ib(i)%vel
+                patch_ib(i)%step_angular_vel = patch_ib(i)%angular_vel
+                patch_ib(i)%step_angles = patch_ib(i)%angles
+                patch_ib(i)%step_x_centroid = patch_ib(i)%x_centroid
+                patch_ib(i)%step_y_centroid = patch_ib(i)%y_centroid
+                patch_ib(i)%step_z_centroid = patch_ib(i)%z_centroid
+            end if
+
+            if (patch_ib(i)%moving_ibm > 0) then
+                patch_ib(i)%vel = (rk_coef(s, 1)*patch_ib(i)%step_vel + rk_coef(s, 2)*patch_ib(i)%vel)/rk_coef(s, 4)
+                patch_ib(i)%angular_vel = (rk_coef(s, 1)*patch_ib(i)%step_angular_vel + rk_coef(s, 2)*patch_ib(i)%angular_vel)/rk_coef(s, 4)
+
+                if (patch_ib(i)%moving_ibm == 1) then
+                    ! plug in analytic velocities for 1-way coupling, if it exists
+                    @:mib_analytical()
+                else if (patch_ib(i)%moving_ibm == 2) then ! if we are using two-way coupling, apply force and torque
+                    ! compute the force and torque on the IB from the fluid
+                    if (.not. forces_computed) then
+                        call s_compute_ib_forces(q_prim_vf, fluid_pp)
+                        forces_computed = .true.
+                    end if
+
+                    ! update the velocity from the force value
+                    patch_ib(i)%vel = patch_ib(i)%vel + rk_coef(s, 3)*dt*(patch_ib(i)%force/patch_ib(i)%mass)/rk_coef(s, 4)
+
+                    ! update the angular velocity with the torque value
+                    patch_ib(i)%angular_vel = (patch_ib(i)%angular_vel*patch_ib(i)%moment) + (rk_coef(s, 3)*dt*patch_ib(i)%torque/rk_coef(s, 4)) ! add the torque to the angular momentum
+                    call s_compute_moment_of_inertia(i, patch_ib(i)%angular_vel) ! update the moment of inertia to be based on the direction of the angular momentum
+                    patch_ib(i)%angular_vel = patch_ib(i)%angular_vel/patch_ib(i)%moment ! convert back to angular velocity with the new moment of inertia
+                end if
+
+                ! Update the angle of the IB
+                patch_ib(i)%angles = (rk_coef(s, 1)*patch_ib(i)%step_angles + rk_coef(s, 2)*patch_ib(i)%angles + rk_coef(s, 3)*patch_ib(i)%angular_vel*dt)/rk_coef(s, 4)
+
+                ! Update the position of the IB
+                patch_ib(i)%x_centroid = (rk_coef(s, 1)*patch_ib(i)%step_x_centroid + rk_coef(s, 2)*patch_ib(i)%x_centroid + rk_coef(s, 3)*patch_ib(i)%vel(1)*dt)/rk_coef(s, 4)
+                patch_ib(i)%y_centroid = (rk_coef(s, 1)*patch_ib(i)%step_y_centroid + rk_coef(s, 2)*patch_ib(i)%y_centroid + rk_coef(s, 3)*patch_ib(i)%vel(2)*dt)/rk_coef(s, 4)
+                patch_ib(i)%z_centroid = (rk_coef(s, 1)*patch_ib(i)%step_z_centroid + rk_coef(s, 2)*patch_ib(i)%z_centroid + rk_coef(s, 3)*patch_ib(i)%vel(3)*dt)/rk_coef(s, 4)
+            end if
+        end do
+
+        call s_update_mib(num_ibs)
+
+        call nvtxEndRange
+
+    end subroutine s_propagate_immersed_boundaries
+
+    !> This subroutine saves the temporary q_prim_vf vector
+        !!      into the q_prim_ts vector that is then used in p_main
+        !! @param t_step current time-step
+    subroutine s_time_step_cycling(t_step)
+
+        integer, intent(in) :: t_step
+
+        integer :: i, j, k, l !< Generic loop iterator
+
+        if (t_step == t_step_start) then
+            $:GPU_PARALLEL_LOOP(collapse=4)
+            do i = 1, sys_size
+                do l = 0, p
+                    do k = 0, n
+                        do j = 0, m
+                            q_prim_ts2(2)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l)
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        elseif (t_step == t_step_start + 1) then
+            $:GPU_PARALLEL_LOOP(collapse=4)
+            do i = 1, sys_size
+                do l = 0, p
+                    do k = 0, n
+                        do j = 0, m
+                            q_prim_ts2(1)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l)
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        elseif (t_step == t_step_start + 2) then
+            $:GPU_PARALLEL_LOOP(collapse=4)
+            do i = 1, sys_size
+                do l = 0, p
+                    do k = 0, n
+                        do j = 0, m
+                            q_prim_ts1(2)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l)
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        elseif (t_step == t_step_start + 3) then
+            $:GPU_PARALLEL_LOOP(collapse=4)
+            do i = 1, sys_size
+                do l = 0, p
+                    do k = 0, n
+                        do j = 0, m
+                            q_prim_ts1(1)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l)
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        else ! All other timesteps
+            $:GPU_PARALLEL_LOOP(collapse=4)
+            do i = 1, sys_size
+                do l = 0, p
+                    do k = 0, n
+                        do j = 0, m
+                            q_prim_ts2(2)%vf(i)%sf(j, k, l) = q_prim_ts2(1)%vf(i)%sf(j, k, l)
+                            q_prim_ts2(1)%vf(i)%sf(j, k, l) = q_prim_ts1(2)%vf(i)%sf(j, k, l)
+                            q_prim_ts1(2)%vf(i)%sf(j, k, l) = q_prim_ts1(1)%vf(i)%sf(j, k, l)
+                            q_prim_ts1(1)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l)
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        end if
+
+    end subroutine s_time_step_cycling
+
+    !> Module deallocation and/or disassociation procedures
+    impure subroutine s_finalize_time_steppers_module
+#ifdef FRONTIER_UNIFIED
+        use hipfort
+        use hipfort_hipmalloc
+        use hipfort_check
+#endif
+        integer :: i, j !< Generic loop iterators
+
+        ! Deallocating the cell-average conservative variables
+#if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
+        do j = 1, sys_size
+            @:DEALLOCATE(q_cons_ts(1)%vf(j)%sf)
+            if (num_ts == 2) then
+                if (nv_uvm_out_of_core) then
+                    nullify (q_cons_ts(2)%vf(j)%sf)
+                else
+                    @:DEALLOCATE(q_cons_ts(2)%vf(j)%sf)
+                end if
+            end if
+        end do
+        if (num_ts == 2 .and. nv_uvm_out_of_core) then
+            deallocate (q_cons_ts_pool_host)
+        end if
+#elif defined(FRONTIER_UNIFIED)
+        do i = 1, num_ts
+            do j = 1, sys_size
+                nullify (q_cons_ts(i)%vf(j)%sf)
+            end do
+        end do
+#ifdef MFC_MIXED_PRECISION
+        call hipCheck(hipHostFree_(c_loc(q_cons_ts_pool_host)))
+        nullify (q_cons_ts_pool_host)
+        call hipCheck(hipFree_(c_loc(q_cons_ts_pool_device)))
+        nullify (q_cons_ts_pool_device)
+#else
+        call hipCheck(hipHostFree(q_cons_ts_pool_host))
+        call hipCheck(hipFree(q_cons_ts_pool_device))
+#endif
+#else
+        do i = 1, num_ts
+            do j = 1, sys_size
+                @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf)
+            end do
+        end do
+#endif
+        do i = 1, num_ts
+            @:DEALLOCATE(q_cons_ts(i)%vf)
+        end do
+
+        @:DEALLOCATE(q_cons_ts)
+
+        ! Deallocating the cell-average primitive ts variables
+        if (probe_wrt) then
+            do i = 1, num_probe_ts
+                do j = 1, sys_size
+                    @:DEALLOCATE(q_prim_ts1(i)%vf(j)%sf,q_prim_ts2(i)%vf(j)%sf )
+                end do
+                @:DEALLOCATE(q_prim_ts1(i)%vf, q_prim_ts2(i)%vf)
+            end do
+            @:DEALLOCATE(q_prim_ts1, q_prim_ts2)
+        end if
+
+        if (.not. igr) then
+            ! Deallocating the cell-average primitive variables
+            do i = 1, adv_idx%end
+                @:DEALLOCATE(q_prim_vf(i)%sf)
+            end do
+
+            if (mhd) then
+                do i = B_idx%beg, B_idx%end
+                    @:DEALLOCATE(q_prim_vf(i)%sf)
+                end do
+            end if
+
+            if (elasticity) then
+                do i = stress_idx%beg, stress_idx%end
+                    @:DEALLOCATE(q_prim_vf(i)%sf)
+                end do
+            end if
+
+            if (hyperelasticity) then
+                do i = xibeg, xiend + 1
+                    @:DEALLOCATE(q_prim_vf(i)%sf)
+                end do
+            end if
+
+            if (cont_damage) then
+                @:DEALLOCATE(q_prim_vf(damage_idx)%sf)
+            end if
+
+            if (hyper_cleaning) then
+                @:DEALLOCATE(q_prim_vf(psi_idx)%sf)
+            end if
+
+            if (bubbles_euler) then
+                do i = bub_idx%beg, bub_idx%end
+                    @:DEALLOCATE(q_prim_vf(i)%sf)
+                end do
+            end if
+
+            if (model_eqns == 3) then
+                do i = internalEnergies_idx%beg, internalEnergies_idx%end
+                    @:DEALLOCATE(q_prim_vf(i)%sf)
+                end do
+            end if
+        end if
+
+        @:DEALLOCATE(q_prim_vf)
+
+        ! Deallocating the cell-average RHS variables
+        do i = 1, sys_size
+            @:DEALLOCATE(rhs_vf(i)%sf)
+        end do
+
+        @:DEALLOCATE(rhs_vf)
+
+        ! Writing the footer of and closing the run-time information file
+        if (proc_rank == 0 .and. run_time_info) then
+            call s_close_run_time_information_file()
+        end if
+
+    end subroutine s_finalize_time_steppers_module
+
+end module m_time_steppers
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 6b1d9dbdf2..23e3b9e8fa 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -1,1622 +1,1578 @@
-!>
-!! @file
-!! @brief Contains module m_viscous
-#:include 'case.fpp'
-#:include 'macros.fpp'
-
-!> @brief Computes viscous stress tensors and diffusive flux contributions for the Navier--Stokes equations
-module m_viscous
-
-    use m_derived_types        !< Definitions of the derived types
-
-    use m_global_parameters    !< Definitions of the global parameters
-
-    use m_weno
-
-    use m_muscl                !< Monotonic Upstream-centered (MUSCL)
-                               !! schemes for conservation laws
-
-    use m_helper
-
-    use m_finite_differences
-
-    private; public s_get_viscous, &
- s_compute_viscous_stress_cylindrical_boundary, &
- s_initialize_viscous_module, &
- s_reconstruct_cell_boundary_values_visc_deriv, &
- s_finalize_viscous_module, &
- s_compute_viscous_stress_tensor
-
-    type(int_bounds_info) :: iv
-    type(int_bounds_info) :: is1_viscous, is2_viscous, is3_viscous
-    $:GPU_DECLARE(create='[is1_viscous,is2_viscous,is3_viscous,iv]')
-
-    real(wp), allocatable, dimension(:, :) :: Res_viscous
-    $:GPU_DECLARE(create='[Res_viscous]')
-
-contains
-
-    !> @brief Allocates and populates the viscous Reynolds number arrays and transfers data to the GPU.
-    impure subroutine s_initialize_viscous_module
-
-        integer :: i, j !< generic loop iterators
-
-        @:ALLOCATE(Res_viscous(1:2, 1:Re_size_max))
-
-        do i = 1, 2
-            do j = 1, Re_size(i)
-                Res_viscous(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
-            end do
-        end do
-        $:GPU_UPDATE(device='[Res_viscous,Re_idx,Re_size]')
-        $:GPU_ENTER_DATA(copyin='[is1_viscous,is2_viscous,is3_viscous,iv]')
-
-    end subroutine s_initialize_viscous_module
-
-    !> The purpose of this subroutine is to compute the viscous
-    !      stress tensor for the cells directly next to the axis in
-    !      cylindrical coordinates. This is necessary to avoid the
-    !      1/r singularity that arises at the cell boundary coinciding
-    !      with the axis, i.e., y_cb(-1) = 0.
-    !  @param q_prim_vf Cell-average primitive variables
-    !  @param grad_x_vf Cell-average primitive variable derivatives, x-dir
-    !  @param grad_y_vf Cell-average primitive variable derivatives, y-dir
-    !  @param grad_z_vf Cell-average primitive variable derivatives, z-dir
-    subroutine s_compute_viscous_stress_cylindrical_boundary(q_prim_vf, grad_x_vf, grad_y_vf, grad_z_vf, &
-                                                             tau_Re_vf, &
-                                                             ix, iy, iz)
-
-        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
-        type(scalar_field), dimension(num_dims), intent(in) :: grad_x_vf, grad_y_vf, grad_z_vf
-        type(scalar_field), dimension(1:sys_size), intent(inout) :: tau_Re_vf
-        type(int_bounds_info), intent(in) :: ix, iy, iz
-
-        real(wp) :: rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum  !< Mixture variables
-        real(wp), dimension(2) :: Re_visc
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(3) :: alpha_visc, alpha_rho_visc
-            real(wp), dimension(3, 3) :: tau_Re
-        #:else
-            real(wp), dimension(num_fluids) :: alpha_visc, alpha_rho_visc
-            real(wp), dimension(num_dims, num_dims) :: tau_Re
-        #:endif
-
-        integer :: i, j, k, l, q !< Generic loop iterator
-
-        is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
-
-        $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]')
-
-        $:GPU_PARALLEL_LOOP(collapse=3)
-        do l = is3_viscous%beg, is3_viscous%end
-            do k = is2_viscous%beg, is2_viscous%end
-                do j = is1_viscous%beg, is1_viscous%end
-                    $:GPU_LOOP(parallelism='[seq]')
-                    do i = momxb, E_idx
-                        tau_Re_vf(i)%sf(j, k, l) = 0._wp
-                    end do
-                end do
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-            if (shear_stress) then    ! Shear stresses
-                $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum ,alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
-                do l = is3_viscous%beg, is3_viscous%end
-                    do k = -1, 1
-                        do j = is1_viscous%beg, is1_viscous%end
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
-                                if (bubbles_euler .and. num_fluids == 1) then
-                                    alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l)
-                                else
-                                    alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
-                                end if
-                            end do
-
-                            if (bubbles_euler) then
-                                rho_visc = 0._wp
-                                gamma_visc = 0._wp
-                                pi_inf_visc = 0._wp
-
-                                if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        rho_visc = rho_visc + alpha_rho_visc(i)
-                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                    end do
-                                else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids - 1
-                                        rho_visc = rho_visc + alpha_rho_visc(i)
-                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                    end do
-                                else
-                                    rho_visc = alpha_rho_visc(1)
-                                    gamma_visc = gammas(1)
-                                    pi_inf_visc = pi_infs(1)
-                                end if
-                            else
-                                rho_visc = 0._wp
-                                gamma_visc = 0._wp
-                                pi_inf_visc = 0._wp
-
-                                alpha_visc_sum = 0._wp
-
-                                if (mpp_lim) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
-                                        alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
-                                        alpha_visc_sum = alpha_visc_sum + alpha_visc(i)
-                                    end do
-
-                                    alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps)
-
-                                end if
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    rho_visc = rho_visc + alpha_rho_visc(i)
-                                    gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                    pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                end do
-
-                                if (viscous) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, 2
-                                        Re_visc(i) = dflt_real
-
-                                        if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do q = 1, Re_size(i)
-                                            Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
-                                                         + Re_visc(i)
-                                        end do
-
-                                        Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps)
-
-                                    end do
-                                end if
-                            end if
-
-                            tau_Re(2, 1) = (grad_y_vf(1)%sf(j, k, l) + &
-                                            grad_x_vf(2)%sf(j, k, l))/ &
-                                           Re_visc(1)
-
-                            tau_Re(2, 2) = (4._wp*grad_y_vf(2)%sf(j, k, l) &
-                                            - 2._wp*grad_x_vf(1)%sf(j, k, l) &
-                                            - 2._wp*q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ &
-                                           (3._wp*Re_visc(1))
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, 2
-                                tau_Re_vf(contxe + i)%sf(j, k, l) = &
-                                    tau_Re_vf(contxe + i)%sf(j, k, l) - &
-                                    tau_Re(2, i)
-
-                                tau_Re_vf(E_idx)%sf(j, k, l) = &
-                                    tau_Re_vf(E_idx)%sf(j, k, l) - &
-                                    q_prim_vf(contxe + i)%sf(j, k, l)*tau_Re(2, i)
-                            end do
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-        #:endif
-
-        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-            if (bulk_stress) then    ! Bulk stresses
-                $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum ,alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
-                do l = is3_viscous%beg, is3_viscous%end
-                    do k = -1, 1
-                        do j = is1_viscous%beg, is1_viscous%end
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
-                                if (bubbles_euler .and. num_fluids == 1) then
-                                    alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l)
-                                else
-                                    alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
-                                end if
-                            end do
-
-                            if (bubbles_euler) then
-                                rho_visc = 0._wp
-                                gamma_visc = 0._wp
-                                pi_inf_visc = 0._wp
-
-                                if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        rho_visc = rho_visc + alpha_rho_visc(i)
-                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                    end do
-                                else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids - 1
-                                        rho_visc = rho_visc + alpha_rho_visc(i)
-                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                    end do
-                                else
-                                    rho_visc = alpha_rho_visc(1)
-                                    gamma_visc = gammas(1)
-                                    pi_inf_visc = pi_infs(1)
-                                end if
-                            else
-                                rho_visc = 0._wp
-                                gamma_visc = 0._wp
-                                pi_inf_visc = 0._wp
-
-                                alpha_visc_sum = 0._wp
-
-                                if (mpp_lim) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
-                                        alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
-                                        alpha_visc_sum = alpha_visc_sum + alpha_visc(i)
-                                    end do
-
-                                    alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps)
-
-                                end if
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    rho_visc = rho_visc + alpha_rho_visc(i)
-                                    gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                    pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                end do
-
-                                if (viscous) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, 2
-                                        Re_visc(i) = dflt_real
-
-                                        if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do q = 1, Re_size(i)
-                                            Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
-                                                         + Re_visc(i)
-                                        end do
-
-                                        Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps)
-
-                                    end do
-                                end if
-                            end if
-
-                            tau_Re(2, 2) = (grad_x_vf(1)%sf(j, k, l) + &
-                                            grad_y_vf(2)%sf(j, k, l) + &
-                                            q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ &
-                                           Re_visc(2)
-
-                            tau_Re_vf(momxb + 1)%sf(j, k, l) = &
-                                tau_Re_vf(momxb + 1)%sf(j, k, l) - &
-                                tau_Re(2, 2)
-
-                            tau_Re_vf(E_idx)%sf(j, k, l) = &
-                                tau_Re_vf(E_idx)%sf(j, k, l) - &
-                                q_prim_vf(momxb + 1)%sf(j, k, l)*tau_Re(2, 2)
-
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-        #:endif
-
-        if (p == 0) return
-        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-
-            if (shear_stress) then    ! Shear stresses
-                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
-                do l = is3_viscous%beg, is3_viscous%end
-                    do k = -1, 1
-                        do j = is1_viscous%beg, is1_viscous%end
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
-                                if (bubbles_euler .and. num_fluids == 1) then
-                                    alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l)
-                                else
-                                    alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
-                                end if
-                            end do
-
-                            if (bubbles_euler) then
-                                rho_visc = 0._wp
-                                gamma_visc = 0._wp
-                                pi_inf_visc = 0._wp
-
-                                if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        rho_visc = rho_visc + alpha_rho_visc(i)
-                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                    end do
-                                else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids - 1
-                                        rho_visc = rho_visc + alpha_rho_visc(i)
-                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                    end do
-                                else
-                                    rho_visc = alpha_rho_visc(1)
-                                    gamma_visc = gammas(1)
-                                    pi_inf_visc = pi_infs(1)
-                                end if
-                            else
-                                rho_visc = 0._wp
-                                gamma_visc = 0._wp
-                                pi_inf_visc = 0._wp
-
-                                alpha_visc_sum = 0._wp
-
-                                if (mpp_lim) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
-                                        alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
-                                        alpha_visc_sum = alpha_visc_sum + alpha_visc(i)
-                                    end do
-
-                                    alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps)
-
-                                end if
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    rho_visc = rho_visc + alpha_rho_visc(i)
-                                    gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                    pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                end do
-
-                                if (viscous) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, 2
-                                        Re_visc(i) = dflt_real
-
-                                        if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do q = 1, Re_size(i)
-                                            Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
-                                                         + Re_visc(i)
-                                        end do
-
-                                        Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps)
-
-                                    end do
-                                end if
-                            end if
-
-                            tau_Re(2, 2) = -(2._wp/3._wp)*grad_z_vf(3)%sf(j, k, l)/y_cc(k)/ &
-                                           Re_visc(1)
-
-                            tau_Re(2, 3) = ((grad_z_vf(2)%sf(j, k, l) - &
-                                             q_prim_vf(momxe)%sf(j, k, l))/ &
-                                            y_cc(k) + grad_y_vf(3)%sf(j, k, l))/ &
-                                           Re_visc(1)
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 2, 3
-                                tau_Re_vf(contxe + i)%sf(j, k, l) = &
-                                    tau_Re_vf(contxe + i)%sf(j, k, l) - &
-                                    tau_Re(2, i)
-
-                                tau_Re_vf(E_idx)%sf(j, k, l) = &
-                                    tau_Re_vf(E_idx)%sf(j, k, l) - &
-                                    q_prim_vf(contxe + i)%sf(j, k, l)*tau_Re(2, i)
-                            end do
-
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-
-            if (bulk_stress) then    ! Bulk stresses
-                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
-                do l = is3_viscous%beg, is3_viscous%end
-                    do k = -1, 1
-                        do j = is1_viscous%beg, is1_viscous%end
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
-                                if (bubbles_euler .and. num_fluids == 1) then
-                                    alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l)
-                                else
-                                    alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
-                                end if
-                            end do
-
-                            if (bubbles_euler) then
-                                rho_visc = 0._wp
-                                gamma_visc = 0._wp
-                                pi_inf_visc = 0._wp
-
-                                if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        rho_visc = rho_visc + alpha_rho_visc(i)
-                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                    end do
-                                else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids - 1
-                                        rho_visc = rho_visc + alpha_rho_visc(i)
-                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                    end do
-                                else
-                                    rho_visc = alpha_rho_visc(1)
-                                    gamma_visc = gammas(1)
-                                    pi_inf_visc = pi_infs(1)
-                                end if
-                            else
-                                rho_visc = 0._wp
-                                gamma_visc = 0._wp
-                                pi_inf_visc = 0._wp
-
-                                alpha_visc_sum = 0._wp
-
-                                if (mpp_lim) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids
-                                        alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
-                                        alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
-                                        alpha_visc_sum = alpha_visc_sum + alpha_visc(i)
-                                    end do
-
-                                    alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps)
-
-                                end if
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    rho_visc = rho_visc + alpha_rho_visc(i)
-                                    gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                    pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                end do
-
-                                if (viscous) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, 2
-                                        Re_visc(i) = dflt_real
-
-                                        if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                        $:GPU_LOOP(parallelism='[seq]')
-                                        do q = 1, Re_size(i)
-                                            Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
-                                                         + Re_visc(i)
-                                        end do
-
-                                        Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps)
-
-                                    end do
-                                end if
-                            end if
-
-                            tau_Re(2, 2) = grad_z_vf(3)%sf(j, k, l)/y_cc(k)/ &
-                                           Re_visc(2)
-
-                            tau_Re_vf(momxb + 1)%sf(j, k, l) = &
-                                tau_Re_vf(momxb + 1)%sf(j, k, l) - &
-                                tau_Re(2, 2)
-
-                            tau_Re_vf(E_idx)%sf(j, k, l) = &
-                                tau_Re_vf(E_idx)%sf(j, k, l) - &
-                                q_prim_vf(momxb + 1)%sf(j, k, l)*tau_Re(2, 2)
-
-                        end do
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-        #:endif
-    end subroutine s_compute_viscous_stress_cylindrical_boundary
-
-    !>  Computes viscous terms
-    !!  @param qL_prim_rsx_vf Left reconstructed primitive variables in x
-    !!  @param qL_prim_rsy_vf Left reconstructed primitive variables in y
-    !!  @param qL_prim_rsz_vf Left reconstructed primitive variables in z
-    !!  @param dqL_prim_dx_n Left primitive x-derivative
-    !!  @param dqL_prim_dy_n Left primitive y-derivative
-    !!  @param dqL_prim_dz_n Left primitive z-derivative
-    !!  @param qL_prim Left cell-boundary primitive variables
-    !!  @param qR_prim_rsx_vf Right reconstructed primitive variables in x
-    !!  @param qR_prim_rsy_vf Right reconstructed primitive variables in y
-    !!  @param qR_prim_rsz_vf Right reconstructed primitive variables in z
-    !!  @param dqR_prim_dx_n Right primitive x-derivative
-    !!  @param dqR_prim_dy_n Right primitive y-derivative
-    !!  @param dqR_prim_dz_n Right primitive z-derivative
-    !!  @param qR_prim Right cell-boundary primitive variables
-    !!  @param q_prim_qp Cell-averaged primitive variables
-    !!  @param dq_prim_dx_qp Cell-averaged primitive x-derivative
-    !!  @param dq_prim_dy_qp Cell-averaged primitive y-derivative
-    !!  @param dq_prim_dz_qp Cell-averaged primitive z-derivative
-    !!  @param ix Index bounds in the x-direction
-    !!  @param iy Index bounds in the y-direction
-    !!  @param iz Index bounds in the z-direction
-    subroutine s_get_viscous(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, &
-                             dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n, &
-                             qL_prim, &
-                             qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, &
-                             dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n, &
-                             qR_prim, &
-                             q_prim_qp, &
-                             dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp, &
-                             ix, iy, iz)
-
-        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), &
-            intent(inout) :: qL_prim_rsx_vf, qR_prim_rsx_vf, &
-                             qL_prim_rsy_vf, qR_prim_rsy_vf, &
-                             qL_prim_rsz_vf, qR_prim_rsz_vf
-
-        type(vector_field), dimension(num_dims), intent(inout) :: qL_prim, qR_prim
-
-        type(vector_field), intent(in) :: q_prim_qp
-
-        type(vector_field), dimension(1:num_dims), &
-            intent(inout) :: dqL_prim_dx_n, dqR_prim_dx_n, &
-                             dqL_prim_dy_n, dqR_prim_dy_n, &
-                             dqL_prim_dz_n, dqR_prim_dz_n
-
-        type(vector_field), dimension(1), intent(inout) :: dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp
-        type(int_bounds_info), intent(in) :: ix, iy, iz
-
-        integer :: i, j, k, l
-
-        do i = 1, num_dims
-
-            iv%beg = mom_idx%beg; iv%end = mom_idx%end
-
-            $:GPU_UPDATE(device='[iv]')
-
-            call s_reconstruct_cell_boundary_values_visc( &
-                q_prim_qp%vf(iv%beg:iv%end), &
-                qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, &
-                qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, &
-                i, qL_prim(i)%vf(iv%beg:iv%end), qR_prim(i)%vf(iv%beg:iv%end), &
-                ix, iy, iz)
-        end do
-
-        if (weno_Re_flux) then
-            ! Compute velocity gradient at cell centers using scalar
-            ! divergence theorem
-            do i = 1, num_dims
-                if (i == 1) then
-                    call s_apply_scalar_divergence_theorem( &
-                        qL_prim(i)%vf(iv%beg:iv%end), &
-                        qR_prim(i)%vf(iv%beg:iv%end), &
-                        dq_prim_dx_qp(1)%vf(iv%beg:iv%end), i, &
-                        ix, iy, iz, iv, dx, m, buff_size)
-                elseif (i == 2) then
-                    call s_apply_scalar_divergence_theorem( &
-                        qL_prim(i)%vf(iv%beg:iv%end), &
-                        qR_prim(i)%vf(iv%beg:iv%end), &
-                        dq_prim_dy_qp(1)%vf(iv%beg:iv%end), i, &
-                        ix, iy, iz, iv, dy, n, buff_size)
-                else
-                    call s_apply_scalar_divergence_theorem( &
-                        qL_prim(i)%vf(iv%beg:iv%end), &
-                        qR_prim(i)%vf(iv%beg:iv%end), &
-                        dq_prim_dz_qp(1)%vf(iv%beg:iv%end), i, &
-                        ix, iy, iz, iv, dz, p, buff_size)
-                end if
-            end do
-
-        else ! Compute velocity gradient at cell centers using finite differences
-
-            iv%beg = mom_idx%beg; iv%end = mom_idx%end
-            $:GPU_UPDATE(device='[iv]')
-
-            is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
-
-            $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]')
-
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do l = is3_viscous%beg, is3_viscous%end
-                do k = iy%beg, iy%end
-                    do j = is1_viscous%beg + 1, is1_viscous%end
-                        $:GPU_LOOP(parallelism='[seq]')
-                        do i = iv%beg, iv%end
-                            dqL_prim_dx_n(1)%vf(i)%sf(j, k, l) = &
-                                (q_prim_qp%vf(i)%sf(j, k, l) - &
-                                 q_prim_qp%vf(i)%sf(j - 1, k, l))/ &
-                                (x_cc(j) - x_cc(j - 1))
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do l = is3_viscous%beg, is3_viscous%end
-                do k = is2_viscous%beg, is2_viscous%end
-                    do j = is1_viscous%beg, is1_viscous%end - 1
-                        $:GPU_LOOP(parallelism='[seq]')
-                        do i = iv%beg, iv%end
-                            dqR_prim_dx_n(1)%vf(i)%sf(j, k, l) = &
-                                (q_prim_qp%vf(i)%sf(j + 1, k, l) - &
-                                 q_prim_qp%vf(i)%sf(j, k, l))/ &
-                                (x_cc(j + 1) - x_cc(j))
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-            if (n > 0) then
-
-                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do l = is3_viscous%beg, is3_viscous%end
-                        do j = is2_viscous%beg + 1, is2_viscous%end
-                            do k = is1_viscous%beg, is1_viscous%end
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = iv%beg, iv%end
-                                    dqL_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
-                                        (q_prim_qp%vf(i)%sf(k, j, l) - &
-                                         q_prim_qp%vf(i)%sf(k, j - 1, l))/ &
-                                        (y_cc(j) - y_cc(j - 1))
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do l = is3_viscous%beg, is3_viscous%end
-                        do j = is2_viscous%beg, is2_viscous%end - 1
-                            do k = is1_viscous%beg, is1_viscous%end
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = iv%beg, iv%end
-                                    dqR_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
-                                        (q_prim_qp%vf(i)%sf(k, j + 1, l) - &
-                                         q_prim_qp%vf(i)%sf(k, j, l))/ &
-                                        (y_cc(j + 1) - y_cc(j))
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do l = is3_viscous%beg, is3_viscous%end
-                        do j = is2_viscous%beg + 1, is2_viscous%end
-                            do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = iv%beg, iv%end
-                                    dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
-                                        (dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
-                                         dqR_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
-                                         dqL_prim_dx_n(1)%vf(i)%sf(k, j - 1, l) + &
-                                         dqR_prim_dx_n(1)%vf(i)%sf(k, j - 1, l))
-
-                                    dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* &
-                                                                         dqL_prim_dx_n(2)%vf(i)%sf(k, j, l)
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do l = is3_viscous%beg, is3_viscous%end
-                        do j = is2_viscous%beg, is2_viscous%end - 1
-                            do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = iv%beg, iv%end
-                                    dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
-                                        (dqL_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + &
-                                         dqR_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + &
-                                         dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
-                                         dqR_prim_dx_n(1)%vf(i)%sf(k, j, l))
-
-                                    dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* &
-                                                                         dqR_prim_dx_n(2)%vf(i)%sf(k, j, l)
-
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do l = is3_viscous%beg, is3_viscous%end
-                        do k = is2_viscous%beg + 1, is2_viscous%end - 1
-                            do j = is1_viscous%beg + 1, is1_viscous%end
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = iv%beg, iv%end
-                                    dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
-                                        (dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
-                                         dqR_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
-                                         dqL_prim_dy_n(2)%vf(i)%sf(j - 1, k, l) + &
-                                         dqR_prim_dy_n(2)%vf(i)%sf(j - 1, k, l))
-
-                                    dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* &
-                                                                         dqL_prim_dy_n(1)%vf(i)%sf(j, k, l)
-
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                    $:GPU_PARALLEL_LOOP(collapse=3)
-                    do l = is3_viscous%beg, is3_viscous%end
-                        do k = is2_viscous%beg + 1, is2_viscous%end - 1
-                            do j = is1_viscous%beg, is1_viscous%end - 1
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = iv%beg, iv%end
-                                    dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
-                                        (dqL_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + &
-                                         dqR_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + &
-                                         dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
-                                         dqR_prim_dy_n(2)%vf(i)%sf(j, k, l))
-
-                                    dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* &
-                                                                         dqR_prim_dy_n(1)%vf(i)%sf(j, k, l)
-
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-
-                #:endif
-
-                if (p > 0) then
-                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do j = is3_viscous%beg + 1, is3_viscous%end
-                            do l = is2_viscous%beg, is2_viscous%end
-                                do k = is1_viscous%beg, is1_viscous%end
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = iv%beg, iv%end
-
-                                        dqL_prim_dz_n(3)%vf(i)%sf(k, l, j) = &
-                                            (q_prim_qp%vf(i)%sf(k, l, j) - &
-                                             q_prim_qp%vf(i)%sf(k, l, j - 1))/ &
-                                            (z_cc(j) - z_cc(j - 1))
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do j = is3_viscous%beg, is3_viscous%end - 1
-                            do l = is2_viscous%beg, is2_viscous%end
-                                do k = is1_viscous%beg, is1_viscous%end
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = iv%beg, iv%end
-
-                                        dqR_prim_dz_n(3)%vf(i)%sf(k, l, j) = &
-                                            (q_prim_qp%vf(i)%sf(k, l, j + 1) - &
-                                             q_prim_qp%vf(i)%sf(k, l, j))/ &
-                                            (z_cc(j + 1) - z_cc(j))
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do l = is3_viscous%beg + 1, is3_viscous%end - 1
-                            do k = is2_viscous%beg, is2_viscous%end
-                                do j = is1_viscous%beg + 1, is1_viscous%end
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = iv%beg, iv%end
-
-                                        dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) = &
-                                            (dqL_prim_dz_n(3)%vf(i)%sf(j, k, l) + &
-                                             dqR_prim_dz_n(3)%vf(i)%sf(j, k, l) + &
-                                             dqL_prim_dz_n(3)%vf(i)%sf(j - 1, k, l) + &
-                                             dqR_prim_dz_n(3)%vf(i)%sf(j - 1, k, l))
-
-                                        dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* &
-                                                                             dqL_prim_dz_n(1)%vf(i)%sf(j, k, l)
-
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do l = is3_viscous%beg + 1, is3_viscous%end - 1
-                            do k = is2_viscous%beg, is2_viscous%end
-                                do j = is1_viscous%beg, is1_viscous%end - 1
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = iv%beg, iv%end
-
-                                        dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) = &
-                                            (dqL_prim_dz_n(3)%vf(i)%sf(j + 1, k, l) + &
-                                             dqR_prim_dz_n(3)%vf(i)%sf(j + 1, k, l) + &
-                                             dqL_prim_dz_n(3)%vf(i)%sf(j, k, l) + &
-                                             dqR_prim_dz_n(3)%vf(i)%sf(j, k, l))
-
-                                        dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* &
-                                                                             dqR_prim_dz_n(1)%vf(i)%sf(j, k, l)
-
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do l = is3_viscous%beg + 1, is3_viscous%end - 1
-                            do j = is2_viscous%beg + 1, is2_viscous%end
-                                do k = is1_viscous%beg, is1_viscous%end
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = iv%beg, iv%end
-
-                                        dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) = &
-                                            (dqL_prim_dz_n(3)%vf(i)%sf(k, j, l) + &
-                                             dqR_prim_dz_n(3)%vf(i)%sf(k, j, l) + &
-                                             dqL_prim_dz_n(3)%vf(i)%sf(k, j - 1, l) + &
-                                             dqR_prim_dz_n(3)%vf(i)%sf(k, j - 1, l))
-
-                                        dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* &
-                                                                             dqL_prim_dz_n(2)%vf(i)%sf(k, j, l)
-
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do l = is3_viscous%beg + 1, is3_viscous%end - 1
-                            do j = is2_viscous%beg, is2_viscous%end - 1
-                                do k = is1_viscous%beg, is1_viscous%end
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = iv%beg, iv%end
-
-                                        dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) = &
-                                            (dqL_prim_dz_n(3)%vf(i)%sf(k, j + 1, l) + &
-                                             dqR_prim_dz_n(3)%vf(i)%sf(k, j + 1, l) + &
-                                             dqL_prim_dz_n(3)%vf(i)%sf(k, j, l) + &
-                                             dqR_prim_dz_n(3)%vf(i)%sf(k, j, l))
-
-                                        dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* &
-                                                                             dqR_prim_dz_n(2)%vf(i)%sf(k, j, l)
-
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do j = is3_viscous%beg + 1, is3_viscous%end
-                            do l = is2_viscous%beg + 1, is2_viscous%end - 1
-                                do k = is1_viscous%beg, is1_viscous%end
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = iv%beg, iv%end
-
-                                        dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) = &
-                                            (dqL_prim_dy_n(2)%vf(i)%sf(k, l, j) + &
-                                             dqR_prim_dy_n(2)%vf(i)%sf(k, l, j) + &
-                                             dqL_prim_dy_n(2)%vf(i)%sf(k, l, j - 1) + &
-                                             dqR_prim_dy_n(2)%vf(i)%sf(k, l, j - 1))
-
-                                        dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* &
-                                                                             dqL_prim_dy_n(3)%vf(i)%sf(k, l, j)
-
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do j = is3_viscous%beg, is3_viscous%end - 1
-                            do l = is2_viscous%beg + 1, is2_viscous%end - 1
-                                do k = is1_viscous%beg, is1_viscous%end
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = iv%beg, iv%end
-
-                                        dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) = &
-                                            (dqL_prim_dy_n(2)%vf(i)%sf(k, l, j + 1) + &
-                                             dqR_prim_dy_n(2)%vf(i)%sf(k, l, j + 1) + &
-                                             dqL_prim_dy_n(2)%vf(i)%sf(k, l, j) + &
-                                             dqR_prim_dy_n(2)%vf(i)%sf(k, l, j))
-
-                                        dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* &
-                                                                             dqR_prim_dy_n(3)%vf(i)%sf(k, l, j)
-
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do j = is3_viscous%beg + 1, is3_viscous%end
-                            do l = is2_viscous%beg, is2_viscous%end
-                                do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = iv%beg, iv%end
-
-                                        dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) = &
-                                            (dqL_prim_dx_n(1)%vf(i)%sf(k, l, j) + &
-                                             dqR_prim_dx_n(1)%vf(i)%sf(k, l, j) + &
-                                             dqL_prim_dx_n(1)%vf(i)%sf(k, l, j - 1) + &
-                                             dqR_prim_dx_n(1)%vf(i)%sf(k, l, j - 1))
-
-                                        dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* &
-                                                                             dqL_prim_dx_n(3)%vf(i)%sf(k, l, j)
-
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-                        $:GPU_PARALLEL_LOOP(collapse=3)
-                        do j = is3_viscous%beg, is3_viscous%end - 1
-                            do l = is2_viscous%beg, is2_viscous%end
-                                do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = iv%beg, iv%end
-                                        dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = &
-                                            (dqL_prim_dx_n(1)%vf(i)%sf(k, l, j + 1) + &
-                                             dqR_prim_dx_n(1)%vf(i)%sf(k, l, j + 1) + &
-                                             dqL_prim_dx_n(1)%vf(i)%sf(k, l, j) + &
-                                             dqR_prim_dx_n(1)%vf(i)%sf(k, l, j))
-
-                                        dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* &
-                                                                             dqR_prim_dx_n(3)%vf(i)%sf(k, l, j)
-
-                                    end do
-                                end do
-                            end do
-                        end do
-                        $:END_GPU_PARALLEL_LOOP()
-
-                        do i = iv%beg, iv%end
-                            call s_compute_fd_gradient(q_prim_qp%vf(i), &
-                                                       dq_prim_dx_qp(1)%vf(i), &
-                                                       dq_prim_dy_qp(1)%vf(i), &
-                                                       dq_prim_dz_qp(1)%vf(i))
-                        end do
-                    #:endif
-
-                else
-
-                    do i = iv%beg, iv%end
-                        call s_compute_fd_gradient(q_prim_qp%vf(i), &
-                                                   dq_prim_dx_qp(1)%vf(i), &
-                                                   dq_prim_dy_qp(1)%vf(i), &
-                                                   dq_prim_dy_qp(1)%vf(i))
-                    end do
-
-                end if
-
-            else
-
-                do i = iv%beg, iv%end
-                    call s_compute_fd_gradient(q_prim_qp%vf(i), &
-                                               dq_prim_dx_qp(1)%vf(i), &
-                                               dq_prim_dx_qp(1)%vf(i), &
-                                               dq_prim_dx_qp(1)%vf(i))
-                end do
-
-            end if
-
-        end if
-
-    end subroutine s_get_viscous
-
-    !> @brief Reconstructs left and right cell-boundary values of viscous primitive variables using WENO or MUSCL.
-    subroutine s_reconstruct_cell_boundary_values_visc(v_vf, vL_x, vL_y, vL_z, vR_x, vR_y, vR_z, &
-                                                       norm_dir, vL_prim_vf, vR_prim_vf, ix, iy, iz)
-
-        type(scalar_field), dimension(iv%beg:iv%end), intent(in) :: v_vf
-        type(scalar_field), dimension(iv%beg:iv%end), intent(inout) :: vL_prim_vf, vR_prim_vf
-
-        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: vL_x, vL_y, vL_z, vR_x, vR_y, vR_z
-        integer, intent(in) :: norm_dir
-        type(int_bounds_info), intent(in) :: ix, iy, iz
-
-        integer :: recon_dir !< Coordinate direction of the WENO reconstruction
-
-        integer :: i, j, k, l
-
-        #:for SCHEME, TYPE in [('weno','WENO_TYPE'), ('muscl','MUSCL_TYPE')]
-            if (recon_type == ${TYPE}$ .or. dummy) then
-                ! Reconstruction in s1-direction
-
-                if (norm_dir == 1) then
-                    is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
-                    recon_dir = 1; is1_viscous%beg = is1_viscous%beg + ${SCHEME}$_polyn
-                    is1_viscous%end = is1_viscous%end - ${SCHEME}$_polyn
-
-                elseif (norm_dir == 2) then
-                    is1_viscous = iy; is2_viscous = ix; is3_viscous = iz
-                    recon_dir = 2; is1_viscous%beg = is1_viscous%beg + ${SCHEME}$_polyn
-                    is1_viscous%end = is1_viscous%end - ${SCHEME}$_polyn
-
-                else
-                    is1_viscous = iz; is2_viscous = iy; is3_viscous = ix
-                    recon_dir = 3; is1_viscous%beg = is1_viscous%beg + ${SCHEME}$_polyn
-                    is1_viscous%end = is1_viscous%end - ${SCHEME}$_polyn
-
-                end if
-
-                $:GPU_UPDATE(device='[is1_viscous, is2_viscous, is3_viscous, iv]')
-                if (n > 0) then
-                    if (p > 0) then
-                        call s_${SCHEME}$ (v_vf(iv%beg:iv%end), &
-                                           vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, iv%beg:iv%end), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, iv%beg:iv%end), &
-                                           recon_dir, &
-                                           is1_viscous, is2_viscous, is3_viscous)
-                    else
-                        call s_${SCHEME}$ (v_vf(iv%beg:iv%end), &
-                                           vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, :), &
-                                           recon_dir, &
-                                           is1_viscous, is2_viscous, is3_viscous)
-                    end if
-                else
-                    call s_${SCHEME}$ (v_vf(iv%beg:iv%end), &
-                                       vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, :), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, :), vR_z(:, :, :, :), &
-                                       recon_dir, &
-                                       is1_viscous, is2_viscous, is3_viscous)
-                end if
-            end if
-        #:endfor
-
-        if (viscous .or. dummy) then
-            if (weno_Re_flux) then
-                if (norm_dir == 2) then
-                    $:GPU_PARALLEL_LOOP(collapse=4)
-                    do i = iv%beg, iv%end
-                        do l = is3_viscous%beg, is3_viscous%end
-                            do j = is1_viscous%beg, is1_viscous%end
-                                do k = is2_viscous%beg, is2_viscous%end
-                                    vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i)
-                                    vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i)
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                elseif (norm_dir == 3) then
-                    $:GPU_PARALLEL_LOOP(collapse=4)
-                    do i = iv%beg, iv%end
-                        do j = is1_viscous%beg, is1_viscous%end
-                            do k = is2_viscous%beg, is2_viscous%end
-                                do l = is3_viscous%beg, is3_viscous%end
-                                    vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i)
-                                    vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i)
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                elseif (norm_dir == 1) then
-                    $:GPU_PARALLEL_LOOP(collapse=4)
-                    do i = iv%beg, iv%end
-                        do l = is3_viscous%beg, is3_viscous%end
-                            do k = is2_viscous%beg, is2_viscous%end
-                                do j = is1_viscous%beg, is1_viscous%end
-                                    vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i)
-                                    vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i)
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                end if
-            end if
-        end if
-
-    end subroutine s_reconstruct_cell_boundary_values_visc
-
-    !> @brief Reconstructs left and right cell-boundary values of viscous primitive variable derivatives using WENO or MUSCL.
-    subroutine s_reconstruct_cell_boundary_values_visc_deriv(v_vf, vL_x, vL_y, vL_z, vR_x, vR_y, vR_z, &
-                                                             norm_dir, vL_prim_vf, vR_prim_vf, ix, iy, iz)
-        type(scalar_field), dimension(iv%beg:iv%end), intent(in) :: v_vf
-        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, iv%beg:), intent(inout) :: vL_x, vL_y, vL_z, vR_x, vR_y, vR_z
-        type(scalar_field), dimension(iv%beg:iv%end), intent(inout) :: vL_prim_vf, vR_prim_vf
-        type(int_bounds_info), intent(in) :: ix, iy, iz
-
-        integer, intent(IN) :: norm_dir
-
-        integer :: recon_dir !< Coordinate direction of the WENO reconstruction
-
-        integer :: i, j, k, l
-        #:for SCHEME, TYPE in [('weno','WENO_TYPE'), ('muscl','MUSCL_TYPE')]
-            if (recon_type == ${TYPE}$) then
-                ! Reconstruction in s1-direction
-
-                if (norm_dir == 1) then
-                    is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
-                    recon_dir = 1; is1_viscous%beg = is1_viscous%beg + ${SCHEME}$_polyn
-                    is1_viscous%end = is1_viscous%end - ${SCHEME}$_polyn
-
-                elseif (norm_dir == 2) then
-                    is1_viscous = iy; is2_viscous = ix; is3_viscous = iz
-                    recon_dir = 2; is1_viscous%beg = is1_viscous%beg + ${SCHEME}$_polyn
-                    is1_viscous%end = is1_viscous%end - ${SCHEME}$_polyn
-
-                else
-                    is1_viscous = iz; is2_viscous = iy; is3_viscous = ix
-                    recon_dir = 3; is1_viscous%beg = is1_viscous%beg + ${SCHEME}$_polyn
-                    is1_viscous%end = is1_viscous%end - ${SCHEME}$_polyn
-
-                end if
-                $:GPU_UPDATE(device='[is1_viscous, is2_viscous, is3_viscous, iv]')
-                if (n > 0) then
-                    if (p > 0) then
-
-                        call s_${SCHEME}$ (v_vf(iv%beg:iv%end), &
-                                           vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, iv%beg:iv%end), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, iv%beg:iv%end), &
-                                           recon_dir, &
-                                           is1_viscous, is2_viscous, is3_viscous)
-                    else
-                        call s_${SCHEME}$ (v_vf(iv%beg:iv%end), &
-                                           vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, :), &
-                                           recon_dir, &
-                                           is1_viscous, is2_viscous, is3_viscous)
-                    end if
-                else
-
-                    call s_${SCHEME}$ (v_vf(iv%beg:iv%end), &
-                                       vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, :), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, :), vR_z(:, :, :, :), &
-                                       recon_dir, &
-                                       is1_viscous, is2_viscous, is3_viscous)
-                end if
-            end if
-        #:endfor
-
-        if (viscous .or. dummy) then
-            if (weno_Re_flux) then
-                if (norm_dir == 2) then
-                    $:GPU_PARALLEL_LOOP(collapse=4)
-                    do i = iv%beg, iv%end
-                        do l = is3_viscous%beg, is3_viscous%end
-                            do j = is1_viscous%beg, is1_viscous%end
-                                do k = is2_viscous%beg, is2_viscous%end
-                                    vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i)
-                                    vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i)
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                elseif (norm_dir == 3) then
-                    $:GPU_PARALLEL_LOOP(collapse=4)
-                    do i = iv%beg, iv%end
-                        do j = is1_viscous%beg, is1_viscous%end
-                            do k = is2_viscous%beg, is2_viscous%end
-                                do l = is3_viscous%beg, is3_viscous%end
-                                    vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i)
-                                    vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i)
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                elseif (norm_dir == 1) then
-                    $:GPU_PARALLEL_LOOP(collapse=4)
-                    do i = iv%beg, iv%end
-                        do l = is3_viscous%beg, is3_viscous%end
-                            do k = is2_viscous%beg, is2_viscous%end
-                                do j = is1_viscous%beg, is1_viscous%end
-                                    vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i)
-                                    vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i)
-                                end do
-                            end do
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                end if
-            end if
-        end if
-
-    end subroutine s_reconstruct_cell_boundary_values_visc_deriv
-
-    !>  The purpose of this subroutine is to employ the inputted
-        !!      left and right cell-boundary integral-averaged variables
-        !!      to compute the relevant cell-average first-order spatial
-        !!      derivatives in the x-, y- or z-direction by means of the
-        !!      scalar divergence theorem.
-        !!  @param vL_vf Left cell-boundary integral averages
-        !!  @param vR_vf Right cell-boundary integral averages
-        !!  @param dv_ds_vf Cell-average first-order spatial derivatives
-        !!  @param norm_dir Splitting coordinate direction
-        !!  @param ix Index bounds in the x-direction
-        !!  @param iy Index bounds in the y-direction
-        !!  @param iz Index bounds in the z-direction
-        !!  @param iv_in Variable index bounds
-        !!  @param dL Cell width array
-        !!  @param dim Dimension size
-        !!  @param buff_size_in Buffer layer size
-    subroutine s_apply_scalar_divergence_theorem(vL_vf, vR_vf, &
-                                                 dv_ds_vf, &
-                                                 norm_dir, &
-                                                 ix, iy, iz, iv_in, &
-                                                 dL, dim, buff_size_in)
-
-        ! arrays of cell widths
-        type(scalar_field), &
-            dimension(iv%beg:iv%end), &
-            intent(in) :: vL_vf, vR_vf
-
-        type(scalar_field), &
-            dimension(iv%beg:iv%end), &
-            intent(inout) :: dv_ds_vf
-
-        integer, intent(in) :: norm_dir
-        type(int_bounds_info), intent(in) :: ix, iy, iz, iv_in
-        integer, intent(in) :: dim, buff_size_in
-        real(wp), dimension(-buff_size_in:dim + buff_size_in), intent(in) :: dL
-
-        integer :: i, j, k, l !< Generic loop iterators
-
-        is1_viscous = ix
-        is2_viscous = iy
-        is3_viscous = iz
-        iv = iv_in
-
-        $:GPU_UPDATE(device='[is1_viscous, is2_viscous, is3_viscous, iv]')
-
-        ! First-Order Spatial Derivatives in x-direction
-        if (norm_dir == 1) then
-
-            ! A general application of the scalar divergence theorem that
-            ! utilizes the left and right cell-boundary integral-averages,
-            ! inside each cell, or an arithmetic mean of these two at the
-            ! cell-boundaries, to calculate the cell-averaged first-order
-            ! spatial derivatives inside the cell.
-
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do l = is3_viscous%beg, is3_viscous%end
-                do k = is2_viscous%beg, is2_viscous%end
-                    do j = is1_viscous%beg + 1, is1_viscous%end - 1
-                        $:GPU_LOOP(parallelism='[seq]')
-                        do i = iv%beg, iv%end
-                            dv_ds_vf(i)%sf(j, k, l) = &
-                                1._wp/((1._wp + wa_flg)*dL(j)) &
-                                *(wa_flg*vL_vf(i)%sf(j + 1, k, l) &
-                                  + vR_vf(i)%sf(j, k, l) &
-                                  - vL_vf(i)%sf(j, k, l) &
-                                  - wa_flg*vR_vf(i)%sf(j - 1, k, l))
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-            ! END: First-Order Spatial Derivatives in x-direction
-
-            ! First-Order Spatial Derivatives in y-direction
-        elseif (norm_dir == 2) then
-
-            ! A general application of the scalar divergence theorem that
-            ! utilizes the left and right cell-boundary integral-averages,
-            ! inside each cell, or an arithmetic mean of these two at the
-            ! cell-boundaries, to calculate the cell-averaged first-order
-            ! spatial derivatives inside the cell.
-
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do l = is3_viscous%beg, is3_viscous%end
-                do k = is2_viscous%beg + 1, is2_viscous%end - 1
-                    do j = is1_viscous%beg, is1_viscous%end
-                        $:GPU_LOOP(parallelism='[seq]')
-                        do i = iv%beg, iv%end
-                            dv_ds_vf(i)%sf(j, k, l) = &
-                                1._wp/((1._wp + wa_flg)*dL(k)) &
-                                *(wa_flg*vL_vf(i)%sf(j, k + 1, l) &
-                                  + vR_vf(i)%sf(j, k, l) &
-                                  - vL_vf(i)%sf(j, k, l) &
-                                  - wa_flg*vR_vf(i)%sf(j, k - 1, l))
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-            ! END: First-Order Spatial Derivatives in y-direction
-
-            ! First-Order Spatial Derivatives in z-direction
-        else
-
-            ! A general application of the scalar divergence theorem that
-            ! utilizes the left and right cell-boundary integral-averages,
-            ! inside each cell, or an arithmetic mean of these two at the
-            ! cell-boundaries, to calculate the cell-averaged first-order
-            ! spatial derivatives inside the cell.
-
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do l = is3_viscous%beg + 1, is3_viscous%end - 1
-                do k = is2_viscous%beg, is2_viscous%end
-                    do j = is1_viscous%beg, is1_viscous%end
-                        $:GPU_LOOP(parallelism='[seq]')
-                        do i = iv%beg, iv%end
-                            dv_ds_vf(i)%sf(j, k, l) = &
-                                1._wp/((1._wp + wa_flg)*dL(l)) &
-                                *(wa_flg*vL_vf(i)%sf(j, k, l + 1) &
-                                  + vR_vf(i)%sf(j, k, l) &
-                                  - vL_vf(i)%sf(j, k, l) &
-                                  - wa_flg*vR_vf(i)%sf(j, k, l - 1))
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-        end if
-        ! END: First-Order Spatial Derivatives in z-direction
-
-    end subroutine s_apply_scalar_divergence_theorem
-
-    !>  Computes the scalar gradient fields via finite differences
-        !!  @param var Variable to compute derivative of
-        !!  @param grad_x First coordinate direction component of the derivative
-        !!  @param grad_y Second coordinate direction component of the derivative
-        !!  @param grad_z Third coordinate direction component of the derivative
-    subroutine s_compute_fd_gradient(var, grad_x, grad_y, grad_z)
-
-        type(scalar_field), intent(in) :: var
-        type(scalar_field), intent(inout) :: grad_x
-        type(scalar_field), intent(inout) :: grad_y
-        type(scalar_field), intent(inout) :: grad_z
-        type(int_bounds_info) :: ix, iy, iz
-
-        integer :: j, k, l !< Generic loop iterators
-
-        ix%beg = 1 - buff_size; ix%end = m + buff_size - 1
-        if (n > 0) then
-            iy%beg = 1 - buff_size; iy%end = n + buff_size - 1
-        else
-            iy%beg = 0; iy%end = 0
-        end if
-
-        if (p > 0) then
-            iz%beg = 1 - buff_size; iz%end = p + buff_size - 1
-        else
-            iz%beg = 0; iz%end = 0
-        end if
-
-        is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
-
-        $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]')
-
-        $:GPU_PARALLEL_LOOP(collapse=3)
-        do l = is3_viscous%beg, is3_viscous%end
-            do k = is2_viscous%beg, is2_viscous%end
-                do j = is1_viscous%beg, is1_viscous%end
-                    grad_x%sf(j, k, l) = &
-                        (var%sf(j + 1, k, l) - var%sf(j - 1, k, l))/ &
-                        (x_cc(j + 1) - x_cc(j - 1))
-                end do
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        if (n > 0) then
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do l = is3_viscous%beg, is3_viscous%end
-                do k = is2_viscous%beg, is2_viscous%end
-                    do j = is1_viscous%beg, is1_viscous%end
-                        grad_y%sf(j, k, l) = &
-                            (var%sf(j, k + 1, l) - var%sf(j, k - 1, l))/ &
-                            (y_cc(k + 1) - y_cc(k - 1))
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end if
-
-        if (p > 0) then
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do l = is3_viscous%beg, is3_viscous%end
-                do k = is2_viscous%beg, is2_viscous%end
-                    do j = is1_viscous%beg, is1_viscous%end
-                        grad_z%sf(j, k, l) = &
-                            (var%sf(j, k, l + 1) - var%sf(j, k, l - 1))/ &
-                            (z_cc(l + 1) - z_cc(l - 1))
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end if
-
-        $:GPU_PARALLEL_LOOP(collapse=2)
-        do l = idwbuff(3)%beg, idwbuff(3)%end
-            do k = idwbuff(2)%beg, idwbuff(2)%end
-                grad_x%sf(idwbuff(1)%beg, k, l) = &
-                    (-3._wp*var%sf(idwbuff(1)%beg, k, l) + 4._wp*var%sf(idwbuff(1)%beg + 1, k, l) - var%sf(idwbuff(1)%beg + 2, k, l))/ &
-                    (x_cc(idwbuff(1)%beg + 2) - x_cc(idwbuff(1)%beg))
-                grad_x%sf(idwbuff(1)%end, k, l) = &
-                    (+3._wp*var%sf(idwbuff(1)%end, k, l) - 4._wp*var%sf(idwbuff(1)%end - 1, k, l) + var%sf(idwbuff(1)%end - 2, k, l))/ &
-                    (x_cc(idwbuff(1)%end) - x_cc(idwbuff(1)%end - 2))
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-        if (n > 0) then
-            $:GPU_PARALLEL_LOOP(collapse=2)
-            do l = idwbuff(3)%beg, idwbuff(3)%end
-                do j = idwbuff(1)%beg, idwbuff(1)%end
-                    grad_y%sf(j, idwbuff(2)%beg, l) = &
-                        (-3._wp*var%sf(j, idwbuff(2)%beg, l) + 4._wp*var%sf(j, idwbuff(2)%beg + 1, l) - var%sf(j, idwbuff(2)%beg + 2, l))/ &
-                        (y_cc(idwbuff(2)%beg + 2) - y_cc(idwbuff(2)%beg))
-                    grad_y%sf(j, idwbuff(2)%end, l) = &
-                        (+3._wp*var%sf(j, idwbuff(2)%end, l) - 4._wp*var%sf(j, idwbuff(2)%end - 1, l) + var%sf(j, idwbuff(2)%end - 2, l))/ &
-                        (y_cc(idwbuff(2)%end) - y_cc(idwbuff(2)%end - 2))
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-            if (p > 0) then
-                $:GPU_PARALLEL_LOOP(collapse=2)
-                do k = idwbuff(2)%beg, idwbuff(2)%end
-                    do j = idwbuff(1)%beg, idwbuff(1)%end
-                        grad_z%sf(j, k, idwbuff(3)%beg) = &
-                            (-3._wp*var%sf(j, k, idwbuff(3)%beg) + 4._wp*var%sf(j, k, idwbuff(3)%beg + 1) - var%sf(j, k, idwbuff(3)%beg + 2))/ &
-                            (z_cc(idwbuff(3)%beg + 2) - z_cc(is3_viscous%beg))
-                        grad_z%sf(j, k, idwbuff(3)%end) = &
-                            (+3._wp*var%sf(j, k, idwbuff(3)%end) - 4._wp*var%sf(j, k, idwbuff(3)%end - 1) + var%sf(j, k, idwbuff(3)%end - 2))/ &
-                            (z_cc(idwbuff(3)%end) - z_cc(idwbuff(3)%end - 2))
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-        end if
-
-        if (bc_x%beg <= BC_GHOST_EXTRAP) then
-            $:GPU_PARALLEL_LOOP(collapse=2)
-            do l = idwbuff(3)%beg, idwbuff(3)%end
-                do k = idwbuff(2)%beg, idwbuff(2)%end
-                    grad_x%sf(0, k, l) = (-3._wp*var%sf(0, k, l) + 4._wp*var%sf(1, k, l) - var%sf(2, k, l))/ &
-                                         (x_cc(2) - x_cc(0))
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end if
-        if (bc_x%end <= BC_GHOST_EXTRAP) then
-            $:GPU_PARALLEL_LOOP(collapse=2)
-            do l = idwbuff(3)%beg, idwbuff(3)%end
-                do k = idwbuff(2)%beg, idwbuff(2)%end
-                    grad_x%sf(m, k, l) = (3._wp*var%sf(m, k, l) - 4._wp*var%sf(m - 1, k, l) + var%sf(m - 2, k, l))/ &
-                                         (x_cc(m) - x_cc(m - 2))
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end if
-        if (n > 0) then
-            if (bc_y%beg <= BC_GHOST_EXTRAP .and. bc_y%beg /= BC_NULL) then
-                $:GPU_PARALLEL_LOOP(collapse=2)
-                do l = idwbuff(3)%beg, idwbuff(3)%end
-                    do j = idwbuff(1)%beg, idwbuff(1)%end
-                        grad_y%sf(j, 0, l) = (-3._wp*var%sf(j, 0, l) + 4._wp*var%sf(j, 1, l) - var%sf(j, 2, l))/ &
-                                             (y_cc(2) - y_cc(0))
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-            if (bc_y%end <= BC_GHOST_EXTRAP) then
-                $:GPU_PARALLEL_LOOP(collapse=2)
-                do l = idwbuff(3)%beg, idwbuff(3)%end
-                    do j = idwbuff(1)%beg, idwbuff(1)%end
-                        grad_y%sf(j, n, l) = (3._wp*var%sf(j, n, l) - 4._wp*var%sf(j, n - 1, l) + var%sf(j, n - 2, l))/ &
-                                             (y_cc(n) - y_cc(n - 2))
-                    end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
-            end if
-            if (p > 0) then
-                if (bc_z%beg <= BC_GHOST_EXTRAP) then
-                    $:GPU_PARALLEL_LOOP(collapse=2)
-                    do k = idwbuff(2)%beg, idwbuff(2)%end
-                        do j = idwbuff(1)%beg, idwbuff(1)%end
-                            grad_z%sf(j, k, 0) = &
-                                (-3._wp*var%sf(j, k, 0) + 4._wp*var%sf(j, k, 1) - var%sf(j, k, 2))/ &
-                                (z_cc(2) - z_cc(0))
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                end if
-                if (bc_z%end <= BC_GHOST_EXTRAP) then
-                    $:GPU_PARALLEL_LOOP(collapse=2)
-                    do k = idwbuff(2)%beg, idwbuff(2)%end
-                        do j = idwbuff(1)%beg, idwbuff(1)%end
-                            grad_z%sf(j, k, p) = &
-                                (3._wp*var%sf(j, k, p) - 4._wp*var%sf(j, k, p - 1) + var%sf(j, k, p - 2))/ &
-                                (z_cc(p) - z_cc(p - 2))
-                        end do
-                    end do
-                    $:END_GPU_PARALLEL_LOOP()
-                end if
-            end if
-        end if
-
-    end subroutine s_compute_fd_gradient
-
-    !> @brief Computes the viscous stress tensor at a single grid cell using finite-difference velocity gradients.
-    subroutine s_compute_viscous_stress_tensor(viscous_stress_tensor, q_prim_vf, dynamic_viscosity, i, j, k)
-        $:GPU_ROUTINE(parallelism='[seq]')
-
-        real(wp), dimension(1:3, 1:3), intent(inout) :: viscous_stress_tensor
-        type(scalar_field), dimension(1:sys_size), intent(in) :: q_prim_vf
-        real(wp), intent(in) :: dynamic_viscosity
-        integer, intent(in) :: i, j, k
-
-        real(wp), dimension(1:3, 1:3) :: velocity_gradient_tensor
-        real(wp), dimension(1:3) :: dx
-        real(wp) :: divergence
-        integer :: l, q ! iterators
-
-        ! zero the viscous stress, collection of velocity diriviatives, and spacial finite differences
-        viscous_stress_tensor = 0._wp
-        velocity_gradient_tensor = 0._wp
-        dx = 0._wp
-
-        ! get the change in x used in the finite difference equaiont
-        dx(1) = 0.5_wp*(x_cc(i + 1) - x_cc(i - 1))
-        dx(2) = 0.5_wp*(y_cc(j + 1) - y_cc(j - 1))
-        if (num_dims == 3) then
-            dx(3) = 0.5_wp*(z_cc(k + 1) - z_cc(k - 1))
-        end if
-
-        ! compute the velocity gradient tensor
-        do l = 1, num_dims
-            velocity_gradient_tensor(l, 1) = (q_prim_vf(momxb + l - 1)%sf(i + 1, j, k) - q_prim_vf(momxb + l - 1)%sf(i - 1, j, k))/(2._wp*dx(1))
-            velocity_gradient_tensor(l, 2) = (q_prim_vf(momxb + l - 1)%sf(i, j + 1, k) - q_prim_vf(momxb + l - 1)%sf(i, j - 1, k))/(2._wp*dx(2))
-            if (num_dims == 3) then
-                velocity_gradient_tensor(l, 3) = (q_prim_vf(momxb + l - 1)%sf(i, j, k + 1) - q_prim_vf(momxb + l - 1)%sf(i, j, k - 1))/(2._wp*dx(3))
-            end if
-        end do
-
-        ! compute divergence
-        divergence = 0._wp
-        do l = 1, num_dims
-            divergence = divergence + velocity_gradient_tensor(l, l)
-        end do
-
-        ! set up the shear stress tensor
-        do l = 1, num_dims
-            do q = 1, num_dims
-                viscous_stress_tensor(l, q) = dynamic_viscosity*(velocity_gradient_tensor(l, q) + velocity_gradient_tensor(q, l))
-            end do
-        end do
-
-        ! populate the viscous_stress_tensor
-        do l = 1, num_dims
-            viscous_stress_tensor(l, l) = viscous_stress_tensor(l, l) - 2._wp*divergence*dynamic_viscosity/3._wp
-        end do
-
-        if (num_dims == 2) then
-            do l = 1, 3
-                viscous_stress_tensor(3, l) = 0._wp
-                viscous_stress_tensor(l, 3) = 0._wp
-            end do
-        end if
-
-    end subroutine s_compute_viscous_stress_tensor
-
-    !> @brief Deallocates the viscous Reynolds number arrays.
-    impure subroutine s_finalize_viscous_module()
-
-        @:DEALLOCATE(Res_viscous)
-
-    end subroutine s_finalize_viscous_module
-
-end module m_viscous
+!>
+!! @file
+!! @brief Contains module m_viscous
+#:include 'case.fpp'
+#:include 'macros.fpp'
+
+!> @brief Computes viscous stress tensors and diffusive flux contributions for the Navier--Stokes equations
+module m_viscous
+
+    use m_derived_types        !< Definitions of the derived types
+
+    use m_global_parameters    !< Definitions of the global parameters
+
+    use m_weno
+
+    use m_muscl                !< Monotonic Upstream-centered (MUSCL)
+                               !! schemes for conservation laws
+
+    use m_helper
+
+    use m_finite_differences
+
+    use m_re_visc
+
+    private; public s_get_viscous, &
+ s_compute_viscous_stress_cylindrical_boundary, &
+ s_initialize_viscous_module, &
+ s_reconstruct_cell_boundary_values_visc_deriv, &
+ s_finalize_viscous_module, &
+ s_compute_viscous_stress_tensor
+
+    type(int_bounds_info) :: iv
+    type(int_bounds_info) :: is1_viscous, is2_viscous, is3_viscous
+    $:GPU_DECLARE(create='[is1_viscous,is2_viscous,is3_viscous,iv]')
+
+    ! Note: Static Res_viscous array removed - s_compute_re_visc handles
+    ! both Newtonian and non-Newtonian cases dynamically
+
+contains
+
+    !> @brief Allocates and populates the viscous Reynolds number arrays and transfers data to the GPU.
+    impure subroutine s_initialize_viscous_module
+
+        $:GPU_UPDATE(device='[Re_idx,Re_size]')
+        $:GPU_ENTER_DATA(copyin='[is1_viscous,is2_viscous,is3_viscous,iv]')
+
+    end subroutine s_initialize_viscous_module
+
+    !> The purpose of this subroutine is to compute the viscous
+    !      stress tensor for the cells directly next to the axis in
+    !      cylindrical coordinates. This is necessary to avoid the
+    !      1/r singularity that arises at the cell boundary coinciding
+    !      with the axis, i.e., y_cb(-1) = 0.
+    !  @param q_prim_vf Cell-average primitive variables
+    !  @param grad_x_vf Cell-average primitive variable derivatives, x-dir
+    !  @param grad_y_vf Cell-average primitive variable derivatives, y-dir
+    !  @param grad_z_vf Cell-average primitive variable derivatives, z-dir
+    subroutine s_compute_viscous_stress_cylindrical_boundary(q_prim_vf, grad_x_vf, grad_y_vf, grad_z_vf, &
+                                                             tau_Re_vf, &
+                                                             ix, iy, iz)
+
+        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
+        type(scalar_field), dimension(num_dims), intent(in) :: grad_x_vf, grad_y_vf, grad_z_vf
+        type(scalar_field), dimension(1:sys_size), intent(inout) :: tau_Re_vf
+        type(int_bounds_info), intent(in) :: ix, iy, iz
+
+        real(wp) :: rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum  !< Mixture variables
+        real(wp), dimension(2) :: Re_visc
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_visc, alpha_rho_visc
+            real(wp), dimension(3, 3) :: tau_Re
+            real(wp), dimension(3, 2) :: Re_visc_nn
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_visc, alpha_rho_visc
+            real(wp), dimension(num_dims, num_dims) :: tau_Re
+            real(wp), dimension(num_fluids, 2) :: Re_visc_nn
+        #:endif
+
+        integer :: i, j, k, l, q !< Generic loop iterator
+
+        is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
+
+        $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]')
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do l = is3_viscous%beg, is3_viscous%end
+            do k = is2_viscous%beg, is2_viscous%end
+                do j = is1_viscous%beg, is1_viscous%end
+                    $:GPU_LOOP(parallelism='[seq]')
+                    do i = momxb, E_idx
+                        tau_Re_vf(i)%sf(j, k, l) = 0._wp
+                    end do
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+            if (shear_stress) then    ! Shear stresses
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum ,alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
+                do l = is3_viscous%beg, is3_viscous%end
+                    do k = -1, 1
+                        do j = is1_viscous%beg, is1_viscous%end
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
+                                if (bubbles_euler .and. num_fluids == 1) then
+                                    alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l)
+                                else
+                                    alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
+                                end if
+                            end do
+
+                            if (bubbles_euler) then
+                                rho_visc = 0._wp
+                                gamma_visc = 0._wp
+                                pi_inf_visc = 0._wp
+
+                                if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        rho_visc = rho_visc + alpha_rho_visc(i)
+                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                    end do
+                                else if ((model_eqns == 2) .and. (num_fluids > 2)) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids - 1
+                                        rho_visc = rho_visc + alpha_rho_visc(i)
+                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                    end do
+                                else
+                                    rho_visc = alpha_rho_visc(1)
+                                    gamma_visc = gammas(1)
+                                    pi_inf_visc = pi_infs(1)
+                                end if
+                            else
+                                rho_visc = 0._wp
+                                gamma_visc = 0._wp
+                                pi_inf_visc = 0._wp
+
+                                alpha_visc_sum = 0._wp
+
+                                if (mpp_lim) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
+                                        alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
+                                        alpha_visc_sum = alpha_visc_sum + alpha_visc(i)
+                                    end do
+
+                                    alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps)
+
+                                end if
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    rho_visc = rho_visc + alpha_rho_visc(i)
+                                    gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                    pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                end do
+
+                                if (viscous) then
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_visc, j, k, l, &
+                                        Re_visc_nn, grad_x_vf, &
+                                        grad_y_vf, grad_z_vf)
+                                    call s_compute_mixture_re(alpha_visc, Re_visc_nn, Re_visc)
+                                end if
+                            end if
+
+                            tau_Re(2, 1) = (grad_y_vf(1)%sf(j, k, l) + &
+                                            grad_x_vf(2)%sf(j, k, l))/ &
+                                           Re_visc(1)
+
+                            tau_Re(2, 2) = (4._wp*grad_y_vf(2)%sf(j, k, l) &
+                                            - 2._wp*grad_x_vf(1)%sf(j, k, l) &
+                                            - 2._wp*q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ &
+                                           (3._wp*Re_visc(1))
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, 2
+                                tau_Re_vf(contxe + i)%sf(j, k, l) = &
+                                    tau_Re_vf(contxe + i)%sf(j, k, l) - &
+                                    tau_Re(2, i)
+
+                                tau_Re_vf(E_idx)%sf(j, k, l) = &
+                                    tau_Re_vf(E_idx)%sf(j, k, l) - &
+                                    q_prim_vf(contxe + i)%sf(j, k, l)*tau_Re(2, i)
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+        #:endif
+
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+            if (bulk_stress) then    ! Bulk stresses
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum ,alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
+                do l = is3_viscous%beg, is3_viscous%end
+                    do k = -1, 1
+                        do j = is1_viscous%beg, is1_viscous%end
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
+                                if (bubbles_euler .and. num_fluids == 1) then
+                                    alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l)
+                                else
+                                    alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
+                                end if
+                            end do
+
+                            if (bubbles_euler) then
+                                rho_visc = 0._wp
+                                gamma_visc = 0._wp
+                                pi_inf_visc = 0._wp
+
+                                if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        rho_visc = rho_visc + alpha_rho_visc(i)
+                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                    end do
+                                else if ((model_eqns == 2) .and. (num_fluids > 2)) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids - 1
+                                        rho_visc = rho_visc + alpha_rho_visc(i)
+                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                    end do
+                                else
+                                    rho_visc = alpha_rho_visc(1)
+                                    gamma_visc = gammas(1)
+                                    pi_inf_visc = pi_infs(1)
+                                end if
+                            else
+                                rho_visc = 0._wp
+                                gamma_visc = 0._wp
+                                pi_inf_visc = 0._wp
+
+                                alpha_visc_sum = 0._wp
+
+                                if (mpp_lim) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
+                                        alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
+                                        alpha_visc_sum = alpha_visc_sum + alpha_visc(i)
+                                    end do
+
+                                    alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps)
+
+                                end if
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    rho_visc = rho_visc + alpha_rho_visc(i)
+                                    gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                    pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                end do
+
+                                if (viscous) then
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_visc, j, k, l, &
+                                        Re_visc_nn, grad_x_vf, &
+                                        grad_y_vf, grad_z_vf)
+                                    call s_compute_mixture_re(alpha_visc, Re_visc_nn, Re_visc)
+                                end if
+                            end if
+
+                            tau_Re(2, 2) = (grad_x_vf(1)%sf(j, k, l) + &
+                                            grad_y_vf(2)%sf(j, k, l) + &
+                                            q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ &
+                                           Re_visc(2)
+
+                            tau_Re_vf(momxb + 1)%sf(j, k, l) = &
+                                tau_Re_vf(momxb + 1)%sf(j, k, l) - &
+                                tau_Re(2, 2)
+
+                            tau_Re_vf(E_idx)%sf(j, k, l) = &
+                                tau_Re_vf(E_idx)%sf(j, k, l) - &
+                                q_prim_vf(momxb + 1)%sf(j, k, l)*tau_Re(2, 2)
+
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+        #:endif
+
+        if (p == 0) return
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+
+            if (shear_stress) then    ! Shear stresses
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
+                do l = is3_viscous%beg, is3_viscous%end
+                    do k = -1, 1
+                        do j = is1_viscous%beg, is1_viscous%end
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
+                                if (bubbles_euler .and. num_fluids == 1) then
+                                    alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l)
+                                else
+                                    alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
+                                end if
+                            end do
+
+                            if (bubbles_euler) then
+                                rho_visc = 0._wp
+                                gamma_visc = 0._wp
+                                pi_inf_visc = 0._wp
+
+                                if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        rho_visc = rho_visc + alpha_rho_visc(i)
+                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                    end do
+                                else if ((model_eqns == 2) .and. (num_fluids > 2)) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids - 1
+                                        rho_visc = rho_visc + alpha_rho_visc(i)
+                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                    end do
+                                else
+                                    rho_visc = alpha_rho_visc(1)
+                                    gamma_visc = gammas(1)
+                                    pi_inf_visc = pi_infs(1)
+                                end if
+                            else
+                                rho_visc = 0._wp
+                                gamma_visc = 0._wp
+                                pi_inf_visc = 0._wp
+
+                                alpha_visc_sum = 0._wp
+
+                                if (mpp_lim) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
+                                        alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
+                                        alpha_visc_sum = alpha_visc_sum + alpha_visc(i)
+                                    end do
+
+                                    alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps)
+
+                                end if
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    rho_visc = rho_visc + alpha_rho_visc(i)
+                                    gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                    pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                end do
+
+                                if (viscous) then
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_visc, j, k, l, &
+                                        Re_visc_nn, grad_x_vf, &
+                                        grad_y_vf, grad_z_vf)
+                                    call s_compute_mixture_re(alpha_visc, Re_visc_nn, Re_visc)
+                                end if
+                            end if
+
+                            tau_Re(2, 2) = -(2._wp/3._wp)*grad_z_vf(3)%sf(j, k, l)/y_cc(k)/ &
+                                           Re_visc(1)
+
+                            tau_Re(2, 3) = ((grad_z_vf(2)%sf(j, k, l) - &
+                                             q_prim_vf(momxe)%sf(j, k, l))/ &
+                                            y_cc(k) + grad_y_vf(3)%sf(j, k, l))/ &
+                                           Re_visc(1)
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 2, 3
+                                tau_Re_vf(contxe + i)%sf(j, k, l) = &
+                                    tau_Re_vf(contxe + i)%sf(j, k, l) - &
+                                    tau_Re(2, i)
+
+                                tau_Re_vf(E_idx)%sf(j, k, l) = &
+                                    tau_Re_vf(E_idx)%sf(j, k, l) - &
+                                    q_prim_vf(contxe + i)%sf(j, k, l)*tau_Re(2, i)
+                            end do
+
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+            if (bulk_stress) then    ! Bulk stresses
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
+                do l = is3_viscous%beg, is3_viscous%end
+                    do k = -1, 1
+                        do j = is1_viscous%beg, is1_viscous%end
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
+                                if (bubbles_euler .and. num_fluids == 1) then
+                                    alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l)
+                                else
+                                    alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
+                                end if
+                            end do
+
+                            if (bubbles_euler) then
+                                rho_visc = 0._wp
+                                gamma_visc = 0._wp
+                                pi_inf_visc = 0._wp
+
+                                if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        rho_visc = rho_visc + alpha_rho_visc(i)
+                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                    end do
+                                else if ((model_eqns == 2) .and. (num_fluids > 2)) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids - 1
+                                        rho_visc = rho_visc + alpha_rho_visc(i)
+                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                    end do
+                                else
+                                    rho_visc = alpha_rho_visc(1)
+                                    gamma_visc = gammas(1)
+                                    pi_inf_visc = pi_infs(1)
+                                end if
+                            else
+                                rho_visc = 0._wp
+                                gamma_visc = 0._wp
+                                pi_inf_visc = 0._wp
+
+                                alpha_visc_sum = 0._wp
+
+                                if (mpp_lim) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
+                                        alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
+                                        alpha_visc_sum = alpha_visc_sum + alpha_visc(i)
+                                    end do
+
+                                    alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps)
+
+                                end if
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    rho_visc = rho_visc + alpha_rho_visc(i)
+                                    gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                    pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                end do
+
+                                if (viscous) then
+                                    call s_compute_re_visc(q_prim_vf, &
+                                        alpha_visc, j, k, l, &
+                                        Re_visc_nn, grad_x_vf, &
+                                        grad_y_vf, grad_z_vf)
+                                    call s_compute_mixture_re(alpha_visc, Re_visc_nn, Re_visc)
+                                end if
+                            end if
+
+                            tau_Re(2, 2) = grad_z_vf(3)%sf(j, k, l)/y_cc(k)/ &
+                                           Re_visc(2)
+
+                            tau_Re_vf(momxb + 1)%sf(j, k, l) = &
+                                tau_Re_vf(momxb + 1)%sf(j, k, l) - &
+                                tau_Re(2, 2)
+
+                            tau_Re_vf(E_idx)%sf(j, k, l) = &
+                                tau_Re_vf(E_idx)%sf(j, k, l) - &
+                                q_prim_vf(momxb + 1)%sf(j, k, l)*tau_Re(2, 2)
+
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+        #:endif
+    end subroutine s_compute_viscous_stress_cylindrical_boundary
+
+    !>  Computes viscous terms
+    !!  @param qL_prim_rsx_vf Left reconstructed primitive variables in x
+    !!  @param qL_prim_rsy_vf Left reconstructed primitive variables in y
+    !!  @param qL_prim_rsz_vf Left reconstructed primitive variables in z
+    !!  @param dqL_prim_dx_n Left primitive x-derivative
+    !!  @param dqL_prim_dy_n Left primitive y-derivative
+    !!  @param dqL_prim_dz_n Left primitive z-derivative
+    !!  @param qL_prim Left cell-boundary primitive variables
+    !!  @param qR_prim_rsx_vf Right reconstructed primitive variables in x
+    !!  @param qR_prim_rsy_vf Right reconstructed primitive variables in y
+    !!  @param qR_prim_rsz_vf Right reconstructed primitive variables in z
+    !!  @param dqR_prim_dx_n Right primitive x-derivative
+    !!  @param dqR_prim_dy_n Right primitive y-derivative
+    !!  @param dqR_prim_dz_n Right primitive z-derivative
+    !!  @param qR_prim Right cell-boundary primitive variables
+    !!  @param q_prim_qp Cell-averaged primitive variables
+    !!  @param dq_prim_dx_qp Cell-averaged primitive x-derivative
+    !!  @param dq_prim_dy_qp Cell-averaged primitive y-derivative
+    !!  @param dq_prim_dz_qp Cell-averaged primitive z-derivative
+    !!  @param ix Index bounds in the x-direction
+    !!  @param iy Index bounds in the y-direction
+    !!  @param iz Index bounds in the z-direction
+    subroutine s_get_viscous(qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, &
+                             dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n, &
+                             qL_prim, &
+                             qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, &
+                             dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n, &
+                             qR_prim, &
+                             q_prim_qp, &
+                             dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp, &
+                             ix, iy, iz)
+
+        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), &
+            intent(inout) :: qL_prim_rsx_vf, qR_prim_rsx_vf, &
+                             qL_prim_rsy_vf, qR_prim_rsy_vf, &
+                             qL_prim_rsz_vf, qR_prim_rsz_vf
+
+        type(vector_field), dimension(num_dims), intent(inout) :: qL_prim, qR_prim
+
+        type(vector_field), intent(in) :: q_prim_qp
+
+        type(vector_field), dimension(1:num_dims), &
+            intent(inout) :: dqL_prim_dx_n, dqR_prim_dx_n, &
+                             dqL_prim_dy_n, dqR_prim_dy_n, &
+                             dqL_prim_dz_n, dqR_prim_dz_n
+
+        type(vector_field), dimension(1), intent(inout) :: dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp
+        type(int_bounds_info), intent(in) :: ix, iy, iz
+
+        integer :: i, j, k, l
+
+        do i = 1, num_dims
+
+            iv%beg = mom_idx%beg; iv%end = mom_idx%end
+
+            $:GPU_UPDATE(device='[iv]')
+
+            call s_reconstruct_cell_boundary_values_visc( &
+                q_prim_qp%vf(iv%beg:iv%end), &
+                qL_prim_rsx_vf, qL_prim_rsy_vf, qL_prim_rsz_vf, &
+                qR_prim_rsx_vf, qR_prim_rsy_vf, qR_prim_rsz_vf, &
+                i, qL_prim(i)%vf(iv%beg:iv%end), qR_prim(i)%vf(iv%beg:iv%end), &
+                ix, iy, iz)
+        end do
+
+        if (weno_Re_flux) then
+            ! Compute velocity gradient at cell centers using scalar
+            ! divergence theorem
+            do i = 1, num_dims
+                if (i == 1) then
+                    call s_apply_scalar_divergence_theorem( &
+                        qL_prim(i)%vf(iv%beg:iv%end), &
+                        qR_prim(i)%vf(iv%beg:iv%end), &
+                        dq_prim_dx_qp(1)%vf(iv%beg:iv%end), i, &
+                        ix, iy, iz, iv, dx, m, buff_size)
+                elseif (i == 2) then
+                    call s_apply_scalar_divergence_theorem( &
+                        qL_prim(i)%vf(iv%beg:iv%end), &
+                        qR_prim(i)%vf(iv%beg:iv%end), &
+                        dq_prim_dy_qp(1)%vf(iv%beg:iv%end), i, &
+                        ix, iy, iz, iv, dy, n, buff_size)
+                else
+                    call s_apply_scalar_divergence_theorem( &
+                        qL_prim(i)%vf(iv%beg:iv%end), &
+                        qR_prim(i)%vf(iv%beg:iv%end), &
+                        dq_prim_dz_qp(1)%vf(iv%beg:iv%end), i, &
+                        ix, iy, iz, iv, dz, p, buff_size)
+                end if
+            end do
+
+        else ! Compute velocity gradient at cell centers using finite differences
+
+            iv%beg = mom_idx%beg; iv%end = mom_idx%end
+            $:GPU_UPDATE(device='[iv]')
+
+            is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
+
+            $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]')
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do l = is3_viscous%beg, is3_viscous%end
+                do k = iy%beg, iy%end
+                    do j = is1_viscous%beg + 1, is1_viscous%end
+                        $:GPU_LOOP(parallelism='[seq]')
+                        do i = iv%beg, iv%end
+                            dqL_prim_dx_n(1)%vf(i)%sf(j, k, l) = &
+                                (q_prim_qp%vf(i)%sf(j, k, l) - &
+                                 q_prim_qp%vf(i)%sf(j - 1, k, l))/ &
+                                (x_cc(j) - x_cc(j - 1))
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do l = is3_viscous%beg, is3_viscous%end
+                do k = is2_viscous%beg, is2_viscous%end
+                    do j = is1_viscous%beg, is1_viscous%end - 1
+                        $:GPU_LOOP(parallelism='[seq]')
+                        do i = iv%beg, iv%end
+                            dqR_prim_dx_n(1)%vf(i)%sf(j, k, l) = &
+                                (q_prim_qp%vf(i)%sf(j + 1, k, l) - &
+                                 q_prim_qp%vf(i)%sf(j, k, l))/ &
+                                (x_cc(j + 1) - x_cc(j))
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            if (n > 0) then
+
+                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do l = is3_viscous%beg, is3_viscous%end
+                        do j = is2_viscous%beg + 1, is2_viscous%end
+                            do k = is1_viscous%beg, is1_viscous%end
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = iv%beg, iv%end
+                                    dqL_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
+                                        (q_prim_qp%vf(i)%sf(k, j, l) - &
+                                         q_prim_qp%vf(i)%sf(k, j - 1, l))/ &
+                                        (y_cc(j) - y_cc(j - 1))
+                                end do
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do l = is3_viscous%beg, is3_viscous%end
+                        do j = is2_viscous%beg, is2_viscous%end - 1
+                            do k = is1_viscous%beg, is1_viscous%end
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = iv%beg, iv%end
+                                    dqR_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
+                                        (q_prim_qp%vf(i)%sf(k, j + 1, l) - &
+                                         q_prim_qp%vf(i)%sf(k, j, l))/ &
+                                        (y_cc(j + 1) - y_cc(j))
+                                end do
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do l = is3_viscous%beg, is3_viscous%end
+                        do j = is2_viscous%beg + 1, is2_viscous%end
+                            do k = is1_viscous%beg + 1, is1_viscous%end - 1
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = iv%beg, iv%end
+                                    dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
+                                        (dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
+                                         dqR_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
+                                         dqL_prim_dx_n(1)%vf(i)%sf(k, j - 1, l) + &
+                                         dqR_prim_dx_n(1)%vf(i)%sf(k, j - 1, l))
+
+                                    dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* &
+                                                                         dqL_prim_dx_n(2)%vf(i)%sf(k, j, l)
+                                end do
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do l = is3_viscous%beg, is3_viscous%end
+                        do j = is2_viscous%beg, is2_viscous%end - 1
+                            do k = is1_viscous%beg + 1, is1_viscous%end - 1
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = iv%beg, iv%end
+                                    dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
+                                        (dqL_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + &
+                                         dqR_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + &
+                                         dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
+                                         dqR_prim_dx_n(1)%vf(i)%sf(k, j, l))
+
+                                    dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* &
+                                                                         dqR_prim_dx_n(2)%vf(i)%sf(k, j, l)
+
+                                end do
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do l = is3_viscous%beg, is3_viscous%end
+                        do k = is2_viscous%beg + 1, is2_viscous%end - 1
+                            do j = is1_viscous%beg + 1, is1_viscous%end
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = iv%beg, iv%end
+                                    dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
+                                        (dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
+                                         dqR_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
+                                         dqL_prim_dy_n(2)%vf(i)%sf(j - 1, k, l) + &
+                                         dqR_prim_dy_n(2)%vf(i)%sf(j - 1, k, l))
+
+                                    dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* &
+                                                                         dqL_prim_dy_n(1)%vf(i)%sf(j, k, l)
+
+                                end do
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do l = is3_viscous%beg, is3_viscous%end
+                        do k = is2_viscous%beg + 1, is2_viscous%end - 1
+                            do j = is1_viscous%beg, is1_viscous%end - 1
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = iv%beg, iv%end
+                                    dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
+                                        (dqL_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + &
+                                         dqR_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + &
+                                         dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
+                                         dqR_prim_dy_n(2)%vf(i)%sf(j, k, l))
+
+                                    dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* &
+                                                                         dqR_prim_dy_n(1)%vf(i)%sf(j, k, l)
+
+                                end do
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+
+                #:endif
+
+                if (p > 0) then
+                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do j = is3_viscous%beg + 1, is3_viscous%end
+                            do l = is2_viscous%beg, is2_viscous%end
+                                do k = is1_viscous%beg, is1_viscous%end
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = iv%beg, iv%end
+
+                                        dqL_prim_dz_n(3)%vf(i)%sf(k, l, j) = &
+                                            (q_prim_qp%vf(i)%sf(k, l, j) - &
+                                             q_prim_qp%vf(i)%sf(k, l, j - 1))/ &
+                                            (z_cc(j) - z_cc(j - 1))
+                                    end do
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do j = is3_viscous%beg, is3_viscous%end - 1
+                            do l = is2_viscous%beg, is2_viscous%end
+                                do k = is1_viscous%beg, is1_viscous%end
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = iv%beg, iv%end
+
+                                        dqR_prim_dz_n(3)%vf(i)%sf(k, l, j) = &
+                                            (q_prim_qp%vf(i)%sf(k, l, j + 1) - &
+                                             q_prim_qp%vf(i)%sf(k, l, j))/ &
+                                            (z_cc(j + 1) - z_cc(j))
+                                    end do
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do l = is3_viscous%beg + 1, is3_viscous%end - 1
+                            do k = is2_viscous%beg, is2_viscous%end
+                                do j = is1_viscous%beg + 1, is1_viscous%end
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = iv%beg, iv%end
+
+                                        dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) = &
+                                            (dqL_prim_dz_n(3)%vf(i)%sf(j, k, l) + &
+                                             dqR_prim_dz_n(3)%vf(i)%sf(j, k, l) + &
+                                             dqL_prim_dz_n(3)%vf(i)%sf(j - 1, k, l) + &
+                                             dqR_prim_dz_n(3)%vf(i)%sf(j - 1, k, l))
+
+                                        dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* &
+                                                                             dqL_prim_dz_n(1)%vf(i)%sf(j, k, l)
+
+                                    end do
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do l = is3_viscous%beg + 1, is3_viscous%end - 1
+                            do k = is2_viscous%beg, is2_viscous%end
+                                do j = is1_viscous%beg, is1_viscous%end - 1
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = iv%beg, iv%end
+
+                                        dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) = &
+                                            (dqL_prim_dz_n(3)%vf(i)%sf(j + 1, k, l) + &
+                                             dqR_prim_dz_n(3)%vf(i)%sf(j + 1, k, l) + &
+                                             dqL_prim_dz_n(3)%vf(i)%sf(j, k, l) + &
+                                             dqR_prim_dz_n(3)%vf(i)%sf(j, k, l))
+
+                                        dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* &
+                                                                             dqR_prim_dz_n(1)%vf(i)%sf(j, k, l)
+
+                                    end do
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do l = is3_viscous%beg + 1, is3_viscous%end - 1
+                            do j = is2_viscous%beg + 1, is2_viscous%end
+                                do k = is1_viscous%beg, is1_viscous%end
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = iv%beg, iv%end
+
+                                        dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) = &
+                                            (dqL_prim_dz_n(3)%vf(i)%sf(k, j, l) + &
+                                             dqR_prim_dz_n(3)%vf(i)%sf(k, j, l) + &
+                                             dqL_prim_dz_n(3)%vf(i)%sf(k, j - 1, l) + &
+                                             dqR_prim_dz_n(3)%vf(i)%sf(k, j - 1, l))
+
+                                        dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* &
+                                                                             dqL_prim_dz_n(2)%vf(i)%sf(k, j, l)
+
+                                    end do
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do l = is3_viscous%beg + 1, is3_viscous%end - 1
+                            do j = is2_viscous%beg, is2_viscous%end - 1
+                                do k = is1_viscous%beg, is1_viscous%end
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = iv%beg, iv%end
+
+                                        dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) = &
+                                            (dqL_prim_dz_n(3)%vf(i)%sf(k, j + 1, l) + &
+                                             dqR_prim_dz_n(3)%vf(i)%sf(k, j + 1, l) + &
+                                             dqL_prim_dz_n(3)%vf(i)%sf(k, j, l) + &
+                                             dqR_prim_dz_n(3)%vf(i)%sf(k, j, l))
+
+                                        dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* &
+                                                                             dqR_prim_dz_n(2)%vf(i)%sf(k, j, l)
+
+                                    end do
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do j = is3_viscous%beg + 1, is3_viscous%end
+                            do l = is2_viscous%beg + 1, is2_viscous%end - 1
+                                do k = is1_viscous%beg, is1_viscous%end
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = iv%beg, iv%end
+
+                                        dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) = &
+                                            (dqL_prim_dy_n(2)%vf(i)%sf(k, l, j) + &
+                                             dqR_prim_dy_n(2)%vf(i)%sf(k, l, j) + &
+                                             dqL_prim_dy_n(2)%vf(i)%sf(k, l, j - 1) + &
+                                             dqR_prim_dy_n(2)%vf(i)%sf(k, l, j - 1))
+
+                                        dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* &
+                                                                             dqL_prim_dy_n(3)%vf(i)%sf(k, l, j)
+
+                                    end do
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do j = is3_viscous%beg, is3_viscous%end - 1
+                            do l = is2_viscous%beg + 1, is2_viscous%end - 1
+                                do k = is1_viscous%beg, is1_viscous%end
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = iv%beg, iv%end
+
+                                        dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) = &
+                                            (dqL_prim_dy_n(2)%vf(i)%sf(k, l, j + 1) + &
+                                             dqR_prim_dy_n(2)%vf(i)%sf(k, l, j + 1) + &
+                                             dqL_prim_dy_n(2)%vf(i)%sf(k, l, j) + &
+                                             dqR_prim_dy_n(2)%vf(i)%sf(k, l, j))
+
+                                        dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* &
+                                                                             dqR_prim_dy_n(3)%vf(i)%sf(k, l, j)
+
+                                    end do
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do j = is3_viscous%beg + 1, is3_viscous%end
+                            do l = is2_viscous%beg, is2_viscous%end
+                                do k = is1_viscous%beg + 1, is1_viscous%end - 1
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = iv%beg, iv%end
+
+                                        dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) = &
+                                            (dqL_prim_dx_n(1)%vf(i)%sf(k, l, j) + &
+                                             dqR_prim_dx_n(1)%vf(i)%sf(k, l, j) + &
+                                             dqL_prim_dx_n(1)%vf(i)%sf(k, l, j - 1) + &
+                                             dqR_prim_dx_n(1)%vf(i)%sf(k, l, j - 1))
+
+                                        dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* &
+                                                                             dqL_prim_dx_n(3)%vf(i)%sf(k, l, j)
+
+                                    end do
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+                        $:GPU_PARALLEL_LOOP(collapse=3)
+                        do j = is3_viscous%beg, is3_viscous%end - 1
+                            do l = is2_viscous%beg, is2_viscous%end
+                                do k = is1_viscous%beg + 1, is1_viscous%end - 1
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = iv%beg, iv%end
+                                        dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = &
+                                            (dqL_prim_dx_n(1)%vf(i)%sf(k, l, j + 1) + &
+                                             dqR_prim_dx_n(1)%vf(i)%sf(k, l, j + 1) + &
+                                             dqL_prim_dx_n(1)%vf(i)%sf(k, l, j) + &
+                                             dqR_prim_dx_n(1)%vf(i)%sf(k, l, j))
+
+                                        dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* &
+                                                                             dqR_prim_dx_n(3)%vf(i)%sf(k, l, j)
+
+                                    end do
+                                end do
+                            end do
+                        end do
+                        $:END_GPU_PARALLEL_LOOP()
+
+                        do i = iv%beg, iv%end
+                            call s_compute_fd_gradient(q_prim_qp%vf(i), &
+                                                       dq_prim_dx_qp(1)%vf(i), &
+                                                       dq_prim_dy_qp(1)%vf(i), &
+                                                       dq_prim_dz_qp(1)%vf(i))
+                        end do
+                    #:endif
+
+                else
+
+                    do i = iv%beg, iv%end
+                        call s_compute_fd_gradient(q_prim_qp%vf(i), &
+                                                   dq_prim_dx_qp(1)%vf(i), &
+                                                   dq_prim_dy_qp(1)%vf(i), &
+                                                   dq_prim_dy_qp(1)%vf(i))
+                    end do
+
+                end if
+
+            else
+
+                do i = iv%beg, iv%end
+                    call s_compute_fd_gradient(q_prim_qp%vf(i), &
+                                               dq_prim_dx_qp(1)%vf(i), &
+                                               dq_prim_dx_qp(1)%vf(i), &
+                                               dq_prim_dx_qp(1)%vf(i))
+                end do
+
+            end if
+
+        end if
+
+    end subroutine s_get_viscous
+
+    !> @brief Reconstructs left and right cell-boundary values of viscous primitive variables using WENO or MUSCL.
+    subroutine s_reconstruct_cell_boundary_values_visc(v_vf, vL_x, vL_y, vL_z, vR_x, vR_y, vR_z, &
+                                                       norm_dir, vL_prim_vf, vR_prim_vf, ix, iy, iz)
+
+        type(scalar_field), dimension(iv%beg:iv%end), intent(in) :: v_vf
+        type(scalar_field), dimension(iv%beg:iv%end), intent(inout) :: vL_prim_vf, vR_prim_vf
+
+        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: vL_x, vL_y, vL_z, vR_x, vR_y, vR_z
+        integer, intent(in) :: norm_dir
+        type(int_bounds_info), intent(in) :: ix, iy, iz
+
+        integer :: recon_dir !< Coordinate direction of the WENO reconstruction
+
+        integer :: i, j, k, l
+
+        #:for SCHEME, TYPE in [('weno','WENO_TYPE'), ('muscl','MUSCL_TYPE')]
+            if (recon_type == ${TYPE}$ .or. dummy) then
+                ! Reconstruction in s1-direction
+
+                if (norm_dir == 1) then
+                    is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
+                    recon_dir = 1; is1_viscous%beg = is1_viscous%beg + ${SCHEME}$_polyn
+                    is1_viscous%end = is1_viscous%end - ${SCHEME}$_polyn
+
+                elseif (norm_dir == 2) then
+                    is1_viscous = iy; is2_viscous = ix; is3_viscous = iz
+                    recon_dir = 2; is1_viscous%beg = is1_viscous%beg + ${SCHEME}$_polyn
+                    is1_viscous%end = is1_viscous%end - ${SCHEME}$_polyn
+
+                else
+                    is1_viscous = iz; is2_viscous = iy; is3_viscous = ix
+                    recon_dir = 3; is1_viscous%beg = is1_viscous%beg + ${SCHEME}$_polyn
+                    is1_viscous%end = is1_viscous%end - ${SCHEME}$_polyn
+
+                end if
+
+                $:GPU_UPDATE(device='[is1_viscous, is2_viscous, is3_viscous, iv]')
+                if (n > 0) then
+                    if (p > 0) then
+                        call s_${SCHEME}$ (v_vf(iv%beg:iv%end), &
+                                           vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, iv%beg:iv%end), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, iv%beg:iv%end), &
+                                           recon_dir, &
+                                           is1_viscous, is2_viscous, is3_viscous)
+                    else
+                        call s_${SCHEME}$ (v_vf(iv%beg:iv%end), &
+                                           vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, :), &
+                                           recon_dir, &
+                                           is1_viscous, is2_viscous, is3_viscous)
+                    end if
+                else
+                    call s_${SCHEME}$ (v_vf(iv%beg:iv%end), &
+                                       vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, :), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, :), vR_z(:, :, :, :), &
+                                       recon_dir, &
+                                       is1_viscous, is2_viscous, is3_viscous)
+                end if
+            end if
+        #:endfor
+
+        if (viscous .or. dummy) then
+            if (weno_Re_flux) then
+                if (norm_dir == 2) then
+                    $:GPU_PARALLEL_LOOP(collapse=4)
+                    do i = iv%beg, iv%end
+                        do l = is3_viscous%beg, is3_viscous%end
+                            do j = is1_viscous%beg, is1_viscous%end
+                                do k = is2_viscous%beg, is2_viscous%end
+                                    vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i)
+                                    vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i)
+                                end do
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                elseif (norm_dir == 3) then
+                    $:GPU_PARALLEL_LOOP(collapse=4)
+                    do i = iv%beg, iv%end
+                        do j = is1_viscous%beg, is1_viscous%end
+                            do k = is2_viscous%beg, is2_viscous%end
+                                do l = is3_viscous%beg, is3_viscous%end
+                                    vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i)
+                                    vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i)
+                                end do
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                elseif (norm_dir == 1) then
+                    $:GPU_PARALLEL_LOOP(collapse=4)
+                    do i = iv%beg, iv%end
+                        do l = is3_viscous%beg, is3_viscous%end
+                            do k = is2_viscous%beg, is2_viscous%end
+                                do j = is1_viscous%beg, is1_viscous%end
+                                    vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i)
+                                    vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i)
+                                end do
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                end if
+            end if
+        end if
+
+    end subroutine s_reconstruct_cell_boundary_values_visc
+
+    !> @brief Reconstructs left and right cell-boundary values of viscous primitive variable derivatives using WENO or MUSCL.
+    subroutine s_reconstruct_cell_boundary_values_visc_deriv(v_vf, vL_x, vL_y, vL_z, vR_x, vR_y, vR_z, &
+                                                             norm_dir, vL_prim_vf, vR_prim_vf, ix, iy, iz)
+        type(scalar_field), dimension(iv%beg:iv%end), intent(in) :: v_vf
+        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, iv%beg:), intent(inout) :: vL_x, vL_y, vL_z, vR_x, vR_y, vR_z
+        type(scalar_field), dimension(iv%beg:iv%end), intent(inout) :: vL_prim_vf, vR_prim_vf
+        type(int_bounds_info), intent(in) :: ix, iy, iz
+
+        integer, intent(IN) :: norm_dir
+
+        integer :: recon_dir !< Coordinate direction of the WENO reconstruction
+
+        integer :: i, j, k, l
+        #:for SCHEME, TYPE in [('weno','WENO_TYPE'), ('muscl','MUSCL_TYPE')]
+            if (recon_type == ${TYPE}$) then
+                ! Reconstruction in s1-direction
+
+                if (norm_dir == 1) then
+                    is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
+                    recon_dir = 1; is1_viscous%beg = is1_viscous%beg + ${SCHEME}$_polyn
+                    is1_viscous%end = is1_viscous%end - ${SCHEME}$_polyn
+
+                elseif (norm_dir == 2) then
+                    is1_viscous = iy; is2_viscous = ix; is3_viscous = iz
+                    recon_dir = 2; is1_viscous%beg = is1_viscous%beg + ${SCHEME}$_polyn
+                    is1_viscous%end = is1_viscous%end - ${SCHEME}$_polyn
+
+                else
+                    is1_viscous = iz; is2_viscous = iy; is3_viscous = ix
+                    recon_dir = 3; is1_viscous%beg = is1_viscous%beg + ${SCHEME}$_polyn
+                    is1_viscous%end = is1_viscous%end - ${SCHEME}$_polyn
+
+                end if
+                $:GPU_UPDATE(device='[is1_viscous, is2_viscous, is3_viscous, iv]')
+                if (n > 0) then
+                    if (p > 0) then
+
+                        call s_${SCHEME}$ (v_vf(iv%beg:iv%end), &
+                                           vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, iv%beg:iv%end), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, iv%beg:iv%end), &
+                                           recon_dir, &
+                                           is1_viscous, is2_viscous, is3_viscous)
+                    else
+                        call s_${SCHEME}$ (v_vf(iv%beg:iv%end), &
+                                           vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, iv%beg:iv%end), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, iv%beg:iv%end), vR_z(:, :, :, :), &
+                                           recon_dir, &
+                                           is1_viscous, is2_viscous, is3_viscous)
+                    end if
+                else
+
+                    call s_${SCHEME}$ (v_vf(iv%beg:iv%end), &
+                                       vL_x(:, :, :, iv%beg:iv%end), vL_y(:, :, :, :), vL_z(:, :, :, :), vR_x(:, :, :, iv%beg:iv%end), vR_y(:, :, :, :), vR_z(:, :, :, :), &
+                                       recon_dir, &
+                                       is1_viscous, is2_viscous, is3_viscous)
+                end if
+            end if
+        #:endfor
+
+        if (viscous .or. dummy) then
+            if (weno_Re_flux) then
+                if (norm_dir == 2) then
+                    $:GPU_PARALLEL_LOOP(collapse=4)
+                    do i = iv%beg, iv%end
+                        do l = is3_viscous%beg, is3_viscous%end
+                            do j = is1_viscous%beg, is1_viscous%end
+                                do k = is2_viscous%beg, is2_viscous%end
+                                    vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i)
+                                    vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i)
+                                end do
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                elseif (norm_dir == 3) then
+                    $:GPU_PARALLEL_LOOP(collapse=4)
+                    do i = iv%beg, iv%end
+                        do j = is1_viscous%beg, is1_viscous%end
+                            do k = is2_viscous%beg, is2_viscous%end
+                                do l = is3_viscous%beg, is3_viscous%end
+                                    vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i)
+                                    vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i)
+                                end do
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                elseif (norm_dir == 1) then
+                    $:GPU_PARALLEL_LOOP(collapse=4)
+                    do i = iv%beg, iv%end
+                        do l = is3_viscous%beg, is3_viscous%end
+                            do k = is2_viscous%beg, is2_viscous%end
+                                do j = is1_viscous%beg, is1_viscous%end
+                                    vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i)
+                                    vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i)
+                                end do
+                            end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                end if
+            end if
+        end if
+
+    end subroutine s_reconstruct_cell_boundary_values_visc_deriv
+
+    !>  The purpose of this subroutine is to employ the inputted
+        !!      left and right cell-boundary integral-averaged variables
+        !!      to compute the relevant cell-average first-order spatial
+        !!      derivatives in the x-, y- or z-direction by means of the
+        !!      scalar divergence theorem.
+        !!  @param vL_vf Left cell-boundary integral averages
+        !!  @param vR_vf Right cell-boundary integral averages
+        !!  @param dv_ds_vf Cell-average first-order spatial derivatives
+        !!  @param norm_dir Splitting coordinate direction
+        !!  @param ix Index bounds in the x-direction
+        !!  @param iy Index bounds in the y-direction
+        !!  @param iz Index bounds in the z-direction
+        !!  @param iv_in Variable index bounds
+        !!  @param dL Cell width array
+        !!  @param dim Dimension size
+        !!  @param buff_size_in Buffer layer size
+    subroutine s_apply_scalar_divergence_theorem(vL_vf, vR_vf, &
+                                                 dv_ds_vf, &
+                                                 norm_dir, &
+                                                 ix, iy, iz, iv_in, &
+                                                 dL, dim, buff_size_in)
+
+        ! arrays of cell widths
+        type(scalar_field), &
+            dimension(iv%beg:iv%end), &
+            intent(in) :: vL_vf, vR_vf
+
+        type(scalar_field), &
+            dimension(iv%beg:iv%end), &
+            intent(inout) :: dv_ds_vf
+
+        integer, intent(in) :: norm_dir
+        type(int_bounds_info), intent(in) :: ix, iy, iz, iv_in
+        integer, intent(in) :: dim, buff_size_in
+        real(wp), dimension(-buff_size_in:dim + buff_size_in), intent(in) :: dL
+
+        integer :: i, j, k, l !< Generic loop iterators
+
+        is1_viscous = ix
+        is2_viscous = iy
+        is3_viscous = iz
+        iv = iv_in
+
+        $:GPU_UPDATE(device='[is1_viscous, is2_viscous, is3_viscous, iv]')
+
+        ! First-Order Spatial Derivatives in x-direction
+        if (norm_dir == 1) then
+
+            ! A general application of the scalar divergence theorem that
+            ! utilizes the left and right cell-boundary integral-averages,
+            ! inside each cell, or an arithmetic mean of these two at the
+            ! cell-boundaries, to calculate the cell-averaged first-order
+            ! spatial derivatives inside the cell.
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do l = is3_viscous%beg, is3_viscous%end
+                do k = is2_viscous%beg, is2_viscous%end
+                    do j = is1_viscous%beg + 1, is1_viscous%end - 1
+                        $:GPU_LOOP(parallelism='[seq]')
+                        do i = iv%beg, iv%end
+                            dv_ds_vf(i)%sf(j, k, l) = &
+                                1._wp/((1._wp + wa_flg)*dL(j)) &
+                                *(wa_flg*vL_vf(i)%sf(j + 1, k, l) &
+                                  + vR_vf(i)%sf(j, k, l) &
+                                  - vL_vf(i)%sf(j, k, l) &
+                                  - wa_flg*vR_vf(i)%sf(j - 1, k, l))
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            ! END: First-Order Spatial Derivatives in x-direction
+
+            ! First-Order Spatial Derivatives in y-direction
+        elseif (norm_dir == 2) then
+
+            ! A general application of the scalar divergence theorem that
+            ! utilizes the left and right cell-boundary integral-averages,
+            ! inside each cell, or an arithmetic mean of these two at the
+            ! cell-boundaries, to calculate the cell-averaged first-order
+            ! spatial derivatives inside the cell.
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do l = is3_viscous%beg, is3_viscous%end
+                do k = is2_viscous%beg + 1, is2_viscous%end - 1
+                    do j = is1_viscous%beg, is1_viscous%end
+                        $:GPU_LOOP(parallelism='[seq]')
+                        do i = iv%beg, iv%end
+                            dv_ds_vf(i)%sf(j, k, l) = &
+                                1._wp/((1._wp + wa_flg)*dL(k)) &
+                                *(wa_flg*vL_vf(i)%sf(j, k + 1, l) &
+                                  + vR_vf(i)%sf(j, k, l) &
+                                  - vL_vf(i)%sf(j, k, l) &
+                                  - wa_flg*vR_vf(i)%sf(j, k - 1, l))
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            ! END: First-Order Spatial Derivatives in y-direction
+
+            ! First-Order Spatial Derivatives in z-direction
+        else
+
+            ! A general application of the scalar divergence theorem that
+            ! utilizes the left and right cell-boundary integral-averages,
+            ! inside each cell, or an arithmetic mean of these two at the
+            ! cell-boundaries, to calculate the cell-averaged first-order
+            ! spatial derivatives inside the cell.
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do l = is3_viscous%beg + 1, is3_viscous%end - 1
+                do k = is2_viscous%beg, is2_viscous%end
+                    do j = is1_viscous%beg, is1_viscous%end
+                        $:GPU_LOOP(parallelism='[seq]')
+                        do i = iv%beg, iv%end
+                            dv_ds_vf(i)%sf(j, k, l) = &
+                                1._wp/((1._wp + wa_flg)*dL(l)) &
+                                *(wa_flg*vL_vf(i)%sf(j, k, l + 1) &
+                                  + vR_vf(i)%sf(j, k, l) &
+                                  - vL_vf(i)%sf(j, k, l) &
+                                  - wa_flg*vR_vf(i)%sf(j, k, l - 1))
+                        end do
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+        end if
+        ! END: First-Order Spatial Derivatives in z-direction
+
+    end subroutine s_apply_scalar_divergence_theorem
+
+    !>  Computes the scalar gradient fields via finite differences
+        !!  @param var Variable to compute derivative of
+        !!  @param grad_x First coordinate direction component of the derivative
+        !!  @param grad_y Second coordinate direction component of the derivative
+        !!  @param grad_z Third coordinate direction component of the derivative
+    subroutine s_compute_fd_gradient(var, grad_x, grad_y, grad_z)
+
+        type(scalar_field), intent(in) :: var
+        type(scalar_field), intent(inout) :: grad_x
+        type(scalar_field), intent(inout) :: grad_y
+        type(scalar_field), intent(inout) :: grad_z
+        type(int_bounds_info) :: ix, iy, iz
+
+        integer :: j, k, l !< Generic loop iterators
+
+        ix%beg = 1 - buff_size; ix%end = m + buff_size - 1
+        if (n > 0) then
+            iy%beg = 1 - buff_size; iy%end = n + buff_size - 1
+        else
+            iy%beg = 0; iy%end = 0
+        end if
+
+        if (p > 0) then
+            iz%beg = 1 - buff_size; iz%end = p + buff_size - 1
+        else
+            iz%beg = 0; iz%end = 0
+        end if
+
+        is1_viscous = ix; is2_viscous = iy; is3_viscous = iz
+
+        $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]')
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do l = is3_viscous%beg, is3_viscous%end
+            do k = is2_viscous%beg, is2_viscous%end
+                do j = is1_viscous%beg, is1_viscous%end
+                    grad_x%sf(j, k, l) = &
+                        (var%sf(j + 1, k, l) - var%sf(j - 1, k, l))/ &
+                        (x_cc(j + 1) - x_cc(j - 1))
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        if (n > 0) then
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do l = is3_viscous%beg, is3_viscous%end
+                do k = is2_viscous%beg, is2_viscous%end
+                    do j = is1_viscous%beg, is1_viscous%end
+                        grad_y%sf(j, k, l) = &
+                            (var%sf(j, k + 1, l) - var%sf(j, k - 1, l))/ &
+                            (y_cc(k + 1) - y_cc(k - 1))
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        end if
+
+        if (p > 0) then
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do l = is3_viscous%beg, is3_viscous%end
+                do k = is2_viscous%beg, is2_viscous%end
+                    do j = is1_viscous%beg, is1_viscous%end
+                        grad_z%sf(j, k, l) = &
+                            (var%sf(j, k, l + 1) - var%sf(j, k, l - 1))/ &
+                            (z_cc(l + 1) - z_cc(l - 1))
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        end if
+
+        $:GPU_PARALLEL_LOOP(collapse=2)
+        do l = idwbuff(3)%beg, idwbuff(3)%end
+            do k = idwbuff(2)%beg, idwbuff(2)%end
+                grad_x%sf(idwbuff(1)%beg, k, l) = &
+                    (-3._wp*var%sf(idwbuff(1)%beg, k, l) + 4._wp*var%sf(idwbuff(1)%beg + 1, k, l) - var%sf(idwbuff(1)%beg + 2, k, l))/ &
+                    (x_cc(idwbuff(1)%beg + 2) - x_cc(idwbuff(1)%beg))
+                grad_x%sf(idwbuff(1)%end, k, l) = &
+                    (+3._wp*var%sf(idwbuff(1)%end, k, l) - 4._wp*var%sf(idwbuff(1)%end - 1, k, l) + var%sf(idwbuff(1)%end - 2, k, l))/ &
+                    (x_cc(idwbuff(1)%end) - x_cc(idwbuff(1)%end - 2))
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+        if (n > 0) then
+            $:GPU_PARALLEL_LOOP(collapse=2)
+            do l = idwbuff(3)%beg, idwbuff(3)%end
+                do j = idwbuff(1)%beg, idwbuff(1)%end
+                    grad_y%sf(j, idwbuff(2)%beg, l) = &
+                        (-3._wp*var%sf(j, idwbuff(2)%beg, l) + 4._wp*var%sf(j, idwbuff(2)%beg + 1, l) - var%sf(j, idwbuff(2)%beg + 2, l))/ &
+                        (y_cc(idwbuff(2)%beg + 2) - y_cc(idwbuff(2)%beg))
+                    grad_y%sf(j, idwbuff(2)%end, l) = &
+                        (+3._wp*var%sf(j, idwbuff(2)%end, l) - 4._wp*var%sf(j, idwbuff(2)%end - 1, l) + var%sf(j, idwbuff(2)%end - 2, l))/ &
+                        (y_cc(idwbuff(2)%end) - y_cc(idwbuff(2)%end - 2))
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+            if (p > 0) then
+                $:GPU_PARALLEL_LOOP(collapse=2)
+                do k = idwbuff(2)%beg, idwbuff(2)%end
+                    do j = idwbuff(1)%beg, idwbuff(1)%end
+                        grad_z%sf(j, k, idwbuff(3)%beg) = &
+                            (-3._wp*var%sf(j, k, idwbuff(3)%beg) + 4._wp*var%sf(j, k, idwbuff(3)%beg + 1) - var%sf(j, k, idwbuff(3)%beg + 2))/ &
+                            (z_cc(idwbuff(3)%beg + 2) - z_cc(is3_viscous%beg))
+                        grad_z%sf(j, k, idwbuff(3)%end) = &
+                            (+3._wp*var%sf(j, k, idwbuff(3)%end) - 4._wp*var%sf(j, k, idwbuff(3)%end - 1) + var%sf(j, k, idwbuff(3)%end - 2))/ &
+                            (z_cc(idwbuff(3)%end) - z_cc(idwbuff(3)%end - 2))
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+        end if
+
+        if (bc_x%beg <= BC_GHOST_EXTRAP) then
+            $:GPU_PARALLEL_LOOP(collapse=2)
+            do l = idwbuff(3)%beg, idwbuff(3)%end
+                do k = idwbuff(2)%beg, idwbuff(2)%end
+                    grad_x%sf(0, k, l) = (-3._wp*var%sf(0, k, l) + 4._wp*var%sf(1, k, l) - var%sf(2, k, l))/ &
+                                         (x_cc(2) - x_cc(0))
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        end if
+        if (bc_x%end <= BC_GHOST_EXTRAP) then
+            $:GPU_PARALLEL_LOOP(collapse=2)
+            do l = idwbuff(3)%beg, idwbuff(3)%end
+                do k = idwbuff(2)%beg, idwbuff(2)%end
+                    grad_x%sf(m, k, l) = (3._wp*var%sf(m, k, l) - 4._wp*var%sf(m - 1, k, l) + var%sf(m - 2, k, l))/ &
+                                         (x_cc(m) - x_cc(m - 2))
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        end if
+        if (n > 0) then
+            if (bc_y%beg <= BC_GHOST_EXTRAP .and. bc_y%beg /= BC_NULL) then
+                $:GPU_PARALLEL_LOOP(collapse=2)
+                do l = idwbuff(3)%beg, idwbuff(3)%end
+                    do j = idwbuff(1)%beg, idwbuff(1)%end
+                        grad_y%sf(j, 0, l) = (-3._wp*var%sf(j, 0, l) + 4._wp*var%sf(j, 1, l) - var%sf(j, 2, l))/ &
+                                             (y_cc(2) - y_cc(0))
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+            if (bc_y%end <= BC_GHOST_EXTRAP) then
+                $:GPU_PARALLEL_LOOP(collapse=2)
+                do l = idwbuff(3)%beg, idwbuff(3)%end
+                    do j = idwbuff(1)%beg, idwbuff(1)%end
+                        grad_y%sf(j, n, l) = (3._wp*var%sf(j, n, l) - 4._wp*var%sf(j, n - 1, l) + var%sf(j, n - 2, l))/ &
+                                             (y_cc(n) - y_cc(n - 2))
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+            if (p > 0) then
+                if (bc_z%beg <= BC_GHOST_EXTRAP) then
+                    $:GPU_PARALLEL_LOOP(collapse=2)
+                    do k = idwbuff(2)%beg, idwbuff(2)%end
+                        do j = idwbuff(1)%beg, idwbuff(1)%end
+                            grad_z%sf(j, k, 0) = &
+                                (-3._wp*var%sf(j, k, 0) + 4._wp*var%sf(j, k, 1) - var%sf(j, k, 2))/ &
+                                (z_cc(2) - z_cc(0))
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                end if
+                if (bc_z%end <= BC_GHOST_EXTRAP) then
+                    $:GPU_PARALLEL_LOOP(collapse=2)
+                    do k = idwbuff(2)%beg, idwbuff(2)%end
+                        do j = idwbuff(1)%beg, idwbuff(1)%end
+                            grad_z%sf(j, k, p) = &
+                                (3._wp*var%sf(j, k, p) - 4._wp*var%sf(j, k, p - 1) + var%sf(j, k, p - 2))/ &
+                                (z_cc(p) - z_cc(p - 2))
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                end if
+            end if
+        end if
+
+    end subroutine s_compute_fd_gradient
+
+    !> @brief Computes the viscous stress tensor at a single grid cell using finite-difference velocity gradients.
+    subroutine s_compute_viscous_stress_tensor(viscous_stress_tensor, q_prim_vf, dynamic_viscosity, i, j, k)
+        $:GPU_ROUTINE(parallelism='[seq]')
+
+        real(wp), dimension(1:3, 1:3), intent(inout) :: viscous_stress_tensor
+        type(scalar_field), dimension(1:sys_size), intent(in) :: q_prim_vf
+        real(wp), intent(in) :: dynamic_viscosity
+        integer, intent(in) :: i, j, k
+
+        real(wp), dimension(1:3, 1:3) :: velocity_gradient_tensor
+        real(wp), dimension(1:3) :: dx
+        real(wp) :: divergence
+        integer :: l, q ! iterators
+
+        ! zero the viscous stress, collection of velocity diriviatives, and spacial finite differences
+        viscous_stress_tensor = 0._wp
+        velocity_gradient_tensor = 0._wp
+        dx = 0._wp
+
+        ! get the change in x used in the finite difference equaiont
+        dx(1) = 0.5_wp*(x_cc(i + 1) - x_cc(i - 1))
+        dx(2) = 0.5_wp*(y_cc(j + 1) - y_cc(j - 1))
+        if (num_dims == 3) then
+            dx(3) = 0.5_wp*(z_cc(k + 1) - z_cc(k - 1))
+        end if
+
+        ! compute the velocity gradient tensor
+        do l = 1, num_dims
+            velocity_gradient_tensor(l, 1) = (q_prim_vf(momxb + l - 1)%sf(i + 1, j, k) - q_prim_vf(momxb + l - 1)%sf(i - 1, j, k))/(2._wp*dx(1))
+            velocity_gradient_tensor(l, 2) = (q_prim_vf(momxb + l - 1)%sf(i, j + 1, k) - q_prim_vf(momxb + l - 1)%sf(i, j - 1, k))/(2._wp*dx(2))
+            if (num_dims == 3) then
+                velocity_gradient_tensor(l, 3) = (q_prim_vf(momxb + l - 1)%sf(i, j, k + 1) - q_prim_vf(momxb + l - 1)%sf(i, j, k - 1))/(2._wp*dx(3))
+            end if
+        end do
+
+        ! compute divergence
+        divergence = 0._wp
+        do l = 1, num_dims
+            divergence = divergence + velocity_gradient_tensor(l, l)
+        end do
+
+        ! set up the shear stress tensor
+        do l = 1, num_dims
+            do q = 1, num_dims
+                viscous_stress_tensor(l, q) = dynamic_viscosity*(velocity_gradient_tensor(l, q) + velocity_gradient_tensor(q, l))
+            end do
+        end do
+
+        ! populate the viscous_stress_tensor
+        do l = 1, num_dims
+            viscous_stress_tensor(l, l) = viscous_stress_tensor(l, l) - 2._wp*divergence*dynamic_viscosity/3._wp
+        end do
+
+        if (num_dims == 2) then
+            do l = 1, 3
+                viscous_stress_tensor(3, l) = 0._wp
+                viscous_stress_tensor(l, 3) = 0._wp
+            end do
+        end if
+
+    end subroutine s_compute_viscous_stress_tensor
+
+    impure subroutine s_finalize_viscous_module()
+
+    end subroutine s_finalize_viscous_module
+
+end module m_viscous
diff --git a/toolchain/mfc/case_validator.py b/toolchain/mfc/case_validator.py
index 7f52c5fb46..8fc5b66ab1 100644
--- a/toolchain/mfc/case_validator.py
+++ b/toolchain/mfc/case_validator.py
@@ -183,6 +183,15 @@
         "math": r"\mathrm{Re}_1 > 0, \quad \mathrm{Re}_2 > 0",
         "explanation": "Reynolds numbers must be positive. Not supported with model_eqns = 1.",
     },
+    "check_non_newtonian": {
+        "title": "Non-Newtonian Viscosity",
+        "category": "Numerical Schemes",
+        "math": r"\mu = \frac{\tau_0}{\dot\gamma}(1 - e^{-m\dot\gamma}) + K\dot\gamma^{n-1}",
+        "explanation": (
+            "Herschel-Bulkley non-Newtonian viscosity model. Requires viscous=T, "
+            "K > 0, nn > 0, tau0 >= 0, mu_min >= 0, mu_max > mu_min, hb_m > 0."
+        ),
+    },
     # --- Feature Compatibility ---
     "check_mhd": {
         "title": "Magnetohydrodynamics (MHD)",
@@ -1023,6 +1032,39 @@ def check_viscosity(self):
             self.prohibit(Re1 is None and viscous,
                          f"viscous is set to true, but fluid_pp({i})%Re(1) is not specified")
 
+    def check_non_newtonian(self):
+        """Checks constraints on non-Newtonian fluid parameters"""
+        viscous = self.get('viscous', 'F') == 'T'
+        num_fluids = self.get('num_fluids', 1)
+
+        for i in range(1, num_fluids + 1):
+            nn_flag = self.get(f'fluid_pp({i})%non_newtonian', 'F') == 'T'
+            if not nn_flag:
+                continue
+
+            self.prohibit(not viscous,
+                         f"fluid_pp({i})%non_newtonian requires viscous = T")
+
+            K = self.get(f'fluid_pp({i})%K')
+            nn = self.get(f'fluid_pp({i})%nn')
+            tau0 = self.get(f'fluid_pp({i})%tau0')
+            mu_min = self.get(f'fluid_pp({i})%mu_min')
+            mu_max = self.get(f'fluid_pp({i})%mu_max')
+            hb_m = self.get(f'fluid_pp({i})%hb_m')
+
+            self.prohibit(K is not None and K <= 0,
+                         f"fluid_pp({i})%K (consistency index) must be > 0")
+            self.prohibit(nn is not None and nn <= 0,
+                         f"fluid_pp({i})%nn (flow behavior index) must be > 0")
+            self.prohibit(tau0 is not None and tau0 < 0,
+                         f"fluid_pp({i})%tau0 (yield stress) must be >= 0")
+            self.prohibit(mu_min is not None and mu_min < 0,
+                         f"fluid_pp({i})%mu_min must be >= 0")
+            self.prohibit(mu_max is not None and mu_min is not None and mu_max <= mu_min,
+                         f"fluid_pp({i})%mu_max must be > mu_min")
+            self.prohibit(hb_m is not None and hb_m <= 0,
+                         f"fluid_pp({i})%hb_m (Papanastasiou parameter) must be > 0")
+
     def check_mhd_simulation(self):
         """Checks MHD constraints specific to simulation"""
         mhd = self.get('mhd', 'F') == 'T'
diff --git a/toolchain/mfc/params/definitions.py b/toolchain/mfc/params/definitions.py
index 7c456b36fc..8828ea516e 100644
--- a/toolchain/mfc/params/definitions.py
+++ b/toolchain/mfc/params/definitions.py
@@ -1058,6 +1058,9 @@ def _load():  # pylint: disable=too-many-locals,too-many-statements
         _r(f"{px}G", REAL, {"elasticity"}, math=r"\f$G_k\f$")
         _r(f"{px}Re(1)", REAL, {"viscosity"}, math=r"\f$\mathrm{Re}_k\f$ (shear)")
         _r(f"{px}Re(2)", REAL, {"viscosity"}, math=r"\f$\mathrm{Re}_k\f$ (bulk)")
+        _r(f"{px}non_newtonian", LOG, {"viscosity"})
+        for a in ["tau0", "K", "nn", "mu_max", "mu_min", "mu_bulk", "hb_m"]:
+            _r(f"{px}{a}", REAL, {"viscosity"})
 
     # --- bub_pp (bubble properties) ---
     for a, sym in [("R0ref", r"\f$R_0\f$"), ("p0ref", r"\f$p_0\f$"),