diff --git a/docs/module_categories.json b/docs/module_categories.json index d1da0ed5ed..6d64a0e42f 100644 --- a/docs/module_categories.json +++ b/docs/module_categories.json @@ -19,6 +19,8 @@ "m_bubbles_EE", "m_bubbles_EL", "m_bubbles_EL_kernels", + "m_particles_EL", + "m_particles_EL_kernels", "m_qbmm", "m_hyperelastic", "m_hypoelastic", diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp index 2a33ecfc60..b26838e580 100644 --- a/src/common/m_boundary_common.fpp +++ b/src/common/m_boundary_common.fpp @@ -25,6 +25,9 @@ module m_boundary_common type(scalar_field), dimension(:, :), allocatable :: bc_buffers $:GPU_DECLARE(create='[bc_buffers]') + type(int_bounds_info), dimension(3) :: beta_bc_bounds + $:GPU_DECLARE(create='[beta_bc_bounds]') + #ifdef MFC_MPI integer, dimension(1:3, 1:2) :: MPI_BC_TYPE_TYPE integer, dimension(1:3, 1:2) :: MPI_BC_BUFFER_TYPE @@ -32,6 +35,7 @@ module m_boundary_common private; public :: s_initialize_boundary_common_module, & s_populate_variables_buffers, & + s_populate_beta_buffers, & s_create_mpi_types, & s_populate_capillary_buffers, & s_populate_F_igr_buffers, & @@ -78,8 +82,23 @@ contains @:ACC_SETUP_SFs(bc_buffers(i,j)) end do end do + end if + if (bubbles_lagrange .or. particles_lagrange) then + beta_bc_bounds(1)%beg = -mapcells - 1 + beta_bc_bounds(1)%end = m + mapcells + 1 + ! 
n > 0 always for bubbles_lagrange + beta_bc_bounds(2)%beg = -mapcells - 1 + beta_bc_bounds(2)%end = n + mapcells + 1 + if (p == 0) then + beta_bc_bounds(3)%beg = 0 + beta_bc_bounds(3)%end = 0 + else + beta_bc_bounds(3)%beg = -mapcells - 1 + beta_bc_bounds(3)%end = p + mapcells + 1 + end if end if + $:GPU_UPDATE(device='[beta_bc_bounds]') end subroutine s_initialize_boundary_common_module @@ -1183,6 +1202,332 @@ contains end subroutine s_qbmm_extrapolation + impure subroutine s_populate_beta_buffers(q_beta, bc_type, nvar) + + type(scalar_field), dimension(:), intent(inout) :: q_beta + type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type + integer, intent(in) :: nvar + + integer :: k, l + + !< x-direction + if (bc_x%beg >= 0) then + call s_mpi_reduce_beta_variables_buffers(q_beta, 1, -1, nvar) + else + $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2) + do l = beta_bc_bounds(3)%beg, beta_bc_bounds(3)%end + do k = beta_bc_bounds(2)%beg, beta_bc_bounds(2)%end + select case (bc_x%beg) + case (BC_PERIODIC) + call s_beta_periodic(q_beta, 1, -1, k, l, nvar) + case (BC_REFLECTIVE) + call s_beta_reflective(q_beta, 1, -1, k, l, nvar) + case default + end select + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + if (bc_x%end >= 0) then + call s_mpi_reduce_beta_variables_buffers(q_beta, 1, 1, nvar) + else + $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2) + do l = beta_bc_bounds(3)%beg, beta_bc_bounds(3)%end + do k = beta_bc_bounds(2)%beg, beta_bc_bounds(2)%end + select case (bc_x%end) + case (BC_PERIODIC) + call s_beta_periodic(q_beta, 1, 1, k, l, nvar) + case (BC_REFLECTIVE) + call s_beta_reflective(q_beta, 1, 1, k, l, nvar) + case default + end select + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + !< y-direction + if (bc_y%beg >= 0) then + call s_mpi_reduce_beta_variables_buffers(q_beta, 2, -1, nvar) + else + $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2) + do l = beta_bc_bounds(3)%beg, beta_bc_bounds(3)%end + do k = 
beta_bc_bounds(1)%beg, beta_bc_bounds(1)%end + select case (bc_y%beg) + case (BC_PERIODIC) + call s_beta_periodic(q_beta, 2, -1, k, l, nvar) + case (BC_REFLECTIVE) + call s_beta_reflective(q_beta, 2, -1, k, l, nvar) + case default + end select + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + if (bc_y%end >= 0) then + call s_mpi_reduce_beta_variables_buffers(q_beta, 2, 1, nvar) + else + $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2) + do l = beta_bc_bounds(3)%beg, beta_bc_bounds(3)%end + do k = beta_bc_bounds(1)%beg, beta_bc_bounds(1)%end + select case (bc_y%end) + case (BC_PERIODIC) + call s_beta_periodic(q_beta, 2, 1, k, l, nvar) + case (BC_REFLECTIVE) + call s_beta_reflective(q_beta, 2, 1, k, l, nvar) + case default + end select + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + if (num_dims == 2) return + + #:if not MFC_CASE_OPTIMIZATION or num_dims > 2 + !< z-direction + if (bc_z%beg >= 0) then + call s_mpi_reduce_beta_variables_buffers(q_beta, 3, -1, nvar) + else + $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2) + do l = beta_bc_bounds(2)%beg, beta_bc_bounds(2)%end + do k = beta_bc_bounds(1)%beg, beta_bc_bounds(1)%end + select case (bc_type(3, 1)%sf(k, l, 0)) + case (BC_PERIODIC) + call s_beta_periodic(q_beta, 3, -1, k, l, nvar) + case (BC_REFLECTIVE) + call s_beta_reflective(q_beta, 3, -1, k, l, nvar) + case default + end select + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + if (bc_z%end >= 0) then + call s_mpi_reduce_beta_variables_buffers(q_beta, 3, 1, nvar) + else + $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2) + do l = beta_bc_bounds(2)%beg, beta_bc_bounds(2)%end + do k = beta_bc_bounds(1)%beg, beta_bc_bounds(1)%end + select case (bc_type(3, 2)%sf(k, l, 0)) + case (BC_PERIODIC) + call s_beta_periodic(q_beta, 3, 1, k, l, nvar) + case (BC_REFLECTIVE) + call s_beta_reflective(q_beta, 3, 1, k, l, nvar) + case default + end select + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + #:endif + + end subroutine 
s_populate_beta_buffers + + subroutine s_beta_periodic(q_beta, bc_dir, bc_loc, k, l, nvar) + $:GPU_ROUTINE(function_name='s_beta_periodic', & + & parallelism='[seq]', cray_inline=True) + type(scalar_field), dimension(num_dims + 1), intent(inout) :: q_beta + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + integer, intent(in) :: nvar + + integer :: j, i + + if (bc_dir == 1) then !< x-direction + if (bc_loc == -1) then !bc_x%beg + do i = 1, nvar + do j = -mapCells - 1, mapCells + q_beta(beta_vars(i))%sf(j, k, l) = & + q_beta(beta_vars(i))%sf(j, k, l) + q_beta(beta_vars(i))%sf(m + j + 1, k, l) + end do + end do + else !< bc_x%end + do i = 1, nvar + do j = -mapcells, mapcells + 1 + q_beta(beta_vars(i))%sf(m + j, k, l) = q_beta(beta_vars(i))%sf(j - 1, k, l) + end do + end do + end if + elseif (bc_dir == 2) then !< y-direction + if (bc_loc == -1) then !< bc_y%beg + do i = 1, nvar + do j = -mapcells - 1, mapcells + q_beta(beta_vars(i))%sf(k, j, l) = & + q_beta(beta_vars(i))%sf(k, j, l) + q_beta(beta_vars(i))%sf(k, n + j + 1, l) + end do + end do + else !< bc_y%end + do i = 1, nvar + do j = -mapcells, mapcells + 1 + q_beta(beta_vars(i))%sf(k, n + j, l) = q_beta(beta_vars(i))%sf(k, j - 1, l) + end do + end do + end if + elseif (bc_dir == 3) then !< z-direction + if (bc_loc == -1) then !< bc_z%beg + do i = 1, nvar + do j = -mapcells - 1, mapcells + q_beta(beta_vars(i))%sf(k, l, j) = & + q_beta(beta_vars(i))%sf(k, l, j) + q_beta(beta_vars(i))%sf(k, l, p + j + 1) + end do + end do + else !< bc_z%end + do i = 1, nvar + do j = -mapcells, mapcells + 1 + q_beta(beta_vars(i))%sf(k, l, p + j) = q_beta(beta_vars(i))%sf(k, l, j - 1) + end do + end do + end if + end if + + end subroutine s_beta_periodic + + subroutine s_beta_extrapolation(q_beta, bc_dir, bc_loc, k, l, nvar) + $:GPU_ROUTINE(function_name='s_beta_extrapolation', & + & parallelism='[seq]', cray_inline=True) + type(scalar_field), dimension(num_dims + 1), intent(inout) :: q_beta + integer, intent(in) :: 
bc_dir, bc_loc + integer, intent(in) :: k, l + integer, intent(in) :: nvar + + integer :: j, i + + ! Set beta in buffer regions equal to zero + + if (bc_dir == 1) then !< x-direction + if (bc_loc == -1) then !bc_x%beg + do i = 1, nvar + do j = 1, buff_size + q_beta(beta_vars(i))%sf(-j, k, l) = 0._wp + end do + end do + else !< bc_x%end + do i = 1, nvar + do j = 1, buff_size + q_beta(beta_vars(i))%sf(m + j, k, l) = 0._wp + end do + end do + end if + elseif (bc_dir == 2) then !< y-direction + if (bc_loc == -1) then !< bc_y%beg + do i = 1, nvar + do j = 1, buff_size + q_beta(beta_vars(i))%sf(k, -j, l) = 0._wp + end do + end do + else !< bc_y%end + do i = 1, nvar + do j = 1, buff_size + q_beta(beta_vars(i))%sf(k, n + j, l) = 0._wp + end do + end do + end if + elseif (bc_dir == 3) then !< z-direction + if (bc_loc == -1) then !< bc_z%beg + do i = 1, nvar + do j = 1, buff_size + q_beta(beta_vars(i))%sf(k, l, -j) = 0._wp + end do + end do + else !< bc_z%end + do i = 1, nvar + do j = 1, buff_size + q_beta(beta_vars(i))%sf(k, l, p + j) = 0._wp + end do + end do + end if + end if + + end subroutine s_beta_extrapolation + + subroutine s_beta_reflective(q_beta, bc_dir, bc_loc, k, l, nvar) + $:GPU_ROUTINE(function_name='s_beta_reflective', & + & parallelism='[seq]', cray_inline=True) + type(scalar_field), dimension(num_dims + 1), intent(inout) :: q_beta + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + integer, intent(in) :: nvar + + integer :: j, i + + ! Reflective BC for void fraction: + ! 1) Fold ghost-cell contributions back onto their mirror interior cells + ! 
2) Set ghost cells = mirror of (now-folded) interior values + + if (bc_dir == 1) then !< x-direction + if (bc_loc == -1) then !< bc_x%beg + do i = 1, nvar + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(j - 1, k, l) = & + q_beta(beta_vars(i))%sf(j - 1, k, l) + q_beta(beta_vars(i))%sf(-j, k, l) + end do + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(-j, k, l) = q_beta(beta_vars(i))%sf(j - 1, k, l) + end do + end do + else !< bc_x%end + do i = 1, nvar + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(m - (j - 1), k, l) = & + q_beta(beta_vars(i))%sf(m - (j - 1), k, l) + q_beta(beta_vars(i))%sf(m + j, k, l) + end do + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(m + j, k, l) = q_beta(beta_vars(i))%sf(m - (j - 1), k, l) + end do + end do + end if + elseif (bc_dir == 2) then !< y-direction + if (bc_loc == -1) then !< bc_y%beg + do i = 1, nvar + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, j - 1, l) = & + q_beta(beta_vars(i))%sf(k, j - 1, l) + q_beta(beta_vars(i))%sf(k, -j, l) + end do + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, -j, l) = q_beta(beta_vars(i))%sf(k, j - 1, l) + end do + end do + else !< bc_y%end + do i = 1, nvar + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, n - (j - 1), l) = & + q_beta(beta_vars(i))%sf(k, n - (j - 1), l) + q_beta(beta_vars(i))%sf(k, n + j, l) + end do + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, n + j, l) = q_beta(beta_vars(i))%sf(k, n - (j - 1), l) + end do + end do + end if + elseif (bc_dir == 3) then !< z-direction + if (bc_loc == -1) then !< bc_z%beg + do i = 1, nvar + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, l, j - 1) = & + q_beta(beta_vars(i))%sf(k, l, j - 1) + q_beta(beta_vars(i))%sf(k, l, -j) + end do + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, l, -j) = q_beta(beta_vars(i))%sf(k, l, j - 1) + end do + end do + else !< bc_z%end + do i = 1, nvar + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, l, p - (j - 1)) = & + q_beta(beta_vars(i))%sf(k, l, p - (j - 1)) + 
q_beta(beta_vars(i))%sf(k, l, p + j) + end do + do j = 1, mapCells + 1 + q_beta(beta_vars(i))%sf(k, l, p + j) = q_beta(beta_vars(i))%sf(k, l, p - (j - 1)) + end do + end do + end if + end if + + end subroutine s_beta_reflective + !> @brief Populates ghost cell buffers for the color function and its divergence used in capillary surface tension. impure subroutine s_populate_capillary_buffers(c_divs, bc_type) @@ -1317,6 +1662,7 @@ contains $:END_GPU_PARALLEL_LOOP() end if #:endif + end subroutine s_populate_capillary_buffers !> @brief Applies periodic boundary conditions to the color function and its divergence fields. @@ -2027,6 +2373,31 @@ contains offset_x%beg = buff_size; offset_x%end = buff_size offset_y%beg = buff_size; offset_y%end = buff_size offset_z%beg = buff_size; offset_z%end = buff_size + +#ifdef MFC_MPI + ! Populate global domain boundaries with stretched grids + call s_mpi_allreduce_min(x_cb(-1), glb_bounds(1)%beg) + call s_mpi_allreduce_max(x_cb(m), glb_bounds(1)%end) + + if (n > 0) then + call s_mpi_allreduce_min(y_cb(-1), glb_bounds(2)%beg) + call s_mpi_allreduce_max(y_cb(n), glb_bounds(2)%end) + if (p > 0) then + call s_mpi_allreduce_min(z_cb(-1), glb_bounds(3)%beg) + call s_mpi_allreduce_max(z_cb(p), glb_bounds(3)%end) + end if + end if +#else + glb_bounds(1)%beg = x_cb(-1); glb_bounds(1)%end = x_cb(m) + if (n > 0) then + glb_bounds(2)%beg = y_cb(-1); glb_bounds(2)%end = y_cb(n) + if (p > 0) then + glb_bounds(3)%beg = z_cb(-1); glb_bounds(3)%end = z_cb(p) + end if + end if +#endif + $:GPU_UPDATE(device='[glb_bounds]') + #endif #ifndef MFC_PRE_PROCESS diff --git a/src/common/m_constants.fpp b/src/common/m_constants.fpp index 168d6df96f..6d65d1dbe3 100644 --- a/src/common/m_constants.fpp +++ b/src/common/m_constants.fpp @@ -57,9 +57,9 @@ module m_constants real(wp), parameter :: initial_distance_buffer = 1.e12_wp !< Initialized levelset distance for the shortest path pair algorithm ! 
Lagrange bubbles constants - integer, parameter :: mapCells = 3 !< Number of cells around the bubble where the smoothening function will have effect + integer, parameter :: mapCells = 3 !< Number of cells around the bubble/particle where the smoothening function will have effect real(wp), parameter :: R_uni = 8314._wp !< Universal gas constant - J/kmol/K - integer, parameter :: lag_io_vars = 21 ! Number of variables per particle for MPI_IO + integer, parameter :: lag_io_vars = 21 !< Number of variables per particle for MPI_IO ! Strang Splitting constants real(wp), parameter :: dflt_adap_dt_tol = 1.e-4_wp !< Default tolerance for adaptive step size diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp index 8559be36d3..2298cad219 100644 --- a/src/common/m_derived_types.fpp +++ b/src/common/m_derived_types.fpp @@ -393,6 +393,12 @@ module m_derived_types real(wp) :: R_g !< gas constant of gas (bubble) end type subgrid_bubble_physical_parameters + !> Derived type annexing the physical parameters required for sub-grid particle models + type subgrid_particle_physical_parameters + real(wp) :: rho0ref_particle !< reference density + real(wp) :: cp_particle ! !! bubbles_euler + polytropic !! 
bubbles_euler + non-polytropic @@ -160,6 +168,7 @@ contains R_v = bub_pp%R_v; R_g = bub_pp%R_g Tw = bub_pp%T0ref end if + if (bubbles_lagrange) then cp_v = bub_pp%cp_v; cp_g = bub_pp%cp_g k_vl = bub_pp%k_v; k_gl = bub_pp%k_g diff --git a/src/common/m_helper_basic.fpp b/src/common/m_helper_basic.fpp index 0b430cb4d5..d28acdbf6e 100644 --- a/src/common/m_helper_basic.fpp +++ b/src/common/m_helper_basic.fpp @@ -114,13 +114,14 @@ contains subroutine s_configure_coordinate_bounds(recon_type, weno_polyn, muscl_polyn, & igr_order, buff_size, idwint, idwbuff, & - viscous, bubbles_lagrange, m, n, p, num_dims, igr, ib) + viscous, bubbles_lagrange, particles_lagrange, & + m, n, p, num_dims, igr, ib, fd_number) integer, intent(in) :: recon_type, weno_polyn, muscl_polyn - integer, intent(in) :: m, n, p, num_dims, igr_order + integer, intent(in) :: m, n, p, num_dims, igr_order, fd_number integer, intent(inout) :: buff_size type(int_bounds_info), dimension(3), intent(inout) :: idwint, idwbuff - logical, intent(in) :: viscous, bubbles_lagrange + logical, intent(in) :: viscous, bubbles_lagrange, particles_lagrange logical, intent(in) :: igr logical, intent(in) :: ib @@ -142,7 +143,12 @@ contains ! Correction for smearing function in the lagrangian subgrid bubble model if (bubbles_lagrange) then - buff_size = max(buff_size, 6) + buff_size = max(buff_size + fd_number, mapCells + 1 + fd_number) + end if + + ! Correction for smearing function in the lagrangian subgrid particle model + if (particles_lagrange) then + buff_size = max(buff_size + fd_number, mapCells + 1 + fd_number) end if if (ib) then diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index 8768298306..1aa249b344 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -38,6 +38,11 @@ module m_mpi_common !! average primitive variables, for a single computational domain boundary !! at the time, from the relevant neighboring processor. 
+ type(int_bounds_info) :: comm_coords(3) + integer :: comm_size(3) + $:GPU_DECLARE(create='[comm_coords, comm_size]') + !! Variables for EL bubbles communication + #ifndef __NVCOMPILER_GPU_UNIFIED_MEM $:GPU_DECLARE(create='[buff_send, buff_recv]') #endif @@ -52,6 +57,8 @@ contains !! other procedures that are necessary to setup the module. impure subroutine s_initialize_mpi_common_module + integer :: beta_v_size, beta_comm_size_1, beta_comm_size_2, beta_comm_size_3, beta_halo_size + #ifdef MFC_MPI ! Allocating buff_send/recv and. Please note that for the sake of ! simplicity, both variables are provided sufficient storage to hold @@ -78,6 +85,28 @@ contains halo_size = -1 + buff_size*(v_size) end if + if (bubbles_lagrange .or. particles_lagrange) then + beta_v_size = size(beta_vars) + beta_comm_size_1 = m + 2*mapCells + 3 + beta_comm_size_2 = merge(n + 2*mapCells + 3, 1, n > 0) + beta_comm_size_3 = merge(p + 2*mapCells + 3, 1, p > 0) + if (n > 0) then + if (p > 0) then + beta_halo_size = 2*(mapCells + 1)*beta_v_size*max( & + beta_comm_size_2*beta_comm_size_3, & + beta_comm_size_1*beta_comm_size_3, & + beta_comm_size_1*beta_comm_size_2) - 1 + else + beta_halo_size = 2*(mapCells + 1)*beta_v_size*max( & + beta_comm_size_2, & + beta_comm_size_1) - 1 + end if + else + beta_halo_size = 2*(mapCells + 1)*beta_v_size - 1 + end if + halo_size = max(halo_size, beta_halo_size) + end if + $:GPU_UPDATE(device='[halo_size, v_size]') #ifndef __NVCOMPILER_GPU_UNIFIED_MEM @@ -343,17 +372,21 @@ contains impure subroutine s_mpi_reduce_stability_criteria_extrema(icfl_max_loc, & vcfl_max_loc, & Rc_min_loc, & + bubs_loc, & icfl_max_glb, & vcfl_max_glb, & - Rc_min_glb) + Rc_min_glb, & + bubs_glb) real(wp), intent(in) :: icfl_max_loc real(wp), intent(in) :: vcfl_max_loc real(wp), intent(in) :: Rc_min_loc + integer, intent(in) :: bubs_loc real(wp), intent(out) :: icfl_max_glb real(wp), intent(out) :: vcfl_max_glb real(wp), intent(out) :: Rc_min_glb + integer, intent(out) :: bubs_glb #ifdef 
MFC_SIMULATION #ifdef MFC_MPI @@ -374,6 +407,11 @@ contains MPI_COMM_WORLD, ierr) end if + if (bubbles_lagrange) then + call MPI_REDUCE(bubs_loc, bubs_glb, 1, & + MPI_INTEGER, MPI_SUM, 0, & + MPI_COMM_WORLD, ierr) + end if #else icfl_max_glb = icfl_max_loc @@ -383,11 +421,34 @@ contains Rc_min_glb = Rc_min_loc end if + if (bubbles_lagrange) bubs_glb = bubs_loc #endif #endif end subroutine s_mpi_reduce_stability_criteria_extrema + !> The following subroutine takes the inputted variable and + !! determines its sum on the entire computational domain. + !! @param var_loc holds the local value to be reduced among + !! all the processors in communicator. On output, the variable holds + !! the sum, reduced amongst all of the local values. + subroutine s_mpi_reduce_int_sum(var_loc, sum) + + integer, intent(in) :: var_loc + integer, intent(out) :: sum + +#ifdef MFC_MPI + integer :: ierr !< Generic flag used to identify and report MPI errors + + ! Performing reduction procedure and eventually storing its result + ! into the variable that was initially inputted into the subroutine + call MPI_REDUCE(var_loc, sum, 1, MPI_INTEGER, & + MPI_SUM, 0, MPI_COMM_WORLD, ierr) + +#endif + + end subroutine s_mpi_reduce_int_sum + !> The following subroutine takes the input local variable !! from all processors and reduces to the sum of all !! values. The reduced variable is recorded back onto the @@ -1106,6 +1167,273 @@ contains end subroutine s_mpi_sendrecv_variables_buffers + !> The goal of this procedure is to populate the buffers of + !! the cell-average conservative variables by communicating + !! with the neighboring processors. + !! @param q_cons_vf Cell-average conservative variables + !! @param mpi_dir MPI communication coordinate direction + !! 
@param pbc_loc Processor boundary condition (PBC) location + subroutine s_mpi_reduce_beta_variables_buffers(q_comm, & + mpi_dir, & + pbc_loc, & + nVar) + + type(scalar_field), dimension(1:), intent(inout) :: q_comm + integer, intent(in) :: mpi_dir, pbc_loc, nVar + + integer :: i, j, k, l, r, q !< Generic loop iterators + integer :: lb_size + + integer :: buffer_counts(1:3), buffer_count + + type(int_bounds_info) :: boundary_conditions(1:3) + integer :: beg_end(1:2), grid_dims(1:3) + integer :: dst_proc, src_proc, recv_tag, send_tag + + logical :: beg_end_geq_0, qbmm_comm, replace_buff + + integer :: pack_offset, unpack_offset + +#ifdef MFC_MPI + integer :: ierr !< Generic flag used to identify and report MPI errors + + call nvtxStartRange("BETA-COMM-PACKBUF") + + ! Set bounds for each dimension + ! Always include the full buffer range for each existing dimension. + ! The Gaussian smearing kernel writes to buffer cells even at physical + ! boundaries, and these contributions must be communicated to neighbors + ! in other directions via ADD operations. + comm_coords(1)%beg = -mapcells - 1 + comm_coords(1)%end = m + mapcells + 1 + comm_coords(2)%beg = merge(-mapcells - 1, 0, n > 0) + comm_coords(2)%end = merge(n + mapcells + 1, n, n > 0) + comm_coords(3)%beg = merge(-mapcells - 1, 0, p > 0) + comm_coords(3)%end = merge(p + mapcells + 1, p, p > 0) + + ! Compute sizes + comm_size(1) = comm_coords(1)%end - comm_coords(1)%beg + 1 + comm_size(2) = comm_coords(2)%end - comm_coords(2)%beg + 1 + comm_size(3) = comm_coords(3)%end - comm_coords(3)%beg + 1 + + ! Buffer counts using the conditional sizes + v_size = nVar + lb_size = 2*(mapcells + 1) ! Size of the buffer region for beta variables (-mapcells - 1, mapcells) + buffer_counts = (/ & + lb_size*v_size*comm_size(2)*comm_size(3), & ! mpi_dir=1 + lb_size*v_size*comm_size(1)*comm_size(3), & ! mpi_dir=2 + lb_size*v_size*comm_size(1)*comm_size(2) & ! 
mpi_dir=3 + /) + + $:GPU_UPDATE(device='[v_size, comm_coords, comm_size]') + + buffer_count = buffer_counts(mpi_dir) + boundary_conditions = (/bc_x, bc_y, bc_z/) + beg_end = (/boundary_conditions(mpi_dir)%beg, boundary_conditions(mpi_dir)%end/) + beg_end_geq_0 = beg_end(max(pbc_loc, 0) - pbc_loc + 1) >= 0 + + ! Implements: + ! pbc_loc bc_x >= 0 -> [send/recv]_tag [dst/src]_proc + ! -1 (=0) 0 -> [1,0] [0,0] | 0 0 [1,0] [beg,beg] + ! -1 (=0) 1 -> [0,0] [1,0] | 0 1 [0,0] [end,beg] + ! +1 (=1) 0 -> [0,1] [1,1] | 1 0 [0,1] [end,end] + ! +1 (=1) 1 -> [1,1] [0,1] | 1 1 [1,1] [beg,end] + + send_tag = f_logical_to_int(.not. f_xor(beg_end_geq_0, pbc_loc == 1)) + recv_tag = f_logical_to_int(pbc_loc == 1) + + dst_proc = beg_end(1 + f_logical_to_int(f_xor(pbc_loc == 1, beg_end_geq_0))) + src_proc = beg_end(1 + f_logical_to_int(pbc_loc == 1)) + + grid_dims = (/m, n, p/) + + pack_offset = 0 + if (f_xor(pbc_loc == 1, beg_end_geq_0)) then + pack_offset = grid_dims(mpi_dir) + 1 + end if + + unpack_offset = 0 + if (pbc_loc == 1) then + unpack_offset = grid_dims(mpi_dir) + 1 + end if + + replace_buff = .false. + if (pbc_loc == 1 .and. beg_end_geq_0) replace_buff = .true. + + ! 
Pack Buffer to Send + #:for mpi_dir in [1, 2, 3] + if (mpi_dir == ${mpi_dir}$) then + #:if mpi_dir == 1 + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do l = comm_coords(3)%beg, comm_coords(3)%end + do k = comm_coords(2)%beg, comm_coords(2)%end + do j = -mapcells - 1, mapcells + do i = 1, v_size + r = (i - 1) + v_size*( & + (j + mapcells + 1) + lb_size*( & + (k - comm_coords(2)%beg) + comm_size(2)* & + (l - comm_coords(3)%beg))) + buff_send(r) = real(q_comm(beta_vars(i))%sf(j + pack_offset, k, l), kind=wp) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:elif mpi_dir == 2 + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do i = 1, v_size + do l = comm_coords(3)%beg, comm_coords(3)%end + do k = -mapcells - 1, mapcells + do j = comm_coords(1)%beg, comm_coords(1)%end + r = (i - 1) + v_size*( & + (j - comm_coords(1)%beg) + comm_size(1)*( & + (k + mapcells + 1) + lb_size* & + (l - comm_coords(3)%beg))) + buff_send(r) = real(q_comm(beta_vars(i))%sf(j, k + pack_offset, l), kind=wp) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:else + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do i = 1, v_size + do l = -mapcells - 1, mapcells + do k = comm_coords(2)%beg, comm_coords(2)%end + do j = comm_coords(1)%beg, comm_coords(1)%end + r = (i - 1) + v_size*( & + (j - comm_coords(1)%beg) + comm_size(1)*( & + (k - comm_coords(2)%beg) + comm_size(2)* & + (l + mapcells + 1))) + buff_send(r) = real(q_comm(beta_vars(i))%sf(j, k, l + pack_offset), kind=wp) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:endif + end if + #:endfor + call nvtxEndRange ! Packbuf + + ! Send/Recv +#ifdef MFC_SIMULATION + #:for rdma_mpi in [False, True] + if (rdma_mpi .eqv. ${'.true.' 
if rdma_mpi else '.false.'}$) then + #:if rdma_mpi + #:call GPU_HOST_DATA(use_device_addr='[buff_send, buff_recv]') + call nvtxStartRange("BETA-COMM-SENDRECV-RDMA") + + call MPI_SENDRECV( & + buff_send, buffer_count, mpi_p, dst_proc, send_tag, & + buff_recv, buffer_count, mpi_p, src_proc, recv_tag, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + + call nvtxEndRange ! BETA-MPI-SENDRECV-(NO)-RDMA + + #:endcall GPU_HOST_DATA + $:GPU_WAIT() + #:else + call nvtxStartRange("BETA-COMM-DEV2HOST") + $:GPU_UPDATE(host='[buff_send]') + call nvtxEndRange + call nvtxStartRange("BETA-COMM-SENDRECV-NO-RMDA") + + call MPI_SENDRECV( & + buff_send, buffer_count, mpi_p, dst_proc, send_tag, & + buff_recv, buffer_count, mpi_p, src_proc, recv_tag, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + + call nvtxEndRange ! BETA-MPI-SENDRECV-(NO)-RDMA + + call nvtxStartRange("BETA-COMM-HOST2DEV") + $:GPU_UPDATE(device='[buff_recv]') + call nvtxEndRange + #:endif + end if + #:endfor +#else + call MPI_SENDRECV( & + buff_send, buffer_count, mpi_p, dst_proc, send_tag, & + buff_recv, buffer_count, mpi_p, src_proc, recv_tag, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) +#endif + + ! 
Unpack Received Buffer + call nvtxStartRange("BETA-COMM-UNPACKBUF") + #:for mpi_dir in [1, 2, 3] + if (mpi_dir == ${mpi_dir}$) then + #:if mpi_dir == 1 + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]',copyin='[replace_buff]') + do l = comm_coords(3)%beg, comm_coords(3)%end + do k = comm_coords(2)%beg, comm_coords(2)%end + do j = -mapcells - 1, mapcells + do i = 1, v_size + r = (i - 1) + v_size*( & + (j + mapcells + 1) + lb_size*( & + (k - comm_coords(2)%beg) + comm_size(2)* & + (l - comm_coords(3)%beg))) + if (replace_buff) then + q_comm(beta_vars(i))%sf(j + unpack_offset, k, l) = real(buff_recv(r), kind=stp) + else + q_comm(beta_vars(i))%sf(j + unpack_offset, k, l) = & + q_comm(beta_vars(i))%sf(j + unpack_offset, k, l) + real(buff_recv(r), kind=stp) + end if + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:elif mpi_dir == 2 + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]',copyin='[replace_buff]') + do i = 1, v_size + do l = comm_coords(3)%beg, comm_coords(3)%end + do k = -mapcells - 1, mapcells + do j = comm_coords(1)%beg, comm_coords(1)%end + r = (i - 1) + v_size*( & + (j - comm_coords(1)%beg) + comm_size(1)*( & + (k + mapcells + 1) + lb_size* & + (l - comm_coords(3)%beg))) + if (replace_buff) then + q_comm(beta_vars(i))%sf(j, k + unpack_offset, l) = real(buff_recv(r), kind=stp) + else + q_comm(beta_vars(i))%sf(j, k + unpack_offset, l) = & + q_comm(beta_vars(i))%sf(j, k + unpack_offset, l) + real(buff_recv(r), kind=stp) + end if + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:else + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]',copyin='[replace_buff]') + do i = 1, v_size + do l = -mapcells - 1, mapcells + do k = comm_coords(2)%beg, comm_coords(2)%end + do j = comm_coords(1)%beg, comm_coords(1)%end + r = (i - 1) + v_size*( & + (j - comm_coords(1)%beg) + comm_size(1)*( & + (k - comm_coords(2)%beg) + comm_size(2)* & + (l + mapcells + 1))) + if (replace_buff) then + q_comm(beta_vars(i))%sf(j, k, l + unpack_offset) = 
real(buff_recv(r), kind=stp) + else + q_comm(beta_vars(i))%sf(j, k, l + unpack_offset) = & + q_comm(beta_vars(i))%sf(j, k, l + unpack_offset) + real(buff_recv(r), kind=stp) + end if + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:endif + end if + #:endfor + call nvtxEndRange +#endif + + end subroutine s_mpi_reduce_beta_variables_buffers + !> The purpose of this procedure is to optimally decompose !! the computational domain among the available processors. !! This is performed by attempting to award each processor, @@ -1135,9 +1463,17 @@ contains integer :: recon_order !< !! WENO or MUSCL reconstruction order - integer :: i, j !< Generic loop iterators + integer :: i, j, k !< Generic loop iterators integer :: ierr !< Generic flag used to identify and report MPI errors + ! temp array to store neighbor rank coordinates + integer, dimension(1:num_dims) :: neighbor_coords + + ! Zeroing out communication needs for moving EL bubbles/particles + nidx(1)%beg = 0; nidx(1)%end = 0 + nidx(2)%beg = 0; nidx(2)%end = 0 + nidx(3)%beg = 0; nidx(3)%end = 0 + if (recon_type == WENO_TYPE) then recon_order = weno_order else @@ -1356,6 +1692,7 @@ contains call MPI_CART_RANK(MPI_COMM_CART, proc_coords, & bc_z%beg, ierr) proc_coords(3) = proc_coords(3) + 1 + nidx(3)%beg = -1 end if ! Boundary condition at the end @@ -1364,6 +1701,7 @@ contains call MPI_CART_RANK(MPI_COMM_CART, proc_coords, & bc_z%end, ierr) proc_coords(3) = proc_coords(3) - 1 + nidx(3)%end = 1 end if #ifdef MFC_POST_PROCESS @@ -1495,6 +1833,7 @@ contains call MPI_CART_RANK(MPI_COMM_CART, proc_coords, & bc_y%beg, ierr) proc_coords(2) = proc_coords(2) + 1 + nidx(2)%beg = -1 end if ! 
Boundary condition at the end @@ -1503,6 +1842,7 @@ contains call MPI_CART_RANK(MPI_COMM_CART, proc_coords, & bc_y%end, ierr) proc_coords(2) = proc_coords(2) - 1 + nidx(2)%end = 1 end if #ifdef MFC_POST_PROCESS @@ -1588,6 +1928,7 @@ contains proc_coords(1) = proc_coords(1) - 1 call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%beg, ierr) proc_coords(1) = proc_coords(1) + 1 + nidx(1)%beg = -1 end if ! Boundary condition at the end @@ -1595,6 +1936,7 @@ contains proc_coords(1) = proc_coords(1) + 1 call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%end, ierr) proc_coords(1) = proc_coords(1) - 1 + nidx(1)%end = 1 end if #ifdef MFC_POST_PROCESS @@ -1640,6 +1982,23 @@ contains end if #endif end if + + @:ALLOCATE(neighbor_ranks(nidx(1)%beg:nidx(1)%end, & + nidx(2)%beg:nidx(2)%end, & + nidx(3)%beg:nidx(3)%end)) + do k = nidx(3)%beg, nidx(3)%end + do j = nidx(2)%beg, nidx(2)%end + do i = nidx(1)%beg, nidx(1)%end + if (abs(i) + abs(j) + abs(k) > 0) then + neighbor_coords(1) = proc_coords(1) + i + if (num_dims > 1) neighbor_coords(2) = proc_coords(2) + j + if (num_dims > 2) neighbor_coords(3) = proc_coords(3) + k + call MPI_CART_RANK(MPI_COMM_CART, neighbor_coords, & + neighbor_ranks(i, j, k), ierr) + end if + end do + end do + end do #endif end subroutine s_mpi_decompose_computational_domain diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp index 219667a158..a3f4e2cea9 100644 --- a/src/post_process/m_global_parameters.fpp +++ b/src/post_process/m_global_parameters.fpp @@ -76,6 +76,8 @@ module m_global_parameters !! region, this region is used to store information outside the computational !! domain based on the boundary conditions. 
+ integer, allocatable :: beta_vars(:) !< Indices of variables to communicate for bubble/particle coupling + integer :: t_step_start !< First time-step directory integer :: t_step_stop !< Last time-step directory integer :: t_step_save !< Interval between consecutive time-step directory @@ -177,6 +179,11 @@ module m_global_parameters integer, allocatable, dimension(:) :: proc_coords !< !! Processor coordinates in MPI_CART_COMM + type(int_bounds_info), dimension(3) :: nidx + + integer, allocatable, dimension(:, :, :) :: neighbor_ranks + !! Neighbor processor ranks + integer, allocatable, dimension(:) :: start_idx !< !! Starting cell-center index of local processor in global grid @@ -206,6 +213,9 @@ module m_global_parameters ! Subgrid Bubble Parameters type(subgrid_bubble_physical_parameters) :: bub_pp + ! Subgrid Particle Parameters + type(subgrid_particle_physical_parameters) :: particle_pp + real(wp), allocatable, dimension(:) :: adv !< Advection variables ! Formatted Database File(s) Structure Parameters @@ -328,6 +338,11 @@ module m_global_parameters integer :: nmom !> @} + !> @name Particle modeling variables and parameters + !> @{ + real(wp) :: cp_particle, rho0ref_particle + !> @} + !> @name surface tension coefficient !> @{ @@ -350,6 +365,7 @@ module m_global_parameters !> @name Lagrangian bubbles !> @{ logical :: bubbles_lagrange + logical :: particles_lagrange !> @} real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D) @@ -458,6 +474,10 @@ contains bub_pp%R_v = dflt_real; R_v = dflt_real bub_pp%R_g = dflt_real; R_g = dflt_real + ! Subgrid particle parameters + particle_pp%rho0ref_particle = dflt_real + particle_pp%cp_particle = dflt_real + ! Formatted database file(s) structure parameters format = dflt_int @@ -536,6 +556,7 @@ contains ! Lagrangian bubbles modeling bubbles_lagrange = .false. + particles_lagrange = .false. ! 
IBM num_ibs = dflt_int @@ -673,10 +694,15 @@ contains end if end if - if (bubbles_lagrange) then - beta_idx = sys_size + 1 - sys_size = beta_idx - end if + ! if (bubbles_lagrange) then + ! beta_idx = sys_size + 1 + ! sys_size = beta_idx + ! end if + + ! if (particles_lagrange) then + ! beta_idx = sys_size + 1 + ! sys_size = beta_idx + ! end if if (mhd) then B_idx%beg = sys_size + 1 @@ -806,6 +832,16 @@ contains sys_size = c_idx end if + if (bubbles_lagrange) then + beta_idx = sys_size + 1 + sys_size = beta_idx + end if + + if (particles_lagrange) then + beta_idx = sys_size + 1 + sys_size = beta_idx + end if + if (cont_damage) then damage_idx = sys_size + 1 sys_size = damage_idx @@ -822,6 +858,14 @@ contains end if + if (bubbles_lagrange) then + allocate (beta_vars(1:3)) + beta_vars(1:3) = [1, 2, 5] + elseif (particles_lagrange) then + allocate (beta_vars(1:8)) + beta_vars(1:8) = [1, 2, 3, 4, 5, 6, 7, 8] + end if + if (chemistry) then species_idx%beg = sys_size + 1 species_idx%end = sys_size + num_species @@ -1049,6 +1093,8 @@ contains if (ib) MPI_IO_IB_DATA%var%sf => null() #endif + if (allocated(neighbor_ranks)) deallocate (neighbor_ranks) + end subroutine s_finalize_global_parameters_module end module m_global_parameters diff --git a/src/post_process/m_mpi_proxy.fpp b/src/post_process/m_mpi_proxy.fpp index 29e65942c6..5e7308d325 100644 --- a/src/post_process/m_mpi_proxy.fpp +++ b/src/post_process/m_mpi_proxy.fpp @@ -101,11 +101,22 @@ contains & 'adv_n', 'ib', 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', & & 'surface_tension', 'hyperelasticity', 'bubbles_lagrange', & & 'output_partial_domain', 'relativity', 'cont_damage', 'bc_io', & - & 'down_sample','fft_wrt', 'hyper_cleaning' ] + & 'down_sample','fft_wrt', 'hyper_cleaning', & + & 'particles_lagrange' ] call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #:endfor if (bubbles_lagrange) then + #:for VAR in ['lag_header', 'lag_txt_wrt', 'lag_db_wrt', 'lag_id_wrt', & + & 'lag_pos_wrt', 'lag_pos_prev_wrt', 
'lag_vel_wrt', 'lag_rad_wrt', & + & 'lag_rvel_wrt', 'lag_r0_wrt', 'lag_rmax_wrt', 'lag_rmin_wrt', & + & 'lag_dphidt_wrt', 'lag_pres_wrt', 'lag_mv_wrt', 'lag_mg_wrt', & + & 'lag_betaT_wrt', 'lag_betaC_wrt'] + call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + #:endfor + end if + + if (particles_lagrange) then #:for VAR in ['lag_header', 'lag_txt_wrt', 'lag_db_wrt', 'lag_id_wrt', & & 'lag_pos_wrt', 'lag_pos_prev_wrt', 'lag_vel_wrt', 'lag_rad_wrt', & & 'lag_rvel_wrt', 'lag_r0_wrt', 'lag_rmax_wrt', 'lag_rmin_wrt', & @@ -141,6 +152,13 @@ contains #:endfor end if + ! Subgrid particle parameters + if (particles_lagrange) then + #:for VAR in ['rho0ref_particle','cp_particle'] + call MPI_BCAST(particle_pp%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + #:endfor + end if + #:for VAR in [ 'pref', 'rhoref', 'R0ref', 'poly_sigma', 'Web', 'Ca', & & 'Re_inv', 'Bx0', 'sigma', 't_save', 't_stop', & & 'x_output%beg', 'x_output%end', 'y_output%beg', & diff --git a/src/post_process/m_start_up.fpp b/src/post_process/m_start_up.fpp index 74a15d04db..48f4d1da4b 100644 --- a/src/post_process/m_start_up.fpp +++ b/src/post_process/m_start_up.fpp @@ -116,7 +116,7 @@ contains lag_rad_wrt, lag_rvel_wrt, lag_r0_wrt, lag_rmax_wrt, & lag_rmin_wrt, lag_dphidt_wrt, lag_pres_wrt, lag_mv_wrt, & lag_mg_wrt, lag_betaT_wrt, lag_betaC_wrt, & - alpha_rho_e_wrt + alpha_rho_e_wrt, particles_lagrange, particle_pp ! Inquiring the status of the post_process.inp file file_loc = 'post_process.inp' @@ -868,7 +868,7 @@ contains end if ! Adding the lagrangian subgrid variables to the formatted database file - if (bubbles_lagrange) then + if (bubbles_lagrange .or. particles_lagrange) then !! Void fraction field q_sf(:, :, :) = 1._wp - q_cons_vf(beta_idx)%sf( & -offset_x%beg:m + offset_x%end, & @@ -984,6 +984,9 @@ contains if (bubbles_euler .or. 
bubbles_lagrange) then call s_initialize_bubbles_model() end if + if (particles_lagrange) then + call s_initialize_particles_model() + end if if (num_procs > 1) then call s_initialize_mpi_proxy_module() call s_initialize_mpi_common_module() diff --git a/src/pre_process/m_global_parameters.fpp b/src/pre_process/m_global_parameters.fpp index 11fa95e124..99fe8b1e9a 100644 --- a/src/pre_process/m_global_parameters.fpp +++ b/src/pre_process/m_global_parameters.fpp @@ -131,6 +131,21 @@ module m_global_parameters ! Stands for "InDices With BUFFer". type(int_bounds_info) :: idwbuff(1:3) + integer :: fd_order !< + !! The order of the finite-difference (fd) approximations of the first-order + !! derivatives that need to be evaluated when the CoM or flow probe data + !! files are to be written at each time step + + integer :: fd_number !< + !! The finite-difference number is given by MAX(1, fd_order/2). Essentially, + !! it is a measure of the half-size of the finite-difference stencil for the + !! selected order of accuracy. + + !> @name lagrangian subgrid bubble parameters + !> @{! + type(bubbles_lagrange_parameters) :: lag_params !< Lagrange bubbles' parameters + !> @} + type(int_bounds_info) :: bc_x, bc_y, bc_z !< !! Boundary conditions in the x-, y- and z-coordinate directions @@ -162,6 +177,7 @@ module m_global_parameters logical :: viscous logical :: bubbles_lagrange + logical :: particles_lagrange ! Perturb density of surrounding air so as to break symmetry of grid logical :: perturb_flow @@ -177,6 +193,11 @@ module m_global_parameters integer, allocatable, dimension(:) :: proc_coords !< !! Processor coordinates in MPI_CART_COMM + type(int_bounds_info), dimension(3) :: nidx + + integer, allocatable, dimension(:, :, :) :: neighbor_ranks + !! Neighbor ranks + integer, allocatable, dimension(:) :: start_idx !< !! Starting cell-center index of local processor in global grid @@ -214,6 +235,8 @@ module m_global_parameters ! 
Subgrid Bubble Parameters type(subgrid_bubble_physical_parameters) :: bub_pp + type(subgrid_particle_physical_parameters) :: particle_pp + real(wp) :: rhoref, pref !< Reference parameters for Tait EOS type(chemistry_parameters) :: chem_params @@ -263,6 +286,9 @@ module m_global_parameters real(wp) :: R0ref, p0ref, rho0ref, T0ref, ss, pv, vd, mu_l, mu_v, mu_g, & gam_v, gam_g, M_v, M_g, cp_v, cp_g, R_v, R_g + !Solid particle physical parameters + real(wp) :: cp_particle, rho0ref_particle + !> @} !> @name Surface Tension Modeling @@ -295,9 +321,15 @@ module m_global_parameters !! conditions data to march the solution in the physical computational domain !! to the next time-step. + integer, allocatable :: beta_vars(:) !< Indices of variables to communicate for bubble/particle coupling + logical :: fft_wrt logical :: dummy !< AMDFlang workaround: keep a dummy logical to avoid a compiler case-optimization bug when a parameter+GPU-kernel conditional is false + ! Variables for hardcoded initial conditions that are read from input files + character(LEN=2*path_len) :: interface_file + real(wp) :: normFac, normMag, g0_ic, p0_ic + contains !> Assigns default values to user inputs prior to reading @@ -406,6 +438,8 @@ contains elliptic_smoothing_iters = dflt_int elliptic_smoothing = .false. + particles_lagrange = .false. + fft_wrt = .false. dummy = .false. @@ -422,6 +456,27 @@ contains ! Initial condition parameters num_patches = dflt_int + fd_order = dflt_int + lag_params%cluster_type = dflt_int + lag_params%pressure_corrector = .false. + lag_params%smooth_type = dflt_int + lag_params%heatTransfer_model = .false. + lag_params%massTransfer_model = .false. + lag_params%write_bubbles = .false. + lag_params%write_bubbles_stats = .false. 
+ lag_params%nBubs_glb = dflt_int + lag_params%vel_model = dflt_int + lag_params%drag_model = dflt_int + lag_params%epsilonb = 1._wp + lag_params%charwidth = dflt_real + lag_params%nParticles_glb = dflt_int + lag_params%qs_drag_model = dflt_int + lag_params%stokes_drag = dflt_int + lag_params%added_mass_model = dflt_int + lag_params%interpolation_order = dflt_int + lag_params%charNz = dflt_int + lag_params%valmaxvoid = dflt_real + do i = 1, num_patches_max patch_icpp(i)%geometry = dflt_int patch_icpp(i)%model_scale(:) = 1._wp @@ -618,6 +673,10 @@ contains bub_pp%R_v = dflt_real; R_v = dflt_real bub_pp%R_g = dflt_real; R_g = dflt_real + ! Subgrid particle parameters + particle_pp%rho0ref_particle = dflt_real + particle_pp%cp_particle = dflt_real + end subroutine s_assign_default_values_to_user_inputs !> Computation of parameters, allocation procedures, and/or @@ -892,6 +951,14 @@ contains end if + if (bubbles_lagrange) then + allocate (beta_vars(1:3)) + beta_vars(1:3) = [1, 2, 5] + elseif (particles_lagrange) then + allocate (beta_vars(1:8)) + beta_vars(1:8) = [1, 2, 3, 4, 5, 6, 7, 8] + end if + if (chemistry) then species_idx%beg = sys_size + 1 species_idx%end = sys_size + num_species @@ -915,11 +982,13 @@ contains chemxb = species_idx%beg chemxe = species_idx%end + if (bubbles_lagrange .or. 
particles_lagrange) fd_number = max(1, fd_order/2) + call s_configure_coordinate_bounds(recon_type, weno_polyn, muscl_polyn, & igr_order, buff_size, & idwint, idwbuff, viscous, & - bubbles_lagrange, m, n, p, & - num_dims, igr, ib) + bubbles_lagrange, particles_lagrange, & + m, n, p, num_dims, igr, ib, fd_number) #ifdef MFC_MPI @@ -1038,6 +1107,8 @@ contains #endif + if (allocated(neighbor_ranks)) deallocate (neighbor_ranks) + end subroutine s_finalize_global_parameters_module end module m_global_parameters diff --git a/src/pre_process/m_mpi_proxy.fpp b/src/pre_process/m_mpi_proxy.fpp index cbfac0571b..06592e592f 100644 --- a/src/pre_process/m_mpi_proxy.fpp +++ b/src/pre_process/m_mpi_proxy.fpp @@ -43,7 +43,7 @@ contains & 'perturb_sph_fluid', 'num_patches', 'thermal', 'nb', 'dist_type',& & 'relax_model', 'num_ibs', 'n_start', 'elliptic_smoothing_iters', & & 'num_bc_patches', 'mixlayer_perturb_nk', 'recon_type', & - & 'muscl_order', 'igr_order' ] + & 'muscl_order', 'igr_order', 'fd_order'] call MPI_BCAST(${VAR}$, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) #:endfor @@ -55,7 +55,8 @@ contains & 'cfl_const_dt', 'cfl_dt', 'surface_tension', & & 'hyperelasticity', 'pre_stress', 'elliptic_smoothing', 'viscous',& & 'bubbles_lagrange', 'bc_io', 'mhd', 'relativity', 'cont_damage', & - & 'igr', 'down_sample', 'simplex_perturb','fft_wrt', 'hyper_cleaning' ] + & 'igr', 'down_sample', 'simplex_perturb','fft_wrt', 'hyper_cleaning',& + & 'particles_lagrange' ] call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #:endfor call MPI_BCAST(fluid_rho(1), num_fluids_max, mpi_p, 0, MPI_COMM_WORLD, ierr) @@ -137,6 +138,13 @@ contains #:endfor end do + ! 
Variables from input files for hardcoded patches + call MPI_BCAST(interface_file, len(interface_file), MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(normFac, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(normMag, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(g0_ic, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(p0_ic, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + ! Simplex noise and fluid physical parameters do i = 1, num_fluids_max #:for VAR in [ 'gamma','pi_inf', 'G', 'cv', 'qv', 'qvp' ] @@ -152,15 +160,6 @@ contains end do end do - ! Subgrid bubble parameters - if (bubbles_euler .or. bubbles_lagrange) then - #:for VAR in [ 'R0ref','p0ref','rho0ref','T0ref', & - 'ss','pv','vd','mu_l','mu_v','mu_g','gam_v','gam_g', & - 'M_v','M_g','k_v','k_g','cp_v','cp_g','R_v','R_g'] - call MPI_BCAST(bub_pp%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) - #:endfor - end if - do i = 1, 3 call MPI_BCAST(simplex_params%perturb_vel(i), 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(simplex_params%perturb_vel_freq(i), 1, mpi_p, 0, MPI_COMM_WORLD, ierr) @@ -170,7 +169,6 @@ contains call MPI_BCAST(simplex_params%perturb_vel_offset(i, j), 1, mpi_p, 0, MPI_COMM_WORLD, ierr) end do end do - #endif end subroutine s_mpi_bcast_user_inputs diff --git a/src/pre_process/m_start_up.fpp b/src/pre_process/m_start_up.fpp index 3b45547c29..4a48e392b2 100644 --- a/src/pre_process/m_start_up.fpp +++ b/src/pre_process/m_start_up.fpp @@ -145,8 +145,11 @@ contains elliptic_smoothing, elliptic_smoothing_iters, & viscous, bubbles_lagrange, num_bc_patches, & patch_bc, Bx0, relativity, cont_damage, igr, igr_order, & - down_sample, recon_type, muscl_order, hyper_cleaning, & - simplex_perturb, simplex_params, fft_wrt + down_sample, recon_type, muscl_order, fft_wrt, & + fd_order, lag_params, simplex_perturb, simplex_params, & + interface_file, normFac, normMag, & + g0_ic, p0_ic, hyper_cleaning, & + particles_lagrange, particle_pp ! 
Inquiring the status of the pre_process.inp file file_loc = 'pre_process.inp' @@ -714,6 +717,9 @@ contains if (bubbles_euler .or. bubbles_lagrange) then call s_initialize_bubbles_model() end if + if (particles_lagrange) then + call s_initialize_particles_model() + end if call s_initialize_mpi_common_module() call s_initialize_data_output_module() call s_initialize_variables_conversion_module() diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp index 0f17bd60c3..204a824b56 100644 --- a/src/simulation/m_bubbles.fpp +++ b/src/simulation/m_bubbles.fpp @@ -17,6 +17,8 @@ module m_bubbles use m_helper_basic !< Functions to compare floating point numbers + use m_bubbles_EL_kernels + implicit none real(wp) :: chi_vw !< Bubble wall properties (Ando 2010) @@ -40,7 +42,7 @@ contains !! @param f_bub_adv_src Source for bubble volume fraction !! @param f_divu Divergence of velocity !! @param fCson Speed of sound from fP (EL) - elemental function f_rddot(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson) + function f_rddot(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu @@ -84,7 +86,7 @@ contains !! @param fR Current bubble radius !! @param fV Current bubble velocity !! @param fpb Internal bubble pressure - elemental function f_cpbw(fR0, fR, fV, fpb) + function f_cpbw(fR0, fR, fV, fpb) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR0, fR, fV, fpb @@ -103,7 +105,7 @@ contains !! @param fCpinf Driving bubble pressure !! @param fntait Tait EOS parameter !! @param fBtait Tait EOS parameter - elemental function f_H(fCpbw, fCpinf, fntait, fBtait) + function f_H(fCpbw, fCpinf, fntait, fBtait) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpbw, fCpinf, fntait, fBtait @@ -123,7 +125,7 @@ contains !! 
@param fntait Tait EOS parameter !! @param fBtait Tait EOS parameter !! @param fH Bubble enthalpy - elemental function f_cgas(fCpinf, fntait, fBtait, fH) + function f_cgas(fCpinf, fntait, fBtait, fH) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpinf, fntait, fBtait, fH @@ -146,7 +148,7 @@ contains !! @param fBtait Tait EOS parameter !! @param advsrc Advection equation source term !! @param divu Divergence of velocity - elemental function f_cpinfdot(fRho, fP, falf, fntait, fBtait, advsrc, divu) + function f_cpinfdot(fRho, fP, falf, fntait, fBtait, advsrc, divu) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fRho, fP, falf, fntait, fBtait, advsrc, divu @@ -176,7 +178,7 @@ contains !! @param fV Current bubble velocity !! @param fR0 Equilibrium bubble radius !! @param fpbdot Time derivative of the internal bubble pressure - elemental function f_Hdot(fCpbw, fCpinf, fCpinf_dot, fntait, fBtait, fR, fV, fR0, fpbdot) + function f_Hdot(fCpbw, fCpinf, fCpinf_dot, fntait, fBtait, fR, fV, fR0, fpbdot) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpbw, fCpinf, fCpinf_dot, fntait, fBtait real(wp), intent(in) :: fR, fV, fR0, fpbdot @@ -211,7 +213,7 @@ contains !! @param fR Current bubble radius !! @param fV Current bubble velocity !! @param fCpbw Boundary wall pressure - elemental function f_rddot_RP(fCp, fRho, fR, fV, fCpbw) + function f_rddot_RP(fCp, fRho, fR, fV, fCpbw) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCp, fRho, fR, fV, fCpbw @@ -234,7 +236,7 @@ contains !! @param fcgas Current gas sound speed !! @param fntait Tait EOS parameter !! @param fBtait Tait EOS parameter - elemental function f_rddot_G(fCpbw, fR, fV, fH, fHdot, fcgas, fntait, fBtait) + function f_rddot_G(fCpbw, fR, fV, fH, fHdot, fcgas, fntait, fBtait) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpbw, fR, fV, fH, fHdot real(wp), intent(in) :: fcgas, fntait, fBtait @@ -257,7 +259,7 @@ contains !! @param fR Current bubble radius !! 
@param fV Current bubble velocity !! @param fpb Internal bubble pressure - elemental function f_cpbw_KM(fR0, fR, fV, fpb) + function f_cpbw_KM(fR0, fR, fV, fpb) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR0, fR, fV, fpb real(wp) :: f_cpbw_KM @@ -284,7 +286,7 @@ contains !! @param fV Current bubble velocity !! @param fR0 Equilibrium bubble radius !! @param fC Current sound speed - elemental function f_rddot_KM(fpbdot, fCp, fCpbw, fRho, fR, fV, fR0, fC) + function f_rddot_KM(fpbdot, fCp, fCpbw, fRho, fR, fV, fR0, fC) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fpbdot, fCp, fCpbw real(wp), intent(in) :: fRho, fR, fV, fR0, fC @@ -318,7 +320,7 @@ contains !> Subroutine that computes bubble wall properties for vapor bubbles !! @param pb_in Internal bubble pressure !! @param iR0 Current bubble size index - elemental subroutine s_bwproperty(pb_in, iR0, chi_vw_out, k_mw_out, rho_mw_out) + subroutine s_bwproperty(pb_in, iR0, chi_vw_out, k_mw_out, rho_mw_out) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: pb_in integer, intent(in) :: iR0 @@ -349,7 +351,7 @@ contains !! @param fbeta_c Mass transfer coefficient (EL) !! @param fR_m Mixture gas constant (EL) !! @param fgamma_m Mixture gamma (EL) - elemental subroutine s_vflux(fR, fV, fpb, fmass_v, iR0, vflux, fmass_g, fbeta_c, fR_m, fgamma_m) + subroutine s_vflux(fR, fV, fpb, fmass_v, iR0, vflux, fmass_g, fbeta_c, fR_m, fgamma_m) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR real(wp), intent(in) :: fV @@ -407,7 +409,8 @@ contains !! @param fbeta_t Mass transfer coefficient (EL) !! @param fR_m Mixture gas constant (EL) !! 
@param fgamma_m Mixture gamma (EL) - elemental function f_bpres_dot(fvflux, fR, fV, fpb, fmass_v, iR0, fbeta_t, fR_m, fgamma_m) + function f_bpres_dot(fvflux, fR, fV, fpb, fmass_v, iR0, fbeta_t, fR_m, fgamma_m) + !$DIR INLINENEVER f_bpres_dot $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fvflux real(wp), intent(in) :: fR @@ -463,27 +466,32 @@ contains !! @param fbeta_t Heat transfer coefficient (EL) !! @param fCson Speed of sound (EL) !! @param adap_dt_stop Fail-safe exit if max iteration count reached - subroutine s_advance_step(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, & - fntait, fBtait, f_bub_adv_src, f_divu, & - bub_id, fmass_v, fmass_g, fbeta_c, & - fbeta_t, fCson, adap_dt_stop) - $:GPU_ROUTINE(function_name='s_advance_step',parallelism='[seq]', & - & cray_inline=True) + function f_advance_step(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, & + fntait, fBtait, f_bub_adv_src, f_divu, & + bub_id, fmass_v, fmass_g, fbeta_c, & + fbeta_t, fCson, fRe, fPos, & + fVel, cell, q_prim_vf) result(adap_dt_stop) + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(inout) :: fR, fV, fpb, fmass_v real(wp), intent(in) :: fRho, fP, fR0, fpbdot, alf real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu integer, intent(in) :: bub_id real(wp), intent(in) :: fmass_g, fbeta_c, fbeta_t, fCson - integer, intent(inout) :: adap_dt_stop + real(wp), intent(inout), dimension(3), optional :: fPos, fVel + real(wp), intent(in), optional :: fRe + integer, intent(in), dimension(3), optional :: cell + type(scalar_field), intent(in), dimension(sys_size), optional :: q_prim_vf real(wp), dimension(5) :: err !< Error estimates for adaptive time stepping real(wp) :: t_new !< Updated time step size real(wp) :: h0, h !< Time step size real(wp), dimension(4) :: myR_tmp1, myV_tmp1, myR_tmp2, myV_tmp2 !< Bubble radius, radial velocity, and radial acceleration for the inner loop real(wp), dimension(4) :: myPb_tmp1, myMv_tmp1, myPb_tmp2, myMv_tmp2 !< Gas pressure and vapor mass for the inner 
loop (EL) - real(wp) :: fR2, fV2, fpb2, fmass_v2 - integer :: iter_count + real(wp) :: fR2, fV2, fpb2, fmass_v2, f_bTemp + real(wp), dimension(3) :: vTemp, aTemp + integer :: adap_dt_stop + integer :: l, iter_count call s_initial_substep_h(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, & fntait, fBtait, f_bub_adv_src, f_divu, fCson, h0) @@ -565,6 +573,42 @@ contains ! Update pb and mass_v fpb = myPb_tmp1(4) fmass_v = myMv_tmp1(4) + + select case (lag_vel_model) + case (1) + do l = 1, num_dims + vTemp(l) = f_interpolate_velocity(fR, cell, l, q_prim_vf) + end do + do l = 1, num_dims + fVel(l) = vTemp(l) + fPos(l) = fPos(l) + h*vTemp(l) + end do + case (2) + do l = 1, num_dims + f_bTemp = f_get_bubble_force(fPos(l), fR, fV, fVel(l), fmass_g, fmass_v, & + fRe, fRho, cell, l, q_prim_vf) + aTemp(l) = f_bTemp/(fmass_g + fmass_v) + end do + do l = 1, num_dims + fVel(l) = fVel(l) + h*aTemp(l) + fPos(l) = fPos(l) + h*fVel(l) + end do + case (3) + do l = 1, num_dims + f_bTemp = f_get_bubble_force(fPos(l), fR, fV, fVel(l), fmass_g, fmass_v, & + fRe, fRho, cell, l, q_prim_vf) + aTemp(l) = 2._wp*f_bTemp/(fmass_g + fmass_v) - 3*fV*fVel(l)/fR + end do + do l = 1, num_dims + fVel(l) = fVel(l) + h*aTemp(l) + fPos(l) = fPos(l) + h*fVel(l) + end do + case default + do l = 1, num_dims + fVel(l) = fVel(l) + fPos(l) = fPos(l) + end do + end select end if ! Update step size for the next sub-step @@ -588,7 +632,7 @@ contains if (iter_count >= adap_dt_max_iters) adap_dt_stop = 1 - end subroutine s_advance_step + end function f_advance_step !> Choose the initial time step size for the adaptive time stepping routine !! (See Heirer, E. Hairer S.P.Nørsett G. 
Wanner, Solving Ordinary @@ -613,10 +657,10 @@ contains $:GPU_ROUTINE(function_name='s_initial_substep_h',parallelism='[seq]', & & cray_inline=True) - real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf - real(wp), intent(IN) :: fntait, fBtait, f_bub_adv_src, f_divu - real(wp), intent(IN) :: fCson - real(wp), intent(OUT) :: h + real(wp), intent(in) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf + real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu + real(wp), intent(in) :: fCson + real(wp), intent(out) :: h real(wp), dimension(2) :: h_size !< Time step size (h0, h1) real(wp), dimension(3) :: d_norms !< norms (d_0, d_1, d_2) @@ -697,12 +741,12 @@ contains $:GPU_ROUTINE(function_name='s_advance_substep',parallelism='[seq]', & & cray_inline=True) - real(wp), intent(OUT) :: err - real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf - real(wp), intent(IN) :: fntait, fBtait, f_bub_adv_src, f_divu, h - integer, intent(IN) :: bub_id - real(wp), intent(IN) :: fmass_v, fmass_g, fbeta_c, fbeta_t, fCson - real(wp), dimension(4), intent(OUT) :: myR_tmp, myV_tmp, myPb_tmp, myMv_tmp + real(wp), intent(out) :: err + real(wp), intent(in) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf + real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu, h + integer, intent(in) :: bub_id + real(wp), intent(in) :: fmass_v, fmass_g, fbeta_c, fbeta_t, fCson + real(wp), dimension(4), intent(out) :: myR_tmp, myV_tmp, myPb_tmp, myMv_tmp real(wp), dimension(4) :: myA_tmp, mydPbdt_tmp, mydMvdt_tmp real(wp) :: err_R, err_V @@ -802,14 +846,15 @@ contains !! @param fbeta_c Mass transfer coefficient !! @param fbeta_t Heat transfer coefficient !! @param fdPbdt_tmp Rate of change of the internal bubble pressure + !! @param fdMvdt_tmp Rate of change of the mass of vapor in the bubble !! 
@param advance_EL Rate of change of the mass of vapor in the bubble - elemental subroutine s_advance_EL(fR_tmp, fV_tmp, fPb_tmp, fMv_tmp, bub_id, & - fmass_g, fbeta_c, fbeta_t, fdPbdt_tmp, advance_EL) + subroutine s_advance_EL(fR_tmp, fV_tmp, fPb_tmp, fMv_tmp, bub_id, & + fmass_g, fbeta_c, fbeta_t, fdPbdt_tmp, advance_EL) $:GPU_ROUTINE(parallelism='[seq]') - real(wp), intent(IN) :: fR_tmp, fV_tmp, fPb_tmp, fMv_tmp - real(wp), intent(IN) :: fmass_g, fbeta_c, fbeta_t - integer, intent(IN) :: bub_id - real(wp), intent(INOUT) :: fdPbdt_tmp + real(wp), intent(in) :: fR_tmp, fV_tmp, fPb_tmp, fMv_tmp + real(wp), intent(in) :: fmass_g, fbeta_c, fbeta_t + integer, intent(in) :: bub_id + real(wp), intent(inout) :: fdPbdt_tmp real(wp), intent(out) :: advance_EL real(wp) :: fVapFlux, myR_m, mygamma_m diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp index dad03d3f87..4674a36338 100644 --- a/src/simulation/m_bubbles_EE.fpp +++ b/src/simulation/m_bubbles_EE.fpp @@ -183,7 +183,7 @@ contains integer :: i, j, k, l, q, ii !< Loop variables - integer :: adap_dt_stop_max, adap_dt_stop !< Fail-safe exit if max iteration count reached + integer :: adap_dt_stop_sum, adap_dt_stop !< Fail-safe exit if max iteration count reached integer :: dmBub_id !< Dummy variables for unified subgrid bubble subroutines real(wp) :: dmMass_v, dmMass_n, dmBeta_c, dmBeta_t, dmCson @@ -205,10 +205,9 @@ contains end do $:END_GPU_PARALLEL_LOOP() - adap_dt_stop_max = 0 + adap_dt_stop_sum = 0 $:GPU_PARALLEL_LOOP(private='[j,k,l,Rtmp, Vtmp, myalpha_rho, myalpha, myR, myV, alf, myP, myRho, R2Vav, R3, nbub, pb_local, mv_local, vflux, pbdot, rddot, n_tait, B_tait, my_divu]', collapse=3, & - & reduction='[[adap_dt_stop_max]]', reductionOp='[MAX]', & - & copy='[adap_dt_stop_max]') + & copy='[adap_dt_stop_sum]') do l = 0, p do k = 0, n do j = 0, m @@ -298,21 +297,20 @@ contains pb_local = 0._wp; mv_local = 0._wp; vflux = 0._wp; pbdot = 0._wp end if + adap_dt_stop = 0 + ! 
Adaptive time stepping if (adap_dt) then - adap_dt_stop = 0 - call s_advance_step(myRho, myP, myR, myV, R0(q), & - pb_local, pbdot, alf, n_tait, B_tait, & - bub_adv_src(j, k, l), divu_in%sf(j, k, l), & - dmBub_id, dmMass_v, dmMass_n, dmBeta_c, & - dmBeta_t, dmCson, adap_dt_stop) + adap_dt_stop = f_advance_step(myRho, myP, myR, myV, R0(q), & + pb_local, pbdot, alf, n_tait, B_tait, & + bub_adv_src(j, k, l), divu_in%sf(j, k, l), & + dmBub_id, dmMass_v, dmMass_n, dmBeta_c, & + dmBeta_t, dmCson) q_cons_vf(rs(q))%sf(j, k, l) = nbub*myR q_cons_vf(vs(q))%sf(j, k, l) = nbub*myV - adap_dt_stop_max = max(adap_dt_stop_max, adap_dt_stop) - else rddot = f_rddot(myRho, myP, myR, myV, R0(q), & pb_local, pbdot, alf, n_tait, B_tait, & @@ -321,6 +319,9 @@ contains bub_v_src(j, k, l, q) = nbub*rddot bub_r_src(j, k, l, q) = q_cons_vf(vs(q))%sf(j, k, l) end if + + $:GPU_ATOMIC(atomic='update') + adap_dt_stop_sum = adap_dt_stop_sum + adap_dt_stop end if end do end do @@ -328,7 +329,7 @@ contains end do $:END_GPU_PARALLEL_LOOP() - if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.") + if (adap_dt .and. adap_dt_stop_sum > 0) call s_mpi_abort("Adaptive time stepping failed to converge.") if (.not. 
adap_dt) then $:GPU_PARALLEL_LOOP(private='[i,k,l,q]', collapse=3) diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp index 4ae590a4b8..b8f94fc074 100644 --- a/src/simulation/m_bubbles_EL.fpp +++ b/src/simulation/m_bubbles_EL.fpp @@ -27,6 +27,12 @@ module m_bubbles_EL use m_helper + use m_mpi_common + + use m_ibm + + use m_finite_differences + implicit none !(nBub) @@ -67,21 +73,31 @@ module m_bubbles_EL $:GPU_DECLARE(create='[lag_num_ts]') - integer :: nBubs !< Number of bubbles in the local domain - real(wp) :: Rmax_glb, Rmin_glb !< Maximum and minimum bubbe size in the local domain + real(wp) :: Rmax_glb, Rmin_glb !< Maximum and minimum bubble size in the local domain !< Projection of the lagrangian particles in the Eulerian framework type(scalar_field), dimension(:), allocatable :: q_beta - integer :: q_beta_idx !< Size of the q_beta vector field + type(scalar_field), dimension(:), allocatable :: kahan_comp !< Kahan compensation for q_beta accumulation + integer :: q_beta_idx !< Size of the q_beta vector field - $:GPU_DECLARE(create='[nBubs,Rmax_glb,Rmin_glb,q_beta,q_beta_idx]') + $:GPU_DECLARE(create='[Rmax_glb,Rmin_glb,q_beta,kahan_comp,q_beta_idx]') + + integer, parameter :: LAG_EVOL_ID = 11 ! File id for lag_bubbles_evol_*.dat + integer, parameter :: LAG_STATS_ID = 12 ! File id for stats_lag_bubbles_*.dat + integer, parameter :: LAG_VOID_ID = 13 ! File id for voidfraction.dat + + integer, allocatable, dimension(:) :: keep_bubble + integer, allocatable, dimension(:, :) :: wrap_bubble_loc, wrap_bubble_dir + $:GPU_DECLARE(create='[keep_bubble]') + $:GPU_DECLARE(create='[wrap_bubble_loc, wrap_bubble_dir]') contains !> Initializes the lagrangian subgrid bubble solver !! 
@param q_cons_vf Initial conservative variables - impure subroutine s_initialize_bubbles_EL_module(q_cons_vf) + impure subroutine s_initialize_bubbles_EL_module(q_cons_vf, bc_type) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf + type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type integer :: nBubs_glb, i @@ -103,24 +119,39 @@ contains call s_mpi_abort('Please check the lag_params%solver_approach input') end if + pcomm_coords(1)%beg = x_cb(-1) + pcomm_coords(1)%end = x_cb(m) + $:GPU_UPDATE(device='[pcomm_coords(1)]') + if (n > 0) then + pcomm_coords(2)%beg = y_cb(-1) + pcomm_coords(2)%end = y_cb(n) + $:GPU_UPDATE(device='[pcomm_coords(2)]') + if (p > 0) then + pcomm_coords(3)%beg = z_cb(-1) + pcomm_coords(3)%end = z_cb(p) + $:GPU_UPDATE(device='[pcomm_coords(3)]') + end if + end if + $:GPU_UPDATE(device='[lag_num_ts, q_beta_idx]') @:ALLOCATE(q_beta(1:q_beta_idx)) + @:ALLOCATE(kahan_comp(1:q_beta_idx)) do i = 1, q_beta_idx @:ALLOCATE(q_beta(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) - end do - - do i = 1, q_beta_idx @:ACC_SETUP_SFs(q_beta(i)) + @:ALLOCATE(kahan_comp(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:ACC_SETUP_SFs(kahan_comp(i)) end do ! 
Allocating space for lagrangian variables nBubs_glb = lag_params%nBubs_glb - @:ALLOCATE(lag_id(1:nBubs_glb, 1:2)) @:ALLOCATE(bub_R0(1:nBubs_glb)) @:ALLOCATE(Rmax_stats(1:nBubs_glb)) @:ALLOCATE(Rmin_stats(1:nBubs_glb)) @@ -128,6 +159,7 @@ contains @:ALLOCATE(gas_betaT(1:nBubs_glb)) @:ALLOCATE(gas_betaC(1:nBubs_glb)) @:ALLOCATE(bub_dphidt(1:nBubs_glb)) + @:ALLOCATE(lag_id(1:nBubs_glb, 1:2)) @:ALLOCATE(gas_p(1:nBubs_glb, 1:2)) @:ALLOCATE(gas_mv(1:nBubs_glb, 1:2)) @:ALLOCATE(intfc_rad(1:nBubs_glb, 1:2)) @@ -143,24 +175,73 @@ contains @:ALLOCATE(mtn_dposdt(1:nBubs_glb, 1:3, 1:lag_num_ts)) @:ALLOCATE(mtn_dveldt(1:nBubs_glb, 1:3, 1:lag_num_ts)) + @:ALLOCATE(keep_bubble(1:nBubs_glb)) + @:ALLOCATE(wrap_bubble_loc(1:nBubs_glb, 1:num_dims), wrap_bubble_dir(1:nBubs_glb, 1:num_dims)) + if (adap_dt .and. f_is_default(adap_dt_tol)) adap_dt_tol = dflt_adap_dt_tol + if (num_procs > 1) call s_initialize_particles_mpi(lag_num_ts) + ! Starting bubbles - call s_read_input_bubbles(q_cons_vf) + if (lag_params%write_void_evol) call s_open_void_evol + if (lag_params%write_bubbles) call s_open_lag_bubble_evol() + if (lag_params%write_bubbles_stats) call s_open_lag_bubble_stats() + + if (lag_params%vel_model > 0) then + moving_lag_bubbles = .true. + lag_pressure_force = lag_params%pressure_force + lag_gravity_force = lag_params%gravity_force + lag_vel_model = lag_params%vel_model + lag_drag_model = lag_params%drag_model + end if + $:GPU_UPDATE(device='[moving_lag_bubbles, lag_pressure_force, & + & lag_gravity_force, lag_vel_model, lag_drag_model]') + + ! Allocate cell-centered pressure gradient arrays and FD coefficients + if (lag_params%vel_model > 0 .and. 
lag_params%pressure_force) then + @:ALLOCATE(grad_p_x(0:m, 0:n, 0:p)) + @:ALLOCATE(fd_coeff_x_pgrad(-fd_number:fd_number, 0:m)) + call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_pgrad, & + buff_size, fd_number, fd_order) + $:GPU_UPDATE(device='[fd_coeff_x_pgrad]') + if (n > 0) then + @:ALLOCATE(grad_p_y(0:m, 0:n, 0:p)) + @:ALLOCATE(fd_coeff_y_pgrad(-fd_number:fd_number, 0:n)) + call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_pgrad, & + buff_size, fd_number, fd_order) + $:GPU_UPDATE(device='[fd_coeff_y_pgrad]') + end if + if (p > 0) then + @:ALLOCATE(grad_p_z(0:m, 0:n, 0:p)) + @:ALLOCATE(fd_coeff_z_pgrad(-fd_number:fd_number, 0:p)) + call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_pgrad, & + buff_size, fd_number, fd_order) + $:GPU_UPDATE(device='[fd_coeff_z_pgrad]') + end if + end if + + ! Allocate cell list arrays for atomic-free Gaussian smearing + @:ALLOCATE(cell_list_start(0:m, 0:n, 0:p)) + @:ALLOCATE(cell_list_count(0:m, 0:n, 0:p)) + @:ALLOCATE(cell_list_idx(1:lag_params%nBubs_glb)) + + call s_read_input_bubbles(q_cons_vf, bc_type) end subroutine s_initialize_bubbles_EL_module !> The purpose of this procedure is to obtain the initial bubbles' information !! @param q_cons_vf Conservative variables - impure subroutine s_read_input_bubbles(q_cons_vf) + impure subroutine s_read_input_bubbles(q_cons_vf, bc_type) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf + type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type real(wp), dimension(8) :: inputBubble real(wp) :: qtime integer :: id, bub_id, save_count integer :: i, ios logical :: file_exist, indomain + integer, dimension(3) :: cell character(LEN=path_len + 2*name_len) :: path_D_dir !< @@ -179,9 +260,9 @@ contains if (save_count == 0) then if (proc_rank == 0) print *, 'Reading lagrange bubbles input file.' 
- inquire (file='input/lag_bubbles.dat', exist=file_exist) + call my_inquire(trim(lag_params%input_path), file_exist) if (file_exist) then - open (94, file='input/lag_bubbles.dat', form='formatted', iostat=ios) + open (94, file=trim(lag_params%input_path), form='formatted', iostat=ios) do while (ios == 0) read (94, *, iostat=ios) (inputBubble(i), i=1, 8) if (ios /= 0) cycle @@ -195,12 +276,12 @@ contains call s_add_bubbles(inputBubble, q_cons_vf, bub_id) lag_id(bub_id, 1) = id !global ID lag_id(bub_id, 2) = bub_id !local ID - nBubs = bub_id ! local number of bubbles + n_el_bubs_loc = bub_id ! local number of bubbles end if end do close (94) else - call s_mpi_abort("Initialize the lagrange bubbles in input/lag_bubbles.dat") + call s_mpi_abort("Initialize the lagrange bubbles in "//trim(lag_params%input_path)) end if else if (proc_rank == 0) print *, 'Restarting lagrange bubbles at save_count: ', save_count @@ -209,13 +290,23 @@ contains print *, " Lagrange bubbles running, in proc", proc_rank, "number:", bub_id, "/", id + if (num_procs > 1) then + call s_mpi_reduce_int_sum(n_el_bubs_loc, n_el_bubs_glb) + else + n_el_bubs_glb = n_el_bubs_loc + end if + + if (proc_rank == 0) then + if (n_el_bubs_glb == 0) call s_mpi_abort('No bubbles in the domain. 
Check '//trim(lag_params%input_path)) + end if + $:GPU_UPDATE(device='[bubbles_lagrange, lag_params]') $:GPU_UPDATE(device='[lag_id,bub_R0,Rmax_stats,Rmin_stats,gas_mg, & & gas_betaT,gas_betaC,bub_dphidt,gas_p,gas_mv, & & intfc_rad,intfc_vel,mtn_pos,mtn_posPrev,mtn_vel, & & mtn_s,intfc_draddt,intfc_dveldt,gas_dpdt,gas_dmvdt, & - & mtn_dposdt,mtn_dveldt,nBubs]') + & mtn_dposdt,mtn_dveldt,n_el_bubs_loc]') Rmax_glb = min(dflt_real, -dflt_real) Rmin_glb = max(dflt_real, -dflt_real) @@ -225,19 +316,22 @@ contains !Populate temporal variables call s_transfer_data_to_tmp() - call s_smear_voidfraction() - - if (lag_params%write_bubbles) call s_write_lag_particles(qtime) + call s_smear_voidfraction(bc_type) if (save_count == 0) then ! Create ./D directory - write (path_D_dir, '(A,I0,A,I0)') trim(case_dir)//'/D' - call my_inquire(path_D_dir, file_exist) - if (.not. file_exist) call s_create_directory(trim(path_D_dir)) + if (proc_rank == 0) then + write (path_D_dir, '(A,I0,A,I0)') trim(case_dir)//'/D' + call my_inquire(trim(path_D_dir), file_exist) + if (.not. file_exist) call s_create_directory(trim(path_D_dir)) + end if + call s_mpi_barrier() call s_write_restart_lag_bubbles(save_count) ! Needed for post_processing - call s_write_void_evol(qtime) + if (lag_params%write_void_evol) call s_write_void_evol(qtime) end if + if (lag_params%write_bubbles) call s_write_lag_bubble_evol(qtime) + end subroutine s_read_input_bubbles !> The purpose of this procedure is to obtain the information of the bubbles when starting fresh @@ -280,7 +374,7 @@ contains mtn_posPrev(bub_id, 1:3, 1) = mtn_pos(bub_id, 1:3, 1) end if - cell = -buff_size + cell = fd_number - buff_size call s_locate_cell(mtn_pos(bub_id, 1:3, 1), cell, mtn_s(bub_id, 1:3, 1)) ! 
Check if the bubble is located in the ghost cell of a symmetric, or wall boundary @@ -466,7 +560,7 @@ contains call MPI_FILE_CLOSE(ifile, ierr) call MPI_TYPE_FREE(view, ierr) - nBubs = bub_id + n_el_bubs_loc = bub_id do i = 1, bub_id lag_id(i, 1) = int(MPI_IO_DATA_lag_bubbles(i, 1)) @@ -491,7 +585,7 @@ contains deallocate (MPI_IO_DATA_lag_bubbles) else - nBubs = 0 + n_el_bubs_loc = 0 call MPI_TYPE_CONTIGUOUS(0, mpi_p, view, ierr) call MPI_TYPE_COMMIT(view, ierr) @@ -523,9 +617,9 @@ contains !> Contains the bubble dynamics subroutines. !! @param q_prim_vf Primitive variables !! @param stage Current stage in the time-stepper algorithm - subroutine s_compute_bubble_EL_dynamics(q_prim_vf, stage) - + subroutine s_compute_bubble_EL_dynamics(q_prim_vf, bc_type, stage) type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf + type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type integer, intent(in) :: stage real(wp) :: myVapFlux @@ -533,7 +627,8 @@ contains real(wp) :: myR_m, mygamma_m, myPb, myMass_n, myMass_v real(wp) :: myR, myV, myBeta_c, myBeta_t, myR0, myPbdot, myMvdot real(wp) :: myPinf, aux1, aux2, myCson, myRho - real(wp) :: gamma, pi_inf, qv + real(wp), dimension(3) :: myPos, myVel + real(wp) :: gamma, pi_inf, qv, f_b, myRe #:if not MFC_CASE_OPTIMIZATION and USING_AMD real(wp), dimension(3) :: myalpha_rho, myalpha #:else @@ -542,18 +637,17 @@ contains real(wp), dimension(2) :: Re integer, dimension(3) :: cell - integer :: adap_dt_stop_max, adap_dt_stop !< Fail-safe exit if max iteration count reached + integer :: adap_dt_stop_sum, adap_dt_stop !< Fail-safe exit if max iteration count reached real(wp) :: dmalf, dmntait, dmBtait, dm_bub_adv_src, dm_divu !< Dummy variables for unified subgrid bubble subroutines - integer :: i, k, l - - call nvtxStartRange("LAGRANGE-BUBBLE-DYNAMICS") + integer :: k, l ! Subgrid p_inf model based on Maeda and Colonius (2018). 
if (lag_params%pressure_corrector) then + call nvtxStartRange("LAGRANGE-BUBBLE-PINF-CORRECTION") ! Calculate velocity potentials (valid for one bubble per cell) $:GPU_PARALLEL_LOOP(private='[k,cell,paux,preterm1,term2,Romega,myR0,myR,myV,myPb,pint,term1_fac]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc call s_get_pinf(k, q_prim_vf, 2, paux, cell, preterm1, term2, Romega) myR0 = bub_R0(k) myR = intfc_rad(k, 2) @@ -570,14 +664,22 @@ contains end if end do $:END_GPU_PARALLEL_LOOP() + call nvtxEndRange() end if + ! Precompute cell-centered pressure gradients for translational motion + if (moving_lag_bubbles .and. lag_pressure_force) then + call nvtxStartRange("LAGRANGE-BUBBLE-PRESSURE-GRADIENT") + call s_compute_pressure_gradients(q_prim_vf) + call nvtxEndRange() + end if + + call nvtxStartRange("LAGRANGE-BUBBLE-DYNAMICS") ! Radial motion model - adap_dt_stop_max = 0 - $:GPU_PARALLEL_LOOP(private='[k,i,myalpha_rho,myalpha,Re,cell,myVapFlux,preterm1, term2, paux, pint, Romega, term1_fac,myR_m, mygamma_m, myPb, myMass_n, myMass_v,myR, myV, myBeta_c, myBeta_t, myR0, myPbdot, myMvdot,myPinf, aux1, aux2, myCson, myRho,gamma,pi_inf,qv,dmalf, dmntait, dmBtait, dm_bub_adv_src, dm_divu,adap_dt_stop]', & - & reduction='[[adap_dt_stop_max]]',reductionOp='[MAX]', & - & copy='[adap_dt_stop_max]',copyin='[stage]') - do k = 1, nBubs + adap_dt_stop_sum = 0 + $:GPU_PARALLEL_LOOP(private='[k,myalpha_rho,myalpha,Re,cell,myVapFlux,preterm1, term2, paux, pint, Romega,term1_fac,myR_m, mygamma_m, myPb, myMass_n, myMass_v,myR, myV, myBeta_c, myBeta_t, myR0, myPbdot, myMvdot,myPinf, aux1,aux2, myCson, myRho,gamma,pi_inf,qv,dmalf, dmntait, dmBtait, dm_bub_adv_src, dm_divu,adap_dt_stop,myPos,myVel]', & + & copy='[adap_dt_stop_sum]',copyin='[stage]') + do k = 1, n_el_bubs_loc ! Keller-Miksis model ! Current bubble state @@ -589,6 +691,8 @@ contains myBeta_c = gas_betaC(k) myBeta_t = gas_betaT(k) myR0 = bub_R0(k) + myPos = mtn_pos(k, :, 2) + myVel = mtn_vel(k, :, 2) ! 
Vapor and heat fluxes call s_vflux(myR, myV, myPb, myMass_v, k, myVapFlux, myMass_n, myBeta_c, myR_m, mygamma_m) @@ -609,48 +713,77 @@ contains if (adap_dt) then - call s_advance_step(myRho, myPinf, myR, myV, myR0, myPb, myPbdot, dmalf, & - dmntait, dmBtait, dm_bub_adv_src, dm_divu, & - k, myMass_v, myMass_n, myBeta_c, & - myBeta_t, myCson, adap_dt_stop) + mtn_posPrev(k, :, 1) = myPos + + myRe = Re(1) + adap_dt_stop = f_advance_step(myRho, myPinf, myR, myV, myR0, myPb, myPbdot, dmalf, & + dmntait, dmBtait, dm_bub_adv_src, dm_divu, & + k, myMass_v, myMass_n, myBeta_c, & + myBeta_t, myCson, myRe, & + myPos, myVel, cell, q_prim_vf) ! Update bubble state intfc_rad(k, 1) = myR intfc_vel(k, 1) = myV gas_p(k, 1) = myPb gas_mv(k, 1) = myMass_v + mtn_pos(k, :, 1) = myPos + mtn_vel(k, :, 1) = myVel else - ! Radial acceleration from bubble models intfc_dveldt(k, stage) = f_rddot(myRho, myPinf, myR, myV, myR0, & myPb, myPbdot, dmalf, dmntait, dmBtait, & dm_bub_adv_src, dm_divu, & myCson) + intfc_draddt(k, stage) = myV gas_dmvdt(k, stage) = myMvdot gas_dpdt(k, stage) = myPbdot + if (moving_lag_bubbles) then + do l = 1, num_dims + select case (lag_vel_model) + case (1) + mtn_dposdt(k, l, stage) = f_interpolate_velocity(myPos(l), & + cell, l, q_prim_vf) + mtn_dveldt(k, l, stage) = 0._wp + case (2) + mtn_dposdt(k, l, stage) = myVel(l) + f_b = f_get_bubble_force(myPos(l), & + myR, myV, myVel(l), & + myMass_n, myMass_v, & + Re(1), myRho, cell, l, q_prim_vf) + mtn_dveldt(k, l, stage) = f_b/(myMass_n + myMass_v) + case (3) + mtn_dposdt(k, l, stage) = myVel(l) + f_b = f_get_bubble_force(myPos(l), & + myR, myV, myVel(l), & + myMass_n, myMass_v, & + Re(1), myRho, cell, l, q_prim_vf) + mtn_dveldt(k, l, stage) = 2._wp*f_b/(myMass_n + myMass_v) - 3._wp*myV*myVel(l)/myR + case default + mtn_dposdt(k, l, stage) = 0._wp + mtn_dveldt(k, l, stage) = 0._wp + end select + end do + end if end if - adap_dt_stop_max = max(adap_dt_stop_max, adap_dt_stop) + $:GPU_ATOMIC(atomic='update') + 
adap_dt_stop_sum = adap_dt_stop_sum + adap_dt_stop end do $:END_GPU_PARALLEL_LOOP() + call nvtxEndRange - if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.") - - ! Bubbles remain in a fixed position - $:GPU_PARALLEL_LOOP(collapse=2, private='[k,l]', copyin='[stage]') - do k = 1, nBubs - do l = 1, 3 - mtn_dposdt(k, l, stage) = 0._wp - mtn_dveldt(k, l, stage) = 0._wp - end do - end do - $:END_GPU_PARALLEL_LOOP() + if (adap_dt .and. adap_dt_stop_sum > 0) call s_mpi_abort("Adaptive time stepping failed to converge.") - call nvtxEndRange + if (adap_dt) then + call s_transfer_data_to_tmp() + if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) + end if end subroutine s_compute_bubble_EL_dynamics @@ -667,96 +800,91 @@ contains integer :: i, j, k, l - if (.not. adap_dt) call s_smear_voidfraction() - - if (lag_params%solver_approach == 2) then - - ! (q / (1 - beta)) * d(beta)/dt source - if (p == 0) then - $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4) - do k = 0, p - do j = 0, n - do i = 0, m - do l = 1, E_idx - if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then - rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + & - q_cons_vf(l)%sf(i, j, k)*(q_beta(2)%sf(i, j, k) + & - q_beta(5)%sf(i, j, k)) - - end if - end do + call nvtxStartRange("LAGRANGE-BUBBLE-EL-SOURCE") + ! 
(q / (1 - beta)) * d(beta)/dt source + if (lag_params%cluster_type >= 4) then + $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4) + do k = idwint(3)%beg, idwint(3)%end + do j = idwint(2)%beg, idwint(2)%end + do i = idwint(1)%beg, idwint(1)%end + do l = 1, E_idx + if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + & + q_cons_vf(l)%sf(i, j, k)*(q_beta(2)%sf(i, j, k) + & + q_beta(5)%sf(i, j, k)) + end if end do end do end do - $:END_GPU_PARALLEL_LOOP() - else - $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4) - do k = 0, p - do j = 0, n - do i = 0, m - do l = 1, E_idx - if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then - rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + & - q_cons_vf(l)%sf(i, j, k)/q_beta(1)%sf(i, j, k)* & - q_beta(2)%sf(i, j, k) - end if - end do + end do + $:END_GPU_PARALLEL_LOOP() + else + $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4) + do k = idwint(3)%beg, idwint(3)%end + do j = idwint(2)%beg, idwint(2)%end + do i = idwint(1)%beg, idwint(1)%end + do l = 1, E_idx + if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + & + (q_cons_vf(l)%sf(i, j, k)/q_beta(1)%sf(i, j, k))* & + q_beta(2)%sf(i, j, k) + end if end do end do end do - $:END_GPU_PARALLEL_LOOP() - end if + end do + $:END_GPU_PARALLEL_LOOP() + end if - do l = 1, num_dims + do l = 1, num_dims - call s_gradient_dir(q_prim_vf(E_idx)%sf, q_beta(3)%sf, l) + call s_gradient_dir(q_prim_vf(E_idx)%sf, q_beta(3)%sf, l) - ! (q / (1 - beta)) * d(beta)/dt source - $:GPU_PARALLEL_LOOP(private='[i,j,k]', collapse=3) - do k = 0, p - do j = 0, n - do i = 0, m - if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then - rhs_vf(contxe + l)%sf(i, j, k) = rhs_vf(contxe + l)%sf(i, j, k) - & - (1._wp - q_beta(1)%sf(i, j, k))/ & - q_beta(1)%sf(i, j, k)* & - q_beta(3)%sf(i, j, k) - end if - end do + ! 
(q / (1 - beta)) * d(beta)/dt source + $:GPU_PARALLEL_LOOP(private='[i,j,k]', collapse=3) + do k = idwint(3)%beg, idwint(3)%end + do j = idwint(2)%beg, idwint(2)%end + do i = idwint(1)%beg, idwint(1)%end + if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + rhs_vf(contxe + l)%sf(i, j, k) = rhs_vf(contxe + l)%sf(i, j, k) - & + (1._wp - q_beta(1)%sf(i, j, k))/ & + q_beta(1)%sf(i, j, k)* & + q_beta(3)%sf(i, j, k) + end if end do end do - $:END_GPU_PARALLEL_LOOP() + end do + $:END_GPU_PARALLEL_LOOP() - !source in energy - $:GPU_PARALLEL_LOOP(private='[i,j,k]', collapse=3) - do k = idwbuff(3)%beg, idwbuff(3)%end - do j = idwbuff(2)%beg, idwbuff(2)%end - do i = idwbuff(1)%beg, idwbuff(1)%end - q_beta(3)%sf(i, j, k) = q_prim_vf(E_idx)%sf(i, j, k)*q_prim_vf(contxe + l)%sf(i, j, k) - end do + !source in energy + $:GPU_PARALLEL_LOOP(private='[i,j,k]', collapse=3) + do k = idwbuff(3)%beg, idwbuff(3)%end + do j = idwbuff(2)%beg, idwbuff(2)%end + do i = idwbuff(1)%beg, idwbuff(1)%end + q_beta(3)%sf(i, j, k) = q_prim_vf(E_idx)%sf(i, j, k)*q_prim_vf(contxe + l)%sf(i, j, k) end do end do - $:END_GPU_PARALLEL_LOOP() + end do + $:END_GPU_PARALLEL_LOOP() - call s_gradient_dir(q_beta(3)%sf, q_beta(4)%sf, l) + call s_gradient_dir(q_beta(3)%sf, q_beta(4)%sf, l) - ! (beta / (1 - beta)) * d(Pu)/dl source - $:GPU_PARALLEL_LOOP(private='[i,j,k]', collapse=3) - do k = 0, p - do j = 0, n - do i = 0, m - if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then - rhs_vf(E_idx)%sf(i, j, k) = rhs_vf(E_idx)%sf(i, j, k) - & - q_beta(4)%sf(i, j, k)*(1._wp - q_beta(1)%sf(i, j, k))/ & - q_beta(1)%sf(i, j, k) - end if - end do + ! 
(beta / (1 - beta)) * d(Pu)/dl source + $:GPU_PARALLEL_LOOP(private='[i,j,k]', collapse=3) + do k = idwint(3)%beg, idwint(3)%end + do j = idwint(2)%beg, idwint(2)%end + do i = idwint(1)%beg, idwint(1)%end + if (q_beta(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + rhs_vf(E_idx)%sf(i, j, k) = rhs_vf(E_idx)%sf(i, j, k) - & + q_beta(4)%sf(i, j, k)*(1._wp - q_beta(1)%sf(i, j, k))/ & + q_beta(1)%sf(i, j, k) + end if end do end do - $:END_GPU_PARALLEL_LOOP() end do - - end if + $:END_GPU_PARALLEL_LOOP() + end do + call nvtxEndRange ! LAGRANGE-BUBBLE-EL-SOURCE end subroutine s_compute_bubbles_EL_source @@ -797,26 +925,38 @@ contains end subroutine s_compute_cson_from_pinf !> The purpose of this subroutine is to smear the effect of the bubbles in the Eulerian framework - subroutine s_smear_voidfraction() + subroutine s_smear_voidfraction(bc_type) + type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type integer :: i, j, k, l - call nvtxStartRange("BUBBLES-LAGRANGE-KERNELS") - + call nvtxStartRange("BUBBLES-LAGRANGE-SMEARING") $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4) do i = 1, q_beta_idx do l = idwbuff(3)%beg, idwbuff(3)%end do k = idwbuff(2)%beg, idwbuff(2)%end do j = idwbuff(1)%beg, idwbuff(1)%end q_beta(i)%sf(j, k, l) = 0._wp + kahan_comp(i)%sf(j, k, l) = 0._wp end do end do end do end do $:END_GPU_PARALLEL_LOOP() - call s_smoothfunction(nBubs, intfc_rad, intfc_vel, & - mtn_s, mtn_pos, q_beta) + ! 
Build cell list before smearing (CPU-side counting sort) + call s_build_cell_list(n_el_bubs_loc, mtn_s) + + call s_smoothfunction(n_el_bubs_loc, intfc_rad, intfc_vel, & + mtn_s, mtn_pos, q_beta, kahan_comp) + + call nvtxStartRange("BUBBLES-LAGRANGE-BETA-COMM") + if (lag_params%cluster_type >= 4) then + call s_populate_beta_buffers(q_beta, bc_type, 3) + else + call s_populate_beta_buffers(q_beta, bc_type, 2) + end if + call nvtxEndRange !Store 1-beta $:GPU_PARALLEL_LOOP(private='[j,k,l]', collapse=3) @@ -831,8 +971,7 @@ contains end do end do $:END_GPU_PARALLEL_LOOP() - - call nvtxEndRange + call nvtxEndRange ! BUBBLES-LAGRANGE-SMEARING end subroutine s_smear_voidfraction @@ -855,92 +994,176 @@ contains integer, dimension(3), intent(out) :: cell real(wp), intent(out), optional :: preterm1, term2, Romega - real(wp), dimension(3) :: scoord, psi + real(wp), dimension(3) :: scoord, psi_pos, psi_x, psi_y, psi_z + real(wp) :: xi, eta, zeta real(wp) :: dc, vol, aux real(wp) :: volgas, term1, Rbeq, denom real(wp) :: charvol, charpres, charvol2, charpres2 integer, dimension(3) :: cellaux integer :: i, j, k integer :: smearGrid, smearGridz - logical :: celloutside - scoord = mtn_s(bub_id, 1:3, 2) f_pinfl = 0._wp - !< Find current bubble cell - cell(:) = int(scoord(:)) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - if (scoord(i) < 0._wp) cell(i) = cell(i) - 1 - end do + if (moving_lag_bubbles) then + cell = fd_number - buff_size + call s_locate_cell(mtn_pos(bub_id, 1:3, 2), cell, mtn_s(bub_id, 1:3, 2)) + scoord = mtn_s(bub_id, 1:3, 2) + else + scoord = mtn_s(bub_id, 1:3, 2) + cell(:) = int(scoord(:)) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + if (scoord(i) < 0._wp) cell(i) = cell(i) - 1 + end do + end if if ((lag_params%cluster_type == 1)) then !< Getting p_cell in terms of only the current cell by interpolation - !< Getting the cell volulme as Omega - if (p > 0) then - vol = dx(cell(1))*dy(cell(2))*dz(cell(3)) - else - if (cyl_coord) then - vol = 
dx(cell(1))*dy(cell(2))*y_cc(cell(2))*2._wp*pi + if (fd_order == 2) then ! Bilinear interpolation + + if (p > 0) then + vol = dx(cell(1))*dy(cell(2))*dz(cell(3)) else - vol = dx(cell(1))*dy(cell(2))*lag_params%charwidth + if (cyl_coord) then + vol = dx(cell(1))*dy(cell(2))*y_cc(cell(2))*2._wp*pi + else + vol = dx(cell(1))*dy(cell(2))*lag_params%charwidth + end if end if - end if - !< Obtain bilinear interpolation coefficients, based on the current location of the bubble. - psi(1) = (scoord(1) - real(cell(1)))*dx(cell(1)) + x_cb(cell(1) - 1) - if (cell(1) == (m + buff_size)) then - cell(1) = cell(1) - 1 - psi(1) = 1._wp - else if (cell(1) == (-buff_size)) then - psi(1) = 0._wp - else - if (psi(1) < x_cc(cell(1))) cell(1) = cell(1) - 1 - psi(1) = abs((psi(1) - x_cc(cell(1)))/(x_cc(cell(1) + 1) - x_cc(cell(1)))) - end if + !< Obtain bilinear interpolation coefficients, based on the current location of the bubble. + psi_pos(1) = (scoord(1) - real(cell(1)))*dx(cell(1)) + x_cb(cell(1) - 1) + psi_pos(1) = abs((psi_pos(1) - x_cc(cell(1)))/(x_cc(cell(1) + 1) - x_cc(cell(1)))) - psi(2) = (scoord(2) - real(cell(2)))*dy(cell(2)) + y_cb(cell(2) - 1) - if (cell(2) == (n + buff_size)) then - cell(2) = cell(2) - 1 - psi(2) = 1._wp - else if (cell(2) == (-buff_size)) then - psi(2) = 0._wp - else - if (psi(2) < y_cc(cell(2))) cell(2) = cell(2) - 1 - psi(2) = abs((psi(2) - y_cc(cell(2)))/(y_cc(cell(2) + 1) - y_cc(cell(2)))) - end if + psi_pos(2) = (scoord(2) - real(cell(2)))*dy(cell(2)) + y_cb(cell(2) - 1) + psi_pos(2) = abs((psi_pos(2) - y_cc(cell(2)))/(y_cc(cell(2) + 1) - y_cc(cell(2)))) - if (p > 0) then - psi(3) = (scoord(3) - real(cell(3)))*dz(cell(3)) + z_cb(cell(3) - 1) - if (cell(3) == (p + buff_size)) then - cell(3) = cell(3) - 1 - psi(3) = 1._wp - else if (cell(3) == (-buff_size)) then - psi(3) = 0._wp + if (p > 0) then + psi_pos(3) = (scoord(3) - real(cell(3)))*dz(cell(3)) + z_cb(cell(3) - 1) + psi_pos(3) = abs((psi_pos(3) - z_cc(cell(3)))/(z_cc(cell(3) + 1) - 
z_cc(cell(3)))) else - if (psi(3) < z_cc(cell(3))) cell(3) = cell(3) - 1 - psi(3) = abs((psi(3) - z_cc(cell(3)))/(z_cc(cell(3) + 1) - z_cc(cell(3)))) + psi_pos(3) = 0._wp + end if + + ! Calculate bilinear basis functions for each direction + ! For normalized coordinate xi in [0, 1], the two basis functions are: + ! phi_0(xi) = 1 - xi, phi_1(xi) = xi + + ! X-direction basis functions + psi_x(1) = 1._wp - psi_pos(1) ! Left basis function + psi_x(2) = psi_pos(1) ! Right basis function + + ! Y-direction basis functions + psi_y(1) = 1._wp - psi_pos(2) ! Left basis function + psi_y(2) = psi_pos(2) ! Right basis function + + if (p > 0) then + ! Z-direction basis functions + psi_z(1) = 1._wp - psi_pos(3) ! Left basis function + psi_z(2) = psi_pos(3) ! Right basis function + else + psi_z(1) = 1._wp + psi_z(2) = 0._wp + end if + + !< Perform bilinear interpolation + f_pinfl = 0._wp + + if (p == 0) then !2D - 4 point interpolation (2x2) + do j = 1, 2 + do i = 1, 2 + f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + i - 1, cell(2) + j - 1, cell(3))* & + psi_x(i)*psi_y(j) + end do + end do + else !3D - 8 point interpolation (2x2x2) + do k = 1, 2 + do j = 1, 2 + do i = 1, 2 + f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + i - 1, cell(2) + j - 1, cell(3) + k - 1)* & + psi_x(i)*psi_y(j)*psi_z(k) + end do + end do + end do + end if + + elseif (fd_order == 4) then ! 
Biquadratic interpolation + + if (p > 0) then + vol = dx(cell(1))*dy(cell(2))*dz(cell(3)) + else + if (cyl_coord) then + vol = dx(cell(1))*dy(cell(2))*y_cc(cell(2))*2._wp*pi + else + vol = dx(cell(1))*dy(cell(2))*lag_params%charwidth + end if end if - else - psi(3) = 0._wp - end if - !< Perform bilinear interpolation - if (p == 0) then !2D - f_pinfl = q_prim_vf(E_idx)%sf(cell(1), cell(2), cell(3))*(1._wp - psi(1))*(1._wp - psi(2)) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + 1, cell(2), cell(3))*psi(1)*(1._wp - psi(2)) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + 1, cell(2) + 1, cell(3))*psi(1)*psi(2) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1), cell(2) + 1, cell(3))*(1._wp - psi(1))*psi(2) - else !3D - f_pinfl = q_prim_vf(E_idx)%sf(cell(1), cell(2), cell(3))*(1._wp - psi(1))*(1._wp - psi(2))*(1._wp - psi(3)) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + 1, cell(2), cell(3))*psi(1)*(1._wp - psi(2))*(1._wp - psi(3)) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + 1, cell(2) + 1, cell(3))*psi(1)*psi(2)*(1._wp - psi(3)) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1), cell(2) + 1, cell(3))*(1._wp - psi(1))*psi(2)*(1._wp - psi(3)) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1), cell(2), cell(3) + 1)*(1._wp - psi(1))*(1._wp - psi(2))*psi(3) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + 1, cell(2), cell(3) + 1)*psi(1)*(1._wp - psi(2))*psi(3) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + 1, cell(2) + 1, cell(3) + 1)*psi(1)*psi(2)*psi(3) - f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1), cell(2) + 1, cell(3) + 1)*(1._wp - psi(1))*psi(2)*psi(3) + !< Obtain biquadratic interpolation coefficients, based on the current location of the bubble. + ! 
For biquadratic interpolation, we need coefficients for 3 points in each direction + psi_pos(1) = (scoord(1) - real(cell(1)))*dx(cell(1)) + x_cb(cell(1) - 1) + psi_pos(1) = (psi_pos(1) - x_cc(cell(1)))/(x_cc(cell(1) + 1) - x_cc(cell(1))) + + psi_pos(2) = (scoord(2) - real(cell(2)))*dy(cell(2)) + y_cb(cell(2) - 1) + psi_pos(2) = (psi_pos(2) - y_cc(cell(2)))/(y_cc(cell(2) + 1) - y_cc(cell(2))) + + if (p > 0) then + psi_pos(3) = (scoord(3) - real(cell(3)))*dz(cell(3)) + z_cb(cell(3) - 1) + psi_pos(3) = (psi_pos(3) - z_cc(cell(3)))/(z_cc(cell(3) + 1) - z_cc(cell(3))) + else + psi_pos(3) = 0._wp + end if + + ! Calculate biquadratic basis functions for each direction + ! For normalized coordinate xi in [-1, 1], the three basis functions are: + ! phi_0(xi) = xi*(xi-1)/2, phi_1(xi) = (1-xi)*(1+xi), phi_2(xi) = xi*(xi+1)/2 + + ! X-direction basis functions + xi = 2._wp*psi_pos(1) - 1._wp ! Convert to [-1, 1] range + psi_x(1) = xi*(xi - 1._wp)/2._wp ! Left basis function + psi_x(2) = (1._wp - xi)*(1._wp + xi) ! Center basis function + psi_x(3) = xi*(xi + 1._wp)/2._wp ! Right basis function + + ! Y-direction basis functions + eta = 2._wp*psi_pos(2) - 1._wp ! Convert to [-1, 1] range + psi_y(1) = eta*(eta - 1._wp)/2._wp ! Left basis function + psi_y(2) = (1._wp - eta)*(1._wp + eta) ! Center basis function + psi_y(3) = eta*(eta + 1._wp)/2._wp ! Right basis function + + if (p > 0) then + ! Z-direction basis functions + zeta = 2._wp*psi_pos(3) - 1._wp ! Convert to [-1, 1] range + psi_z(1) = zeta*(zeta - 1._wp)/2._wp ! Left basis function + psi_z(2) = (1._wp - zeta)*(1._wp + zeta) ! Center basis function + psi_z(3) = zeta*(zeta + 1._wp)/2._wp ! 
Right basis function + else + psi_z(1) = 0._wp + psi_z(2) = 1._wp + psi_z(3) = 0._wp + end if + + !< Perform biquadratic interpolation + f_pinfl = 0._wp + + if (p == 0) then !2D - 9 point interpolation (3x3) + do j = 1, 3 + do i = 1, 3 + f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + i - 2, cell(2) + j - 2, cell(3))* & + psi_x(i)*psi_y(j) + end do + end do + else !3D - 27 point interpolation (3x3x3) + do k = 1, 3 + do j = 1, 3 + do i = 1, 3 + f_pinfl = f_pinfl + q_prim_vf(E_idx)%sf(cell(1) + i - 2, cell(2) + j - 2, cell(3) + k - 2)* & + psi_x(i)*psi_y(j)*psi_z(k) + end do + end do + end do + end if end if !R_Omega @@ -971,56 +1194,25 @@ contains cellaux(3) = cell(3) + k - (mapCells + 1) if (p == 0) cellaux(3) = 0 - !< check if the current cell is outside the computational domain or not (including ghost cells) - celloutside = .false. - if (num_dims == 2) then - if ((cellaux(1) < -buff_size) .or. (cellaux(2) < -buff_size)) then - celloutside = .true. - end if - if (cyl_coord .and. y_cc(cellaux(2)) < 0._wp) then - celloutside = .true. - end if - if ((cellaux(2) > n + buff_size) .or. (cellaux(1) > m + buff_size)) then - celloutside = .true. - end if + !< Obtaining the cell volume + if (p > 0) then + vol = dx(cellaux(1))*dy(cellaux(2))*dz(cellaux(3)) else - if ((cellaux(3) < -buff_size) .or. (cellaux(1) < -buff_size) .or. (cellaux(2) < -buff_size)) then - celloutside = .true. - end if - - if ((cellaux(3) > p + buff_size) .or. (cellaux(2) > n + buff_size) .or. (cellaux(1) > m + buff_size)) then - celloutside = .true. - end if - end if - if (.not. celloutside) then - if (cyl_coord .and. (p == 0) .and. (y_cc(cellaux(2)) < 0._wp)) then - celloutside = .true. - end if - end if - - if (.not. 
celloutside) then - !< Obtaining the cell volulme - if (p > 0) then - vol = dx(cellaux(1))*dy(cellaux(2))*dz(cellaux(3)) + if (cyl_coord) then + vol = dx(cellaux(1))*dy(cellaux(2))*y_cc(cellaux(2))*2._wp*pi else - if (cyl_coord) then - vol = dx(cellaux(1))*dy(cellaux(2))*y_cc(cellaux(2))*2._wp*pi - else - vol = dx(cellaux(1))*dy(cellaux(2))*lag_params%charwidth - end if + vol = dx(cellaux(1))*dy(cellaux(2))*lag_params%charwidth end if - !< Update values - charvol = charvol + vol - charpres = charpres + q_prim_vf(E_idx)%sf(cellaux(1), cellaux(2), cellaux(3))*vol - charvol2 = charvol2 + vol*q_beta(1)%sf(cellaux(1), cellaux(2), cellaux(3)) - charpres2 = charpres2 + q_prim_vf(E_idx)%sf(cellaux(1), cellaux(2), cellaux(3)) & - *vol*q_beta(1)%sf(cellaux(1), cellaux(2), cellaux(3)) end if - + !< Update values + charvol = charvol + vol + charpres = charpres + q_prim_vf(E_idx)%sf(cellaux(1), cellaux(2), cellaux(3))*vol + charvol2 = charvol2 + vol*q_beta(1)%sf(cellaux(1), cellaux(2), cellaux(3)) + charpres2 = charpres2 + q_prim_vf(E_idx)%sf(cellaux(1), cellaux(2), cellaux(3)) & + *vol*q_beta(1)%sf(cellaux(1), cellaux(2), cellaux(3)) end do end do end do - f_pinfl = charpres2/charvol2 vol = charvol dc = (3._wp*abs(vol)/(4._wp*pi))**(1._wp/3._wp) @@ -1056,118 +1248,157 @@ contains !> This subroutine updates the Lagrange variables using the tvd RK time steppers. !! The time derivative of the bubble variables must be stored at every stage to avoid precision errors. !! @param stage Current tvd RK stage - impure subroutine s_update_lagrange_tdv_rk(stage) + impure subroutine s_update_lagrange_tdv_rk(q_prim_vf, bc_type, stage) + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type integer, intent(in) :: stage integer :: k if (time_stepper == 1) then ! 
1st order TVD RK + $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc !u{1} = u{n} + dt * RHS{n} intfc_rad(k, 1) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) intfc_vel(k, 1) = intfc_vel(k, 1) + dt*intfc_dveldt(k, 1) - mtn_pos(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + dt*mtn_dposdt(k, 1:3, 1) - mtn_vel(k, 1:3, 1) = mtn_vel(k, 1:3, 1) + dt*mtn_dveldt(k, 1:3, 1) gas_p(k, 1) = gas_p(k, 1) + dt*gas_dpdt(k, 1) gas_mv(k, 1) = gas_mv(k, 1) + dt*gas_dmvdt(k, 1) + if (moving_lag_bubbles) then + mtn_posPrev(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + mtn_pos(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + dt*mtn_dposdt(k, 1:3, 1) + mtn_vel(k, 1:3, 1) = mtn_vel(k, 1:3, 1) + dt*mtn_dveldt(k, 1:3, 1) + end if end do $:END_GPU_PARALLEL_LOOP() call s_transfer_data_to_tmp() - call s_write_void_evol(mytime) + if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) + if (lag_params%write_void_evol) call s_write_void_evol(mytime) if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats() - if (lag_params%write_bubbles) then $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]') - call s_write_lag_particles(mytime) + call s_write_lag_bubble_evol(mytime) end if elseif (time_stepper == 2) then ! 
2nd order TVD RK if (stage == 1) then + $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc !u{1} = u{n} + dt * RHS{n} intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) intfc_vel(k, 2) = intfc_vel(k, 1) + dt*intfc_dveldt(k, 1) - mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + dt*mtn_dposdt(k, 1:3, 1) - mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + dt*mtn_dveldt(k, 1:3, 1) gas_p(k, 2) = gas_p(k, 1) + dt*gas_dpdt(k, 1) gas_mv(k, 2) = gas_mv(k, 1) + dt*gas_dmvdt(k, 1) + if (moving_lag_bubbles) then + mtn_posPrev(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + dt*mtn_dposdt(k, 1:3, 1) + mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + dt*mtn_dveldt(k, 1:3, 1) + end if end do $:END_GPU_PARALLEL_LOOP() + if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) + elseif (stage == 2) then + $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc !u{1} = u{n} + (1/2) * dt * (RHS{n} + RHS{1}) intfc_rad(k, 1) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/2._wp intfc_vel(k, 1) = intfc_vel(k, 1) + dt*(intfc_dveldt(k, 1) + intfc_dveldt(k, 2))/2._wp - mtn_pos(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + dt*(mtn_dposdt(k, 1:3, 1) + mtn_dposdt(k, 1:3, 2))/2._wp - mtn_vel(k, 1:3, 1) = mtn_vel(k, 1:3, 1) + dt*(mtn_dveldt(k, 1:3, 1) + mtn_dveldt(k, 1:3, 2))/2._wp gas_p(k, 1) = gas_p(k, 1) + dt*(gas_dpdt(k, 1) + gas_dpdt(k, 2))/2._wp gas_mv(k, 1) = gas_mv(k, 1) + dt*(gas_dmvdt(k, 1) + gas_dmvdt(k, 2))/2._wp + if (moving_lag_bubbles) then + mtn_posPrev(k, 1:3, 1) = mtn_pos(k, 1:3, 2) + mtn_pos(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + dt*(mtn_dposdt(k, 1:3, 1) + mtn_dposdt(k, 1:3, 2))/2._wp + mtn_vel(k, 1:3, 1) = mtn_vel(k, 1:3, 1) + dt*(mtn_dveldt(k, 1:3, 1) + mtn_dveldt(k, 1:3, 2))/2._wp + end if end do $:END_GPU_PARALLEL_LOOP() call s_transfer_data_to_tmp() - call s_write_void_evol(mytime) + if (moving_lag_bubbles) call 
s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) + if (lag_params%write_void_evol) call s_write_void_evol(mytime) if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats() - if (lag_params%write_bubbles) then $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]') - call s_write_lag_particles(mytime) + call s_write_lag_bubble_evol(mytime) end if end if elseif (time_stepper == 3) then ! 3rd order TVD RK if (stage == 1) then + $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc !u{1} = u{n} + dt * RHS{n} intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) intfc_vel(k, 2) = intfc_vel(k, 1) + dt*intfc_dveldt(k, 1) - mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + dt*mtn_dposdt(k, 1:3, 1) - mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + dt*mtn_dveldt(k, 1:3, 1) gas_p(k, 2) = gas_p(k, 1) + dt*gas_dpdt(k, 1) gas_mv(k, 2) = gas_mv(k, 1) + dt*gas_dmvdt(k, 1) + if (moving_lag_bubbles) then + mtn_posPrev(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + dt*mtn_dposdt(k, 1:3, 1) + mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + dt*mtn_dveldt(k, 1:3, 1) + end if end do $:END_GPU_PARALLEL_LOOP() + if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) + elseif (stage == 2) then + $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc !u{2} = u{n} + (1/4) * dt * [RHS{n} + RHS{1}] intfc_rad(k, 2) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/4._wp intfc_vel(k, 2) = intfc_vel(k, 1) + dt*(intfc_dveldt(k, 1) + intfc_dveldt(k, 2))/4._wp - mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + dt*(mtn_dposdt(k, 1:3, 1) + mtn_dposdt(k, 1:3, 2))/4._wp - mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + dt*(mtn_dveldt(k, 1:3, 1) + mtn_dveldt(k, 1:3, 2))/4._wp gas_p(k, 2) = gas_p(k, 1) + dt*(gas_dpdt(k, 1) + gas_dpdt(k, 2))/4._wp gas_mv(k, 2) = gas_mv(k, 1) + dt*(gas_dmvdt(k, 1) + gas_dmvdt(k, 2))/4._wp + if 
(moving_lag_bubbles) then + mtn_posPrev(k, 1:3, 2) = mtn_pos(k, 1:3, 2) + mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + dt*(mtn_dposdt(k, 1:3, 1) + mtn_dposdt(k, 1:3, 2))/4._wp + mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + dt*(mtn_dveldt(k, 1:3, 1) + mtn_dveldt(k, 1:3, 2))/4._wp + end if end do $:END_GPU_PARALLEL_LOOP() + + if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) + elseif (stage == 3) then + $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc !u{n+1} = u{n} + (2/3) * dt * [(1/4)* RHS{n} + (1/4)* RHS{1} + RHS{2}] intfc_rad(k, 1) = intfc_rad(k, 1) + (2._wp/3._wp)*dt*(intfc_draddt(k, 1)/4._wp + intfc_draddt(k, 2)/4._wp + intfc_draddt(k, 3)) intfc_vel(k, 1) = intfc_vel(k, 1) + (2._wp/3._wp)*dt*(intfc_dveldt(k, 1)/4._wp + intfc_dveldt(k, 2)/4._wp + intfc_dveldt(k, 3)) - mtn_pos(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + (2._wp/3._wp)*dt*(mtn_dposdt(k, 1:3, 1)/4._wp + mtn_dposdt(k, 1:3, 2)/4._wp + mtn_dposdt(k, 1:3, 3)) - mtn_vel(k, 1:3, 1) = mtn_vel(k, 1:3, 1) + (2._wp/3._wp)*dt*(mtn_dveldt(k, 1:3, 1)/4._wp + mtn_dveldt(k, 1:3, 2)/4._wp + mtn_dveldt(k, 1:3, 3)) gas_p(k, 1) = gas_p(k, 1) + (2._wp/3._wp)*dt*(gas_dpdt(k, 1)/4._wp + gas_dpdt(k, 2)/4._wp + gas_dpdt(k, 3)) gas_mv(k, 1) = gas_mv(k, 1) + (2._wp/3._wp)*dt*(gas_dmvdt(k, 1)/4._wp + gas_dmvdt(k, 2)/4._wp + gas_dmvdt(k, 3)) + if (moving_lag_bubbles) then + mtn_posPrev(k, 1:3, 1) = mtn_pos(k, 1:3, 2) + mtn_pos(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + (2._wp/3._wp)*dt*(mtn_dposdt(k, 1:3, 1)/4._wp + mtn_dposdt(k, 1:3, 2)/4._wp + mtn_dposdt(k, 1:3, 3)) + mtn_vel(k, 1:3, 1) = mtn_vel(k, 1:3, 1) + (2._wp/3._wp)*dt*(mtn_dveldt(k, 1:3, 1)/4._wp + mtn_dveldt(k, 1:3, 2)/4._wp + mtn_dveldt(k, 1:3, 3)) + end if end do $:END_GPU_PARALLEL_LOOP() call s_transfer_data_to_tmp() - call s_write_void_evol(mytime) + if (moving_lag_bubbles) call s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + call s_smear_voidfraction(bc_type) + if 
(lag_params%write_void_evol) call s_write_void_evol(mytime) if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats() - if (lag_params%write_bubbles) then $:GPU_UPDATE(host='[gas_p,gas_mv,gas_mg,intfc_rad,intfc_vel]') - call s_write_lag_particles(mytime) + call s_write_lag_bubble_evol(mytime) end if end if @@ -1176,11 +1407,199 @@ contains end subroutine s_update_lagrange_tdv_rk + !> This subroutine enforces reflective and wall boundary conditions for EL bubbles + !! @param dest Destination for the bubble position update + impure subroutine s_enforce_EL_bubbles_boundary_conditions(q_prim_vf) + + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + integer :: k, i, q + integer :: patch_id, newBubs, new_idx + real(wp) :: offset + integer, dimension(3) :: cell + + call nvtxStartRange("LAG-BC") + call nvtxStartRange("LAG-BC-DEV2HOST") + $:GPU_UPDATE(host='[bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, & + & gas_betaC, bub_dphidt, lag_id, gas_p, gas_mv, intfc_rad, intfc_vel, & + & mtn_pos, mtn_posPrev, mtn_vel, mtn_s, intfc_draddt, intfc_dveldt, & + & gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt, keep_bubble, n_el_bubs_loc, & + & wrap_bubble_dir, wrap_bubble_loc]') + call nvtxEndRange + + ! 
Handle MPI transfer of bubbles going to another processor's local domain + if (num_procs > 1) then + call nvtxStartRange("LAG-BC-TRANSFER-LIST") + call s_add_particles_to_transfer_list(n_el_bubs_loc, mtn_pos(:, :, 2), mtn_posPrev(:, :, 2)) + call nvtxEndRange + + call nvtxStartRange("LAG-BC-SENDRECV") + call s_mpi_sendrecv_particles(bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, & + gas_betaC, bub_dphidt, lag_id, gas_p, gas_mv, & + intfc_rad, intfc_vel, mtn_pos, mtn_posPrev, mtn_vel, & + mtn_s, intfc_draddt, intfc_dveldt, gas_dpdt, & + gas_dmvdt, mtn_dposdt, mtn_dveldt, lag_num_ts, n_el_bubs_loc, & + 2) + call nvtxEndRange + end if + + call nvtxStartRange("LAG-BC-HOST2DEV") + $:GPU_UPDATE(device='[bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, & + & gas_betaC, bub_dphidt, lag_id, gas_p, gas_mv, intfc_rad, intfc_vel, & + & mtn_pos, mtn_posPrev, mtn_vel, mtn_s, intfc_draddt, intfc_dveldt, & + & gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt, n_el_bubs_loc]') + call nvtxEndRange + + $:GPU_PARALLEL_LOOP(private='[k, cell]') + do k = 1, n_el_bubs_loc + keep_bubble(k) = 1 + wrap_bubble_loc(k, :) = 0 + wrap_bubble_dir(k, :) = 0 + + ! Relocate bubbles at solid boundaries and delete bubbles that leave + ! buffer regions + if (any(bc_x%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) & + .and. mtn_pos(k, 1, 2) < x_cb(-1) + intfc_rad(k, 2)) then + mtn_pos(k, 1, 2) = x_cb(-1) + intfc_rad(k, 2) + elseif (any(bc_x%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) & + .and. mtn_pos(k, 1, 2) > x_cb(m) - intfc_rad(k, 2)) then + mtn_pos(k, 1, 2) = x_cb(m) - intfc_rad(k, 2) + elseif (bc_x%beg == BC_PERIODIC .and. mtn_pos(k, 1, 2) < pcomm_coords(1)%beg .and. & + mtn_posPrev(k, 1, 2) >= pcomm_coords(1)%beg) then + wrap_bubble_dir(k, 1) = 1 + wrap_bubble_loc(k, 1) = -1 + elseif (bc_x%end == BC_PERIODIC .and. mtn_pos(k, 1, 2) > pcomm_coords(1)%end .and. 
& + mtn_posPrev(k, 1, 2) <= pcomm_coords(1)%end) then + wrap_bubble_dir(k, 1) = 1 + wrap_bubble_loc(k, 1) = 1 + elseif (mtn_pos(k, 1, 2) >= x_cb(m)) then + keep_bubble(k) = 0 + elseif (mtn_pos(k, 1, 2) < x_cb(-1)) then + keep_bubble(k) = 0 + end if + + if (any(bc_y%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) & + .and. mtn_pos(k, 2, 2) < y_cb(-1) + intfc_rad(k, 2)) then + mtn_pos(k, 2, 2) = y_cb(-1) + intfc_rad(k, 2) + else if (any(bc_y%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) & + .and. mtn_pos(k, 2, 2) > y_cb(n) - intfc_rad(k, 2)) then + mtn_pos(k, 2, 2) = y_cb(n) - intfc_rad(k, 2) + elseif (bc_y%beg == BC_PERIODIC .and. mtn_pos(k, 2, 2) < pcomm_coords(2)%beg .and. & + mtn_posPrev(k, 2, 2) >= pcomm_coords(2)%beg) then + wrap_bubble_dir(k, 2) = 1 + wrap_bubble_loc(k, 2) = -1 + elseif (bc_y%end == BC_PERIODIC .and. mtn_pos(k, 2, 2) > pcomm_coords(2)%end .and. & + mtn_posPrev(k, 2, 2) <= pcomm_coords(2)%end) then + wrap_bubble_dir(k, 2) = 1 + wrap_bubble_loc(k, 2) = 1 + elseif (mtn_pos(k, 2, 2) >= y_cb(n)) then + keep_bubble(k) = 0 + elseif (mtn_pos(k, 2, 2) < y_cb(-1)) then + keep_bubble(k) = 0 + end if + + if (p > 0) then + if (any(bc_z%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) & + .and. mtn_pos(k, 3, 2) < z_cb(-1) + intfc_rad(k, 2)) then + mtn_pos(k, 3, 2) = z_cb(-1) + intfc_rad(k, 2) + else if (any(bc_z%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) & + .and. mtn_pos(k, 3, 2) > z_cb(p) - intfc_rad(k, 2)) then + mtn_pos(k, 3, 2) = z_cb(p) - intfc_rad(k, 2) + elseif (bc_z%beg == BC_PERIODIC .and. mtn_pos(k, 3, 2) < pcomm_coords(3)%beg .and. & + mtn_posPrev(k, 3, 2) >= pcomm_coords(3)%beg) then + wrap_bubble_dir(k, 3) = 1 + wrap_bubble_loc(k, 3) = -1 + elseif (bc_z%end == BC_PERIODIC .and. mtn_pos(k, 3, 2) > pcomm_coords(3)%end .and. 
& + mtn_posPrev(k, 3, 2) <= pcomm_coords(3)%end) then + wrap_bubble_dir(k, 3) = 1 + wrap_bubble_loc(k, 3) = 1 + elseif (mtn_pos(k, 3, 2) >= z_cb(p)) then + keep_bubble(k) = 0 + elseif (mtn_pos(k, 3, 2) < z_cb(-1)) then + keep_bubble(k) = 0 + end if + end if + + if (keep_bubble(k) == 1) then + ! Remove bubbles that are no longer in a liquid + cell = fd_number - buff_size + call s_locate_cell(mtn_pos(k, 1:3, 2), cell, mtn_s(k, 1:3, 2)) + + if (q_prim_vf(advxb)%sf(cell(1), cell(2), cell(3)) < (1._wp - lag_params%valmaxvoid)) then + keep_bubble(k) = 0 + end if + end if + end do + $:END_GPU_PARALLEL_LOOP() + + if (n_el_bubs_loc > 0) then + call nvtxStartRange("LAG-BC-DEV2HOST") + $:GPU_UPDATE(host='[bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, & + & gas_betaC, bub_dphidt, lag_id, gas_p, gas_mv, intfc_rad, intfc_vel, & + & mtn_pos, mtn_posPrev, mtn_vel, mtn_s, intfc_draddt, intfc_dveldt, & + & gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt, keep_bubble, n_el_bubs_loc, & + & wrap_bubble_dir, wrap_bubble_loc]') + call nvtxEndRange + + newBubs = 0 + do k = 1, n_el_bubs_loc + if (keep_bubble(k) == 1) then + newBubs = newBubs + 1 + if (newBubs /= k) then + call s_copy_lag_bubble(newBubs, k) + wrap_bubble_dir(newBubs, :) = wrap_bubble_dir(k, :) + wrap_bubble_loc(newBubs, :) = wrap_bubble_loc(k, :) + end if + end if + end do + n_el_bubs_loc = newBubs + + ! 
Handle periodic wrapping of bubbles on same processor + do k = 1, n_el_bubs_loc + if (any(wrap_bubble_dir(k, :) == 1)) then + do i = 1, num_dims + if (wrap_bubble_dir(k, i) == 1) then + offset = glb_bounds(i)%end - glb_bounds(i)%beg + if (wrap_bubble_loc(k, i) == 1) then + do q = 1, 2 + mtn_pos(k, i, q) = mtn_pos(k, i, q) - offset + mtn_posPrev(k, i, q) = mtn_posPrev(k, i, q) - offset + end do + else if (wrap_bubble_loc(k, i) == -1) then + do q = 1, 2 + mtn_pos(k, i, q) = mtn_pos(k, i, q) + offset + mtn_posPrev(k, i, q) = mtn_posPrev(k, i, q) + offset + end do + end if + end if + end do + end if + end do + call nvtxStartRange("LAG-BC-HOST2DEV") + $:GPU_UPDATE(device='[bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, & + & gas_betaC, bub_dphidt, lag_id, gas_p, gas_mv, intfc_rad, intfc_vel, & + & mtn_pos, mtn_posPrev, mtn_vel, mtn_s, intfc_draddt, intfc_dveldt, & + & gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt, n_el_bubs_loc]') + call nvtxEndRange + + end if + + $:GPU_PARALLEL_LOOP(private='[cell]') + do k = 1, n_el_bubs_loc + cell = fd_number - buff_size + call s_locate_cell(mtn_pos(k, 1:3, 2), cell, mtn_s(k, 1:3, 2)) + end do + + call nvtxEndRange ! LAG-BC + + end subroutine s_enforce_EL_bubbles_boundary_conditions + !> This subroutine returns the computational coordinate of the cell for the given position. !! @param pos Input coordinates !! @param cell Computational coordinate of the cell !! 
@param scoord Calculated particle coordinates subroutine s_locate_cell(pos, cell, scoord) + $:GPU_ROUTINE(function_name='s_locate_cell',parallelism='[seq]', & + & cray_inline=True) real(wp), dimension(3), intent(in) :: pos real(wp), dimension(3), intent(out) :: scoord @@ -1192,7 +1611,7 @@ contains cell(1) = cell(1) - 1 end do - do while (pos(1) > x_cb(cell(1))) + do while (pos(1) >= x_cb(cell(1))) cell(1) = cell(1) + 1 end do @@ -1200,7 +1619,7 @@ contains cell(2) = cell(2) - 1 end do - do while (pos(2) > y_cb(cell(2))) + do while (pos(2) >= y_cb(cell(2))) cell(2) = cell(2) + 1 end do @@ -1208,7 +1627,7 @@ contains do while (pos(3) < z_cb(cell(3) - 1)) cell(3) = cell(3) - 1 end do - do while (pos(3) > z_cb(cell(3))) + do while (pos(3) >= z_cb(cell(3))) cell(3) = cell(3) + 1 end do end if @@ -1237,7 +1656,7 @@ contains integer :: k $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc gas_p(k, 2) = gas_p(k, 1) gas_mv(k, 2) = gas_mv(k, 1) intfc_rad(k, 2) = intfc_rad(k, 1) @@ -1263,20 +1682,27 @@ contains if (p == 0 .and. cyl_coord .neqv. .true.) then ! Defining a virtual z-axis that has the same dimensions as y-axis ! defined in the input file - particle_in_domain = ((pos_part(1) < x_cb(m + buff_size)) .and. (pos_part(1) >= x_cb(-buff_size - 1)) .and. & - (pos_part(2) < y_cb(n + buff_size)) .and. (pos_part(2) >= y_cb(-buff_size - 1)) .and. & - (pos_part(3) < lag_params%charwidth/2._wp) .and. (pos_part(3) >= -lag_params%charwidth/2._wp)) + particle_in_domain = ((pos_part(1) < x_cb(m + buff_size - fd_number)) .and. & + (pos_part(1) >= x_cb(fd_number - buff_size - 1)) .and. & + (pos_part(2) < y_cb(n + buff_size - fd_number)) .and. & + (pos_part(2) >= y_cb(fd_number - buff_size - 1)) .and. & + (pos_part(3) < lag_params%charwidth/2._wp) .and. (pos_part(3) > -lag_params%charwidth/2._wp)) else ! cyl_coord - particle_in_domain = ((pos_part(1) < x_cb(m + buff_size)) .and. (pos_part(1) >= x_cb(-buff_size - 1)) .and. 
& - (abs(pos_part(2)) < y_cb(n + buff_size)) .and. (abs(pos_part(2)) >= max(y_cb(-buff_size - 1), 0._wp))) + particle_in_domain = ((pos_part(1) < x_cb(m + buff_size - fd_number)) .and. & + (pos_part(1) >= x_cb(fd_number - buff_size - 1)) .and. & + (abs(pos_part(2)) < y_cb(n + buff_size - fd_number)) .and. & + (abs(pos_part(2)) >= max(y_cb(fd_number - buff_size - 1), 0._wp))) end if ! 3D if (p > 0) then - particle_in_domain = ((pos_part(1) < x_cb(m + buff_size)) .and. (pos_part(1) >= x_cb(-buff_size - 1)) .and. & - (pos_part(2) < y_cb(n + buff_size)) .and. (pos_part(2) >= y_cb(-buff_size - 1)) .and. & - (pos_part(3) < z_cb(p + buff_size)) .and. (pos_part(3) >= z_cb(-buff_size - 1))) + particle_in_domain = ((pos_part(1) < x_cb(m + buff_size - fd_number)) .and. & + (pos_part(1) >= x_cb(fd_number - buff_size - 1)) .and. & + (pos_part(2) < y_cb(n + buff_size - fd_number)) .and. & + (pos_part(2) >= y_cb(fd_number - buff_size - 1)) .and. & + (pos_part(3) < z_cb(p + buff_size - fd_number)) .and. & + (pos_part(3) >= z_cb(fd_number - buff_size - 1))) end if ! For symmetric and wall boundary condition @@ -1390,12 +1816,39 @@ contains character(LEN=path_len + 2*name_len) :: file_loc logical :: file_exist + character(LEN=25) :: FMT + + write (file_loc, '(A,I0,A)') 'lag_bubble_evol_', proc_rank, '.dat' + file_loc = trim(case_dir)//'/D/'//trim(file_loc) + call my_inquire(trim(file_loc), file_exist) + + if (precision == 1) then + FMT = "(A16,A14,8A16)" + else + FMT = "(A24,A14,8A24)" + end if + + if (.not. 
file_exist) then + open (LAG_EVOL_ID, FILE=trim(file_loc), FORM='formatted', position='rewind') + write (LAG_EVOL_ID, FMT) 'currentTime', 'particleID', 'x', 'y', 'z', & + 'coreVaporMass', 'coreVaporConcentration', 'radius', 'interfaceVelocity', & + 'corePressure' + else + open (LAG_EVOL_ID, FILE=trim(file_loc), FORM='formatted', position='append') + end if + + end subroutine s_write_lag_particles + + !> @Brief Subroutine that opens the file to write the evolution of the lagrangian bubbles on each time step. + impure subroutine s_open_lag_bubble_evol() + character(LEN=path_len + 2*name_len) :: file_loc + logical :: file_exist character(LEN=25) :: FMT write (file_loc, '(A,I0,A)') 'lag_bubble_evol_', proc_rank, '.dat' file_loc = trim(case_dir)//'/D/'//trim(file_loc) - inquire (FILE=trim(file_loc), EXIST=file_exist) + call my_inquire(trim(file_loc), file_exist) if (precision == 1) then FMT = "(A16,A14,8A16)" @@ -1404,14 +1857,27 @@ contains end if if (.not. file_exist) then - open (11, FILE=trim(file_loc), FORM='formatted', position='rewind') - write (11, FMT) 'currentTime', 'particleID', 'x', 'y', 'z', & + open (LAG_EVOL_ID, FILE=trim(file_loc), FORM='formatted', position='rewind') + write (LAG_EVOL_ID, FMT) 'currentTime', 'particleID', 'x', 'y', 'z', & 'coreVaporMass', 'coreVaporConcentration', 'radius', 'interfaceVelocity', & 'corePressure' else - open (11, FILE=trim(file_loc), FORM='formatted', position='append') + open (LAG_EVOL_ID, FILE=trim(file_loc), FORM='formatted', position='append') end if + end subroutine s_open_lag_bubble_evol + + !> Subroutine that writes on each time step the changes of the lagrangian bubbles. + !! @param q_time Current time + impure subroutine s_write_lag_bubble_evol(qtime) + + real(wp), intent(in) :: qtime + integer :: k, ios + character(LEN=25) :: FMT + + character(LEN=path_len + 2*name_len) :: file_loc, path + logical :: file_exist + if (precision == 1) then FMT = "(F16.8,I14,8F16.8)" else @@ -1419,8 +1885,8 @@ contains end if ! 
Cycle through list - do k = 1, nBubs - write (11, FMT) & + do k = 1, n_el_bubs_loc + write (LAG_EVOL_ID, FMT) & qtime, & lag_id(k, 1), & mtn_pos(k, 1, 1), & @@ -1433,22 +1899,15 @@ contains gas_p(k, 1) end do - close (11) + end subroutine s_write_lag_bubble_evol - end subroutine s_write_lag_particles + impure subroutine s_close_lag_bubble_evol - !> Subroutine that writes some useful statistics related to the volume fraction - !! of the particles (void fraction) in the computatioational domain - !! on each time step. - !! @param qtime Current time - impure subroutine s_write_void_evol(qtime) + close (LAG_EVOL_ID) - real(wp), intent(in) :: qtime - real(wp) :: volcell, voltot - real(wp) :: lag_void_max, lag_void_avg, lag_vol - real(wp) :: void_max_glb, void_avg_glb, vol_glb + end subroutine s_close_lag_bubble_evol - integer :: i, j, k + subroutine s_open_void_evol character(LEN=path_len + 2*name_len) :: file_loc logical :: file_exist @@ -1456,19 +1915,37 @@ contains if (proc_rank == 0) then write (file_loc, '(A)') 'voidfraction.dat' file_loc = trim(case_dir)//'/D/'//trim(file_loc) - inquire (FILE=trim(file_loc), EXIST=file_exist) + call my_inquire(trim(file_loc), file_exist) if (.not. file_exist) then - open (12, FILE=trim(file_loc), FORM='formatted', position='rewind') + open (LAG_VOID_ID, FILE=trim(file_loc), FORM='formatted', position='rewind') !write (12, *) 'currentTime, averageVoidFraction, ', & ! 'maximumVoidFraction, totalParticlesVolume' !write (12, *) 'The averageVoidFraction value does ', & ! 'not reflect the real void fraction in the cloud since the ', & ! 'cells which do not have bubbles are not accounted' else - open (12, FILE=trim(file_loc), FORM='formatted', position='append') + open (LAG_VOID_ID, FILE=trim(file_loc), FORM='formatted', position='append') end if end if + end subroutine s_open_void_evol + + !> Subroutine that writes some useful statistics related to the volume fraction + !! 
of the particles (void fraction) in the computational domain + !! on each time step. + !! @param qtime Current time + impure subroutine s_write_void_evol(qtime) + + real(wp), intent(in) :: qtime + real(wp) :: volcell, voltot + real(wp) :: lag_void_max, lag_void_avg, lag_vol + real(wp) :: void_max_glb, void_avg_glb, vol_glb + + integer :: i, j, k + + character(LEN=path_len + 2*name_len) :: file_loc + logical :: file_exist + lag_void_max = 0._wp lag_void_avg = 0._wp lag_vol = 0._wp @@ -1503,16 +1980,21 @@ contains if (lag_vol > 0._wp) lag_void_avg = lag_void_avg/lag_vol if (proc_rank == 0) then - write (12, '(6X,4e24.8)') & + write (LAG_VOID_ID, '(6X,4e24.8)') & qtime, & lag_void_avg, & lag_void_max, & voltot - close (12) end if end subroutine s_write_void_evol + subroutine s_close_void_evol + + if (proc_rank == 0) close (LAG_VOID_ID) + + end subroutine s_close_void_evol + !> Subroutine that writes the restarting files for the particles in the lagrangian solver. !! @param t_step Current time step impure subroutine s_write_restart_lag_bubbles(t_step) @@ -1532,13 +2014,14 @@ contains integer(KIND=MPI_OFFSET_KIND) :: disp integer :: view integer, dimension(2) :: gsizes, lsizes, start_idx_part - integer, allocatable :: proc_bubble_counts(:) + integer, dimension(num_procs) :: part_order, part_ord_mpi + integer, dimension(num_procs) :: proc_bubble_counts real(wp), dimension(1:1, 1:lag_io_vars) :: dummy dummy = 0._wp bub_id = 0._wp - if (nBubs /= 0) then - do k = 1, nBubs + if (n_el_bubs_loc /= 0) then + do k = 1, n_el_bubs_loc if (particle_in_domain_physical(mtn_pos(k, 1:3, 1))) then bub_id = bub_id + 1 end if @@ -1547,8 +2030,6 @@ contains if (.not. 
parallel_io) return - allocate (proc_bubble_counts(num_procs)) - lsizes(1) = bub_id lsizes(2) = lag_io_vars @@ -1600,26 +2081,22 @@ contains if (bub_id > 0) then allocate (MPI_IO_DATA_lag_bubbles(max(1, bub_id), 1:lag_io_vars)) - i = 1 - do k = 1, nBubs - if (particle_in_domain_physical(mtn_pos(k, 1:3, 1))) then - MPI_IO_DATA_lag_bubbles(i, 1) = real(lag_id(k, 1)) - MPI_IO_DATA_lag_bubbles(i, 2:4) = mtn_pos(k, 1:3, 1) - MPI_IO_DATA_lag_bubbles(i, 5:7) = mtn_posPrev(k, 1:3, 1) - MPI_IO_DATA_lag_bubbles(i, 8:10) = mtn_vel(k, 1:3, 1) - MPI_IO_DATA_lag_bubbles(i, 11) = intfc_rad(k, 1) - MPI_IO_DATA_lag_bubbles(i, 12) = intfc_vel(k, 1) - MPI_IO_DATA_lag_bubbles(i, 13) = bub_R0(k) - MPI_IO_DATA_lag_bubbles(i, 14) = Rmax_stats(k) - MPI_IO_DATA_lag_bubbles(i, 15) = Rmin_stats(k) - MPI_IO_DATA_lag_bubbles(i, 16) = bub_dphidt(k) - MPI_IO_DATA_lag_bubbles(i, 17) = gas_p(k, 1) - MPI_IO_DATA_lag_bubbles(i, 18) = gas_mv(k, 1) - MPI_IO_DATA_lag_bubbles(i, 19) = gas_mg(k) - MPI_IO_DATA_lag_bubbles(i, 20) = gas_betaT(k) - MPI_IO_DATA_lag_bubbles(i, 21) = gas_betaC(k) - i = i + 1 - end if + do k = 1, n_el_bubs_loc + MPI_IO_DATA_lag_bubbles(k, 1) = real(lag_id(k, 1)) + MPI_IO_DATA_lag_bubbles(k, 2:4) = mtn_pos(k, 1:3, 1) + MPI_IO_DATA_lag_bubbles(k, 5:7) = mtn_posPrev(k, 1:3, 1) + MPI_IO_DATA_lag_bubbles(k, 8:10) = mtn_vel(k, 1:3, 1) + MPI_IO_DATA_lag_bubbles(k, 11) = intfc_rad(k, 1) + MPI_IO_DATA_lag_bubbles(k, 12) = intfc_vel(k, 1) + MPI_IO_DATA_lag_bubbles(k, 13) = bub_R0(k) + MPI_IO_DATA_lag_bubbles(k, 14) = Rmax_stats(k) + MPI_IO_DATA_lag_bubbles(k, 15) = Rmin_stats(k) + MPI_IO_DATA_lag_bubbles(k, 16) = bub_dphidt(k) + MPI_IO_DATA_lag_bubbles(k, 17) = gas_p(k, 1) + MPI_IO_DATA_lag_bubbles(k, 18) = gas_mv(k, 1) + MPI_IO_DATA_lag_bubbles(k, 19) = gas_mg(k) + MPI_IO_DATA_lag_bubbles(k, 20) = gas_betaT(k) + MPI_IO_DATA_lag_bubbles(k, 21) = gas_betaC(k) end do call MPI_TYPE_CREATE_SUBARRAY(2, gsizes, lsizes, start_idx_part, & @@ -1660,8 +2137,6 @@ contains call MPI_FILE_CLOSE(ifile, 
ierr) end if - deallocate (proc_bubble_counts) - #endif end subroutine s_write_restart_lag_bubbles @@ -1673,7 +2148,7 @@ contains $:GPU_PARALLEL_LOOP(private='[k]', reduction='[[Rmax_glb], [Rmin_glb]]', & & reductionOp='[MAX, MIN]', copy='[Rmax_glb,Rmin_glb]') - do k = 1, nBubs + do k = 1, n_el_bubs_loc Rmax_glb = max(Rmax_glb, intfc_rad(k, 1)/bub_R0(k)) Rmin_glb = min(Rmin_glb, intfc_rad(k, 1)/bub_R0(k)) Rmax_stats(k) = max(Rmax_stats(k), intfc_rad(k, 1)/bub_R0(k)) @@ -1683,18 +2158,15 @@ contains end subroutine s_calculate_lag_bubble_stats - !> Subroutine that writes the maximum and minimum radius of each bubble. - impure subroutine s_write_lag_bubble_stats() + impure subroutine s_open_lag_bubble_stats() - integer :: k character(LEN=path_len + 2*name_len) :: file_loc - - character(len=20) :: FMT + character(LEN=20) :: FMT + logical :: file_exist write (file_loc, '(A,I0,A)') 'stats_lag_bubbles_', proc_rank, '.dat' file_loc = trim(case_dir)//'/D/'//trim(file_loc) - - $:GPU_UPDATE(host='[Rmax_glb,Rmin_glb]') + call my_inquire(trim(file_loc), file_exist) if (precision == 1) then FMT = "(A10,A14,5A16)" @@ -1702,8 +2174,23 @@ contains FMT = "(A10,A14,5A24)" end if - open (13, FILE=trim(file_loc), FORM='formatted', position='rewind') - write (13, FMT) 'proc_rank', 'particleID', 'x', 'y', 'z', 'Rmax_glb', 'Rmin_glb' + if (.not. file_exist) then + open (LAG_STATS_ID, FILE=trim(file_loc), FORM='formatted', position='rewind') + write (LAG_STATS_ID, *) 'proc_rank, particleID, x, y, z, Rmax_glb, Rmin_glb' + else + open (LAG_STATS_ID, FILE=trim(file_loc), FORM='formatted', position='append') + end if + + end subroutine s_open_lag_bubble_stats + + !> Subroutine that writes the maximum and minimum radius of each bubble. 
+ impure subroutine s_write_lag_bubble_stats() + + integer :: k + character(LEN=path_len + 2*name_len) :: file_loc + character(LEN=20) :: FMT + + $:GPU_UPDATE(host='[Rmax_glb,Rmin_glb]') if (precision == 1) then FMT = "(I10,I14,5F16.8)" @@ -1711,8 +2198,8 @@ contains FMT = "(I10,I14,5F24.16)" end if - do k = 1, nBubs - write (13, FMT) & + do k = 1, n_el_bubs_loc + write (LAG_STATS_ID, FMT) & proc_rank, & lag_id(k, 1), & mtn_pos(k, 1, 1), & @@ -1722,56 +2209,60 @@ contains Rmin_stats(k) end do - close (13) - end subroutine s_write_lag_bubble_stats - !> The purpose of this subroutine is to remove one specific particle if dt is too small. - !! @param bub_id Particle id - impure subroutine s_remove_lag_bubble(bub_id) + subroutine s_close_lag_bubble_stats - integer, intent(in) :: bub_id + close (LAG_STATS_ID) - integer :: i + end subroutine s_close_lag_bubble_stats - $:GPU_LOOP(parallelism='[seq]') - do i = bub_id, nBubs - 1 - lag_id(i, 1) = lag_id(i + 1, 1) - bub_R0(i) = bub_R0(i + 1) - Rmax_stats(i) = Rmax_stats(i + 1) - Rmin_stats(i) = Rmin_stats(i + 1) - gas_mg(i) = gas_mg(i + 1) - gas_betaT(i) = gas_betaT(i + 1) - gas_betaC(i) = gas_betaC(i + 1) - bub_dphidt(i) = bub_dphidt(i + 1) - gas_p(i, 1:2) = gas_p(i + 1, 1:2) - gas_mv(i, 1:2) = gas_mv(i + 1, 1:2) - intfc_rad(i, 1:2) = intfc_rad(i + 1, 1:2) - intfc_vel(i, 1:2) = intfc_vel(i + 1, 1:2) - mtn_pos(i, 1:3, 1:2) = mtn_pos(i + 1, 1:3, 1:2) - mtn_posPrev(i, 1:3, 1:2) = mtn_posPrev(i + 1, 1:3, 1:2) - mtn_vel(i, 1:3, 1:2) = mtn_vel(i + 1, 1:3, 1:2) - mtn_s(i, 1:3, 1:2) = mtn_s(i + 1, 1:3, 1:2) - intfc_draddt(i, 1:lag_num_ts) = intfc_draddt(i + 1, 1:lag_num_ts) - intfc_dveldt(i, 1:lag_num_ts) = intfc_dveldt(i + 1, 1:lag_num_ts) - gas_dpdt(i, 1:lag_num_ts) = gas_dpdt(i + 1, 1:lag_num_ts) - gas_dmvdt(i, 1:lag_num_ts) = gas_dmvdt(i + 1, 1:lag_num_ts) - end do - - nBubs = nBubs - 1 - $:GPU_UPDATE(device='[nBubs]') - - end subroutine s_remove_lag_bubble + !> The purpose of this subroutine is to remove one specific particle 
by overwriting its slot: all per-bubble state of bubble src is copied onto index dest (used to compact the bubble arrays after deletions). + !! @param dest Destination particle index + !! @param src Source particle index + impure subroutine s_copy_lag_bubble(dest, src) + + integer, intent(in) :: src, dest + + bub_R0(dest) = bub_R0(src) + Rmax_stats(dest) = Rmax_stats(src) + Rmin_stats(dest) = Rmin_stats(src) + gas_mg(dest) = gas_mg(src) + gas_betaT(dest) = gas_betaT(src) + gas_betaC(dest) = gas_betaC(src) + bub_dphidt(dest) = bub_dphidt(src) + lag_id(dest, 1) = lag_id(src, 1) + gas_p(dest, 1:2) = gas_p(src, 1:2) + gas_mv(dest, 1:2) = gas_mv(src, 1:2) + intfc_rad(dest, 1:2) = intfc_rad(src, 1:2) + intfc_vel(dest, 1:2) = intfc_vel(src, 1:2) + mtn_vel(dest, 1:3, 1:2) = mtn_vel(src, 1:3, 1:2) + mtn_s(dest, 1:3, 1:2) = mtn_s(src, 1:3, 1:2) + mtn_pos(dest, 1:3, 1:2) = mtn_pos(src, 1:3, 1:2) + mtn_posPrev(dest, 1:3, 1:2) = mtn_posPrev(src, 1:3, 1:2) + intfc_draddt(dest, 1:lag_num_ts) = intfc_draddt(src, 1:lag_num_ts) + intfc_dveldt(dest, 1:lag_num_ts) = intfc_dveldt(src, 1:lag_num_ts) + gas_dpdt(dest, 1:lag_num_ts) = gas_dpdt(src, 1:lag_num_ts) + gas_dmvdt(dest, 1:lag_num_ts) = gas_dmvdt(src, 1:lag_num_ts) + mtn_dposdt(dest, 1:3, 1:lag_num_ts) = mtn_dposdt(src, 1:3, 1:lag_num_ts) + mtn_dveldt(dest, 1:3, 1:lag_num_ts) = mtn_dveldt(src, 1:3, 1:lag_num_ts) + + end subroutine s_copy_lag_bubble !> The purpose of this subroutine is to deallocate variables impure subroutine s_finalize_lagrangian_solver() integer :: i + if (lag_params%write_void_evol) call s_close_void_evol + if (lag_params%write_bubbles) call s_close_lag_bubble_evol() + if (lag_params%write_bubbles_stats) call s_close_lag_bubble_stats() + do i = 1, q_beta_idx @:DEALLOCATE(q_beta(i)%sf) + @:DEALLOCATE(kahan_comp(i)%sf) end do @:DEALLOCATE(q_beta) + @:DEALLOCATE(kahan_comp) !Deallocating space @:DEALLOCATE(lag_id) @@ -1797,6 +2288,28 @@ contains @:DEALLOCATE(mtn_dposdt) @:DEALLOCATE(mtn_dveldt) + @:DEALLOCATE(keep_bubble) + @:DEALLOCATE(wrap_bubble_loc, wrap_bubble_dir) + + ! Deallocate pressure gradient arrays and FD coefficients + if (lag_params%vel_model > 0 .and.
lag_params%pressure_force) then + @:DEALLOCATE(grad_p_x) + @:DEALLOCATE(fd_coeff_x_pgrad) + if (n > 0) then + @:DEALLOCATE(grad_p_y) + @:DEALLOCATE(fd_coeff_y_pgrad) + if (p > 0) then + @:DEALLOCATE(grad_p_z) + @:DEALLOCATE(fd_coeff_z_pgrad) + end if + end if + end if + + ! Deallocate cell list arrays + @:DEALLOCATE(cell_list_start) + @:DEALLOCATE(cell_list_count) + @:DEALLOCATE(cell_list_idx) + end subroutine s_finalize_lagrangian_solver end module m_bubbles_EL diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp index 784abc5adb..f298a883e4 100644 --- a/src/simulation/m_bubbles_EL_kernels.fpp +++ b/src/simulation/m_bubbles_EL_kernels.fpp @@ -11,6 +11,22 @@ module m_bubbles_EL_kernels implicit none + ! Cell-centered pressure gradients (precomputed for translational motion) + real(wp), allocatable, dimension(:, :, :) :: grad_p_x, grad_p_y, grad_p_z + $:GPU_DECLARE(create='[grad_p_x, grad_p_y, grad_p_z]') + + ! Finite-difference coefficients for pressure gradient computation + real(wp), allocatable, dimension(:, :) :: fd_coeff_x_pgrad + real(wp), allocatable, dimension(:, :) :: fd_coeff_y_pgrad + real(wp), allocatable, dimension(:, :) :: fd_coeff_z_pgrad + $:GPU_DECLARE(create='[fd_coeff_x_pgrad, fd_coeff_y_pgrad, fd_coeff_z_pgrad]') + + ! Cell list for bubble-to-cell mapping (rebuilt each RK stage before smearing) + integer, allocatable, dimension(:, :, :) :: cell_list_start ! (0:m, 0:n, 0:p) + integer, allocatable, dimension(:, :, :) :: cell_list_count ! (0:m, 0:n, 0:p) + integer, allocatable, dimension(:) :: cell_list_idx ! (1:nBubs_glb) sorted bubble indices + $:GPU_DECLARE(create='[cell_list_start, cell_list_count, cell_list_idx]') + contains !> The purpose of this subroutine is to smear the strength of the lagrangian @@ -21,177 +37,259 @@ contains !! @param lbk_s Computational coordinates of the bubbles !! @param lbk_pos Spatial coordinates of the bubbles !! 
@param updatedvar Eulerian variable to be updated - subroutine s_smoothfunction(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar) + subroutine s_smoothfunction(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar, kcomp) integer, intent(in) :: nBubs real(wp), dimension(1:lag_params%nBubs_glb, 1:3, 1:2), intent(in) :: lbk_s, lbk_pos real(wp), dimension(1:lag_params%nBubs_glb, 1:2), intent(in) :: lbk_rad, lbk_vel type(scalar_field), dimension(:), intent(inout) :: updatedvar + type(scalar_field), dimension(:), intent(inout) :: kcomp smoothfunc:select case(lag_params%smooth_type) case (1) - call s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar) + call s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar, kcomp) case (2) - call s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar) + call s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar, kcomp) end select smoothfunc end subroutine s_smoothfunction - !> The purpose of this procedure contains the algorithm to use the delta kernel function to map the effect of the bubbles. - !! The effect of the bubbles only affects the cell where the bubble is located. - subroutine s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar) + !> Builds a sorted cell list mapping each interior cell (0:m,0:n,0:p) to its + !! resident bubbles. Uses a counting-sort on the host (O(nBubs + N_cells)). + !! Must be called before s_gaussian each RK stage. + !! @param nBubs Number of lagrangian bubbles in the current domain + !! 
@param lbk_s Computational coordinates of the bubbles + subroutine s_build_cell_list(nBubs, lbk_s) integer, intent(in) :: nBubs real(wp), dimension(1:lag_params%nBubs_glb, 1:3, 1:2), intent(in) :: lbk_s - real(wp), dimension(1:lag_params%nBubs_glb, 1:2), intent(in) :: lbk_rad, lbk_vel - type(scalar_field), dimension(:), intent(inout) :: updatedvar - integer, dimension(3) :: cell - real(wp) :: strength_vel, strength_vol - - real(wp) :: addFun1, addFun2, addFun3 - real(wp) :: volpart, Vol + integer :: l, ci, cj, ck, idx real(wp), dimension(3) :: s_coord - integer :: l - $:GPU_PARALLEL_LOOP(private='[l,s_coord,cell]') + ! Bring current bubble positions to host + $:GPU_UPDATE(host='[lbk_s]') + + ! Pass 1: zero counts and count bubbles per cell + cell_list_count = 0 do l = 1, nBubs + s_coord(1:3) = lbk_s(l, 1:3, 2) + ci = int(s_coord(1)) + cj = int(s_coord(2)) + ck = int(s_coord(3)) + ! Clamp to interior (bubbles should already be in [0:m,0:n,0:p]) + ci = max(0, min(ci, m)) + cj = max(0, min(cj, n)) + ck = max(0, min(ck, p)) + cell_list_count(ci, cj, ck) = cell_list_count(ci, cj, ck) + 1 + end do + + ! Prefix sum to compute start indices (1-based into cell_list_idx) + idx = 1 + do ck = 0, p + do cj = 0, n + do ci = 0, m + cell_list_start(ci, cj, ck) = idx + idx = idx + cell_list_count(ci, cj, ck) + end do + end do + end do - volpart = 4._wp/3._wp*pi*lbk_rad(l, 2)**3._wp + ! Pass 2: place bubble indices into cell_list_idx + ! Temporarily reuse cell_list_count as a running offset + cell_list_count = 0 + do l = 1, nBubs s_coord(1:3) = lbk_s(l, 1:3, 2) - call s_get_cell(s_coord, cell) + ci = int(s_coord(1)) + cj = int(s_coord(2)) + ck = int(s_coord(3)) + ci = max(0, min(ci, m)) + cj = max(0, min(cj, n)) + ck = max(0, min(ck, p)) + cell_list_idx(cell_list_start(ci, cj, ck) + cell_list_count(ci, cj, ck)) = l + cell_list_count(ci, cj, ck) = cell_list_count(ci, cj, ck) + 1 + end do - strength_vol = volpart - strength_vel = 4._wp*pi*lbk_rad(l, 2)**2._wp*lbk_vel(l, 2) + ! 
Send cell list arrays to GPU + $:GPU_UPDATE(device='[cell_list_start, cell_list_count, cell_list_idx]') - if (num_dims == 2) then - Vol = dx(cell(1))*dy(cell(2))*lag_params%charwidth - if (cyl_coord) Vol = dx(cell(1))*dy(cell(2))*y_cc(cell(2))*2._wp*pi - else - Vol = dx(cell(1))*dy(cell(2))*dz(cell(3)) - end if + end subroutine s_build_cell_list - !Update void fraction field - addFun1 = strength_vol/Vol - $:GPU_ATOMIC(atomic='update') - updatedvar(1)%sf(cell(1), cell(2), cell(3)) = updatedvar(1)%sf(cell(1), cell(2), cell(3)) + real(addFun1, kind=stp) - - !Update time derivative of void fraction - addFun2 = strength_vel/Vol - $:GPU_ATOMIC(atomic='update') - updatedvar(2)%sf(cell(1), cell(2), cell(3)) = updatedvar(2)%sf(cell(1), cell(2), cell(3)) + real(addFun2, kind=stp) - - !Product of two smeared functions - !Update void fraction * time derivative of void fraction - if (lag_params%cluster_type >= 4) then - addFun3 = (strength_vol*strength_vel)/Vol - $:GPU_ATOMIC(atomic='update') - updatedvar(5)%sf(cell(1), cell(2), cell(3)) = updatedvar(5)%sf(cell(1), cell(2), cell(3)) + real(addFun3, kind=stp) - end if + !> Cell-centric delta-function smearing using the cell list (no GPU atomics). + !! Each bubble only affects the cell it resides in. The outer GPU loop + !! iterates over interior cells and sums contributions from resident bubbles. 
+ subroutine s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar, kcomp) + + integer, intent(in) :: nBubs + real(wp), dimension(1:lag_params%nBubs_glb, 1:3, 1:2), intent(in) :: lbk_s + real(wp), dimension(1:lag_params%nBubs_glb, 1:2), intent(in) :: lbk_rad, lbk_vel + type(scalar_field), dimension(:), intent(inout) :: updatedvar + type(scalar_field), dimension(:), intent(inout) :: kcomp + + real(wp) :: strength_vel, strength_vol + real(wp) :: volpart, Vol + real(wp) :: y_kahan, t_kahan + integer :: i, j, k, lb, bub_idx + + $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,lb,bub_idx,volpart,Vol,strength_vel,strength_vol,y_kahan,t_kahan]') + do k = 0, p + do j = 0, n + do i = 0, m + + ! Cell volume + if (num_dims == 2) then + Vol = dx(i)*dy(j)*lag_params%charwidth + if (cyl_coord) Vol = dx(i)*dy(j)*y_cc(j)*2._wp*pi + else + Vol = dx(i)*dy(j)*dz(k) + end if + + ! Loop over bubbles in this cell + $:GPU_LOOP(parallelism='[seq]') + do lb = cell_list_start(i, j, k), & + cell_list_start(i, j, k) + cell_list_count(i, j, k) - 1 + + bub_idx = cell_list_idx(lb) + + volpart = 4._wp/3._wp*pi*lbk_rad(bub_idx, 2)**3._wp + strength_vol = volpart + strength_vel = 4._wp*pi*lbk_rad(bub_idx, 2)**2._wp*lbk_vel(bub_idx, 2) + + ! Kahan summation for void fraction + y_kahan = real(strength_vol/Vol, kind=wp) - kcomp(1)%sf(i, j, k) + t_kahan = updatedvar(1)%sf(i, j, k) + y_kahan + kcomp(1)%sf(i, j, k) = (t_kahan - updatedvar(1)%sf(i, j, k)) - y_kahan + updatedvar(1)%sf(i, j, k) = t_kahan + + ! Kahan summation for time derivative of void fraction + y_kahan = real(strength_vel/Vol, kind=wp) - kcomp(2)%sf(i, j, k) + t_kahan = updatedvar(2)%sf(i, j, k) + y_kahan + kcomp(2)%sf(i, j, k) = (t_kahan - updatedvar(2)%sf(i, j, k)) - y_kahan + updatedvar(2)%sf(i, j, k) = t_kahan + + ! 
Product of two smeared functions + if (lag_params%cluster_type >= 4) then + y_kahan = real((strength_vol*strength_vel)/Vol, kind=wp) - kcomp(5)%sf(i, j, k) + t_kahan = updatedvar(5)%sf(i, j, k) + y_kahan + kcomp(5)%sf(i, j, k) = (t_kahan - updatedvar(5)%sf(i, j, k)) - y_kahan + updatedvar(5)%sf(i, j, k) = t_kahan + end if + end do + + end do + end do end do $:END_GPU_PARALLEL_LOOP() end subroutine s_deltafunc - !> The purpose of this procedure contains the algorithm to use the gaussian kernel function to map the effect of the bubbles. - !! The effect of the bubbles affects the 3X3x3 cells that surround the bubble. - subroutine s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar) + !> Cell-centric gaussian smearing using the cell list (no GPU atomics). + !! Each grid cell accumulates contributions from nearby bubbles looked up + !! via cell_list_start/count/idx. + subroutine s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar, kcomp) integer, intent(in) :: nBubs real(wp), dimension(1:lag_params%nBubs_glb, 1:3, 1:2), intent(in) :: lbk_s, lbk_pos real(wp), dimension(1:lag_params%nBubs_glb, 1:2), intent(in) :: lbk_rad, lbk_vel type(scalar_field), dimension(:), intent(inout) :: updatedvar + type(scalar_field), dimension(:), intent(inout) :: kcomp - real(wp), dimension(3) :: center - integer, dimension(3) :: cell - real(wp) :: stddsv + real(wp), dimension(3) :: center, nodecoord, s_coord + integer, dimension(3) :: cell, cellijk + real(wp) :: stddsv, volpart real(wp) :: strength_vel, strength_vol - - real(wp), dimension(3) :: nodecoord - real(wp) :: addFun1, addFun2, addFun3 - real(wp) :: func, func2, volpart - integer, dimension(3) :: cellaux - real(wp), dimension(3) :: s_coord - integer :: l, i, j, k - logical :: celloutside - integer :: smearGrid, smearGridz - - smearGrid = mapCells - (-mapCells) + 1 ! 
Include the cell that contains the bubble (3+1+3) - smearGridz = smearGrid - if (p == 0) smearGridz = 1 - - $:GPU_PARALLEL_LOOP(private='[nodecoord,l,s_coord,cell,center]', copyin='[smearGrid,smearGridz]') - do l = 1, nBubs - nodecoord(1:3) = 0 - center(1:3) = 0._wp - volpart = 4._wp/3._wp*pi*lbk_rad(l, 2)**3._wp - s_coord(1:3) = lbk_s(l, 1:3, 2) - center(1:2) = lbk_pos(l, 1:2, 2) - if (p > 0) center(3) = lbk_pos(l, 3, 2) - call s_get_cell(s_coord, cell) - call s_compute_stddsv(cell, volpart, stddsv) - - strength_vol = volpart - strength_vel = 4._wp*pi*lbk_rad(l, 2)**2._wp*lbk_vel(l, 2) - - $:GPU_LOOP(collapse=3,private='[cellaux,nodecoord]') - do i = 1, smearGrid - do j = 1, smearGrid - do k = 1, smearGridz - cellaux(1) = cell(1) + i - (mapCells + 1) - cellaux(2) = cell(2) + j - (mapCells + 1) - cellaux(3) = cell(3) + k - (mapCells + 1) - if (p == 0) cellaux(3) = 0 - - !Check if the cells intended to smear the bubbles in are in the computational domain - !and redefine the cells for symmetric boundary - call s_check_celloutside(cellaux, celloutside) - - if (.not. celloutside) then - - nodecoord(1) = x_cc(cellaux(1)) - nodecoord(2) = y_cc(cellaux(2)) - if (p > 0) nodecoord(3) = z_cc(cellaux(3)) - call s_applygaussian(center, cellaux, nodecoord, stddsv, 0._wp, func) - if (lag_params%cluster_type >= 4) call s_applygaussian(center, cellaux, nodecoord, stddsv, 1._wp, func2) - - ! 
Relocate cells for bubbles intersecting symmetric boundaries - if (any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == BC_REFLECTIVE)) then - call s_shift_cell_symmetric_bc(cellaux, cell) - end if - else - func = 0._wp - func2 = 0._wp - cellaux(1) = cell(1) - cellaux(2) = cell(2) - cellaux(3) = cell(3) - if (p == 0) cellaux(3) = 0 - end if - - !Update void fraction field - addFun1 = func*strength_vol - $:GPU_ATOMIC(atomic='update') - updatedvar(1)%sf(cellaux(1), cellaux(2), cellaux(3)) = & - updatedvar(1)%sf(cellaux(1), cellaux(2), cellaux(3)) & - + real(addFun1, kind=stp) - - !Update time derivative of void fraction - addFun2 = func*strength_vel - $:GPU_ATOMIC(atomic='update') - updatedvar(2)%sf(cellaux(1), cellaux(2), cellaux(3)) = & - updatedvar(2)%sf(cellaux(1), cellaux(2), cellaux(3)) & - + real(addFun2, kind=stp) - - !Product of two smeared functions - !Update void fraction * time derivative of void fraction - if (lag_params%cluster_type >= 4) then - addFun3 = func2*strength_vol*strength_vel - $:GPU_ATOMIC(atomic='update') - updatedvar(5)%sf(cellaux(1), cellaux(2), cellaux(3)) = & - updatedvar(5)%sf(cellaux(1), cellaux(2), cellaux(3)) & - + real(addFun3, kind=stp) - end if + real(wp) :: func, func2 + real(wp) :: y_kahan, t_kahan + integer :: i, j, k, di, dj, dk, lb, bub_idx + integer :: di_beg, di_end, dj_beg, dj_end, dk_beg, dk_end + integer :: smear_x_beg, smear_x_end + integer :: smear_y_beg, smear_y_end + integer :: smear_z_beg, smear_z_end + + ! 
Extended grid range for smearing (includes buffer cells for MPI communication) + smear_x_beg = -mapCells - 1 + smear_x_end = m + mapCells + 1 + smear_y_beg = merge(-mapCells - 1, 0, n > 0) + smear_y_end = merge(n + mapCells + 1, n, n > 0) + smear_z_beg = merge(-mapCells - 1, 0, p > 0) + smear_z_end = merge(p + mapCells + 1, p, p > 0) + + $:GPU_PARALLEL_LOOP(collapse=3, & + & private='[i,j,k,di,dj,dk,lb,bub_idx,center,nodecoord,s_coord,cell,cellijk,stddsv,volpart,strength_vel,strength_vol,func,func2,y_kahan,t_kahan,di_beg,di_end,dj_beg,dj_end,dk_beg,dk_end]', & + & copyin='[smear_x_beg,smear_x_end,smear_y_beg,smear_y_end,smear_z_beg,smear_z_end]') + do k = smear_z_beg, smear_z_end + do j = smear_y_beg, smear_y_end + do i = smear_x_beg, smear_x_end + + cellijk(1) = i + cellijk(2) = j + cellijk(3) = k + + nodecoord(1) = x_cc(i) + nodecoord(2) = y_cc(j) + nodecoord(3) = 0._wp + if (p > 0) nodecoord(3) = z_cc(k) + + ! Neighbor cell range clamped to interior [0:m, 0:n, 0:p] + di_beg = max(i - mapCells, 0) + di_end = min(i + mapCells, m) + dj_beg = max(j - mapCells, 0) + dj_end = min(j + mapCells, n) + dk_beg = max(k - mapCells, 0) + dk_end = min(k + mapCells, p) + + $:GPU_LOOP(parallelism='[seq]') + do dk = dk_beg, dk_end + $:GPU_LOOP(parallelism='[seq]') + do dj = dj_beg, dj_end + $:GPU_LOOP(parallelism='[seq]') + do di = di_beg, di_end + $:GPU_LOOP(parallelism='[seq]') + do lb = cell_list_start(di, dj, dk), & + cell_list_start(di, dj, dk) + cell_list_count(di, dj, dk) - 1 + + bub_idx = cell_list_idx(lb) + + ! 
Bubble properties + volpart = 4._wp/3._wp*pi*lbk_rad(bub_idx, 2)**3._wp + s_coord(1:3) = lbk_s(bub_idx, 1:3, 2) + call s_get_cell(s_coord, cell) + call s_compute_stddsv(cell, volpart, stddsv) + + strength_vol = volpart + strength_vel = 4._wp*pi*lbk_rad(bub_idx, 2)**2._wp*lbk_vel(bub_idx, 2) + + center(1:2) = lbk_pos(bub_idx, 1:2, 2) + center(3) = 0._wp + if (p > 0) center(3) = lbk_pos(bub_idx, 3, 2) + + call s_applygaussian(center, cellijk, nodecoord, stddsv, 0._wp, func) + + ! Kahan summation for void fraction + y_kahan = real(func*strength_vol, kind=wp) - kcomp(1)%sf(i, j, k) + t_kahan = updatedvar(1)%sf(i, j, k) + y_kahan + kcomp(1)%sf(i, j, k) = (t_kahan - updatedvar(1)%sf(i, j, k)) - y_kahan + updatedvar(1)%sf(i, j, k) = t_kahan + + ! Kahan summation for time derivative of void fraction + y_kahan = real(func*strength_vel, kind=wp) - kcomp(2)%sf(i, j, k) + t_kahan = updatedvar(2)%sf(i, j, k) + y_kahan + kcomp(2)%sf(i, j, k) = (t_kahan - updatedvar(2)%sf(i, j, k)) - y_kahan + updatedvar(2)%sf(i, j, k) = t_kahan + + if (lag_params%cluster_type >= 4) then + call s_applygaussian(center, cellijk, nodecoord, stddsv, 1._wp, func2) + y_kahan = real(func2*strength_vol*strength_vel, kind=wp) - kcomp(5)%sf(i, j, k) + t_kahan = updatedvar(5)%sf(i, j, k) + y_kahan + kcomp(5)%sf(i, j, k) = (t_kahan - updatedvar(5)%sf(i, j, k)) - y_kahan + updatedvar(5)%sf(i, j, k) = t_kahan + end if + + end do + end do + end do end do + end do end do end do @@ -210,9 +308,10 @@ contains real(wp), intent(in) :: stddsv real(wp), intent(in) :: strength_idx real(wp), intent(out) :: func + integer :: i real(wp) :: distance - real(wp) :: theta, dtheta, L2, dzp, Lz2 + real(wp) :: theta, dtheta, L2, dzp, Lz2, zc real(wp) :: Nr, Nr_count distance = sqrt((center(1) - nodecoord(1))**2._wp + (center(2) - nodecoord(2))**2._wp + (center(3) - nodecoord(3))**2._wp) @@ -243,22 +342,16 @@ contains dtheta/2._wp/pi*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**(3._wp*(strength_idx + 1._wp)) end 
do else - - !< 2D cartesian function: - ! We smear particles considering a virtual depth (lag_params%charwidth) - theta = 0._wp - Nr = ceiling(lag_params%charwidth/(y_cb(cellaux(2)) - y_cb(cellaux(2) - 1))) - Nr_count = 1._wp - mapCells*1._wp - dzp = y_cb(cellaux(2) + 1) - y_cb(cellaux(2)) - Lz2 = (center(3) - (dzp*(0.5_wp + Nr_count) - lag_params%charwidth/2._wp))**2._wp - distance = sqrt((center(1) - nodecoord(1))**2._wp + (center(2) - nodecoord(2))**2._wp + Lz2) - func = dzp/lag_params%charwidth*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**3._wp - do while (Nr_count < Nr - 1._wp + ((mapCells - 1)*1._wp)) - Nr_count = Nr_count + 1._wp - Lz2 = (center(3) - (dzp*(0.5_wp + Nr_count) - lag_params%charwidth/2._wp))**2._wp + !< 2D cartesian function: Equation (48) from Madea and Colonius 2018 + ! We smear particles considering a virtual depth (lag_params%charwidth) with lag_params%charNz cells + dzp = (lag_params%charwidth/(lag_params%charNz + 1._wp)) + + func = 0._wp + do i = 0, lag_params%charNz + zc = (-lag_params%charwidth/2._wp + dzp*(0.5_wp + i)) ! Center of virtual cell i in z-direction + Lz2 = (center(3) - zc)**2._wp distance = sqrt((center(1) - nodecoord(1))**2._wp + (center(2) - nodecoord(2))**2._wp + Lz2) - func = func + & - dzp/lag_params%charwidth*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**(3._wp*(strength_idx + 1._wp)) + func = func + dzp/lag_params%charwidth*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**3._wp end do end if end if @@ -368,7 +461,7 @@ contains end if !< Compute Standard deviaton - if (((volpart/charvol) > 0.5_wp*lag_params%valmaxvoid) .or. (lag_params%smooth_type == 1)) then + if ((volpart/charvol) > 0.5_wp*lag_params%valmaxvoid .or. 
(lag_params%smooth_type == 1)) then rad = (3._wp*volpart/(4._wp*pi))**(1._wp/3._wp) stddsv = 1._wp*lag_params%epsilonb*max(chardist, rad) else @@ -420,4 +513,227 @@ contains end subroutine s_get_cell + !> Precomputes cell-centered pressure gradients (dp/dx, dp/dy, dp/dz) at all cell centers + !! using finite-difference coefficients of the specified order. This avoids + !! scattered memory accesses to the pressure field when computing translational + !! bubble forces. + !! @param q_prim_vf Primitive variables (pressure is at index E_idx) + subroutine s_compute_pressure_gradients(q_prim_vf) + + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + + integer :: i, j, k, r + + ! dp/dx at all cell centers + $:GPU_PARALLEL_LOOP(private='[i,j,k,r]', collapse=3) + do k = 0, p + do j = 0, n + do i = 0, m + grad_p_x(i, j, k) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = -fd_number, fd_number + grad_p_x(i, j, k) = grad_p_x(i, j, k) + & + q_prim_vf(E_idx)%sf(i + r, j, k)*fd_coeff_x_pgrad(r, i) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + ! dp/dy at all cell centers + if (n > 0) then + $:GPU_PARALLEL_LOOP(private='[i,j,k,r]', collapse=3) + do k = 0, p + do j = 0, n + do i = 0, m + grad_p_y(i, j, k) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = -fd_number, fd_number + grad_p_y(i, j, k) = grad_p_y(i, j, k) + & + q_prim_vf(E_idx)%sf(i, j + r, k)*fd_coeff_y_pgrad(r, j) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + ! dp/dz at all cell centers + if (p > 0) then + $:GPU_PARALLEL_LOOP(private='[i,j,k,r]', collapse=3) + do k = 0, p + do j = 0, n + do i = 0, m + grad_p_z(i, j, k) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = -fd_number, fd_number + grad_p_z(i, j, k) = grad_p_z(i, j, k) + & + q_prim_vf(E_idx)%sf(i, j, k + r)*fd_coeff_z_pgrad(r, k) + end do + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + end if + + end subroutine s_compute_pressure_gradients + + !! 
This function interpolates the velocity of Eulerian field at the position + !! of the bubble. + !! @param pos Position of the bubble in directiion i + !! @param cell Computational coordinates of the bubble + !! @param i Direction of the velocity (1: x, 2: y, 3: z) + !! @param q_prim_vf Eulerian field with primitive variables + !! @return v Interpolated velocity at the position of the bubble + function f_interpolate_velocity(pos, cell, i, q_prim_vf) result(v) + $:GPU_ROUTINE(parallelism='[seq]') + real(wp), intent(in) :: pos + integer, dimension(3), intent(in) :: cell + integer, intent(in) :: i + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + + real(wp) :: v + real(wp), dimension(fd_order + 1) :: xi, eta, L + + if (fd_order == 2) then + if (i == 1) then + xi(1) = x_cc(cell(1) - 1) + eta(1) = q_prim_vf(momxb)%sf(cell(1) - 1, cell(2), cell(3)) + xi(2) = x_cc(cell(1)) + eta(2) = q_prim_vf(momxb)%sf(cell(1), cell(2), cell(3)) + xi(3) = x_cc(cell(1) + 1) + eta(3) = q_prim_vf(momxb)%sf(cell(1) + 1, cell(2), cell(3)) + elseif (i == 2) then + xi(1) = y_cc(cell(2) - 1) + eta(1) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2) - 1, cell(3)) + xi(2) = y_cc(cell(2)) + eta(2) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2), cell(3)) + xi(3) = y_cc(cell(2) + 1) + eta(3) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2) + 1, cell(3)) + elseif (i == 3) then + xi(1) = z_cc(cell(3) - 1) + eta(1) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3) - 1) + xi(2) = z_cc(cell(3)) + eta(2) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3)) + xi(3) = z_cc(cell(3) + 1) + eta(3) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3) + 1) + end if + + L(1) = ((pos - xi(2))*(pos - xi(3)))/((xi(1) - xi(2))*(xi(1) - xi(3))) + L(2) = ((pos - xi(1))*(pos - xi(3)))/((xi(2) - xi(1))*(xi(2) - xi(3))) + L(3) = ((pos - xi(1))*(pos - xi(2)))/((xi(3) - xi(1))*(xi(3) - xi(2))) + + v = L(1)*eta(1) + L(2)*eta(2) + L(3)*eta(3) + elseif (fd_order == 4) then + if (i == 1) then + xi(1) = x_cc(cell(1) - 2) + eta(1) = 
q_prim_vf(momxb)%sf(cell(1) - 2, cell(2), cell(3)) + xi(2) = x_cc(cell(1) - 1) + eta(2) = q_prim_vf(momxb)%sf(cell(1) - 1, cell(2), cell(3)) + xi(3) = x_cc(cell(1)) + eta(3) = q_prim_vf(momxb)%sf(cell(1), cell(2), cell(3)) + xi(4) = x_cc(cell(1) + 1) + eta(4) = q_prim_vf(momxb)%sf(cell(1) + 1, cell(2), cell(3)) + xi(5) = x_cc(cell(1) + 2) + eta(5) = q_prim_vf(momxb)%sf(cell(1) + 2, cell(2), cell(3)) + elseif (i == 2) then + xi(1) = y_cc(cell(2) - 2) + eta(1) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2) - 2, cell(3)) + xi(2) = y_cc(cell(2) - 1) + eta(2) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2) - 1, cell(3)) + xi(3) = y_cc(cell(2)) + eta(3) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2), cell(3)) + xi(4) = y_cc(cell(2) + 1) + eta(4) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2) + 1, cell(3)) + xi(5) = y_cc(cell(2) + 2) + eta(5) = q_prim_vf(momxb + 1)%sf(cell(1), cell(2) + 2, cell(3)) + elseif (i == 3) then + xi(1) = z_cc(cell(3) - 2) + eta(1) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3) - 2) + xi(2) = z_cc(cell(3) - 1) + eta(2) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3) - 1) + xi(3) = z_cc(cell(3)) + eta(3) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3)) + xi(4) = z_cc(cell(3) + 1) + eta(4) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3) + 1) + xi(5) = z_cc(cell(3) + 2) + eta(5) = q_prim_vf(momxe)%sf(cell(1), cell(2), cell(3) + 2) + end if + + L(1) = ((pos - xi(2))*(pos - xi(3))*(pos - xi(4))*(pos - xi(5)))/ & + ((xi(1) - xi(2))*(xi(1) - xi(3))*(xi(1) - xi(4))*(xi(2) - xi(5))) + L(2) = ((pos - xi(1))*(pos - xi(3))*(pos - xi(4))*(pos - xi(5)))/ & + ((xi(2) - xi(1))*(xi(2) - xi(3))*(xi(2) - xi(4))*(xi(2) - xi(5))) + L(3) = ((pos - xi(1))*(pos - xi(2))*(pos - xi(4))*(pos - xi(5)))/ & + ((xi(3) - xi(1))*(xi(3) - xi(2))*(xi(3) - xi(4))*(xi(3) - xi(5))) + L(4) = ((pos - xi(1))*(pos - xi(2))*(pos - xi(3))*(pos - xi(4)))/ & + ((xi(4) - xi(1))*(xi(4) - xi(2))*(xi(4) - xi(3))*(xi(4) - xi(5))) + L(5) = ((pos - xi(1))*(pos - xi(2))*(pos - xi(3))*(pos - xi(4)))/ & + ((xi(5) - 
xi(1))*(xi(5) - xi(2))*(xi(5) - xi(3))*(xi(5) - xi(4))) + + v = L(1)*eta(1) + L(2)*eta(2) + L(3)*eta(3) + L(4)*eta(4) + L(5)*eta(5) + end if + + end function f_interpolate_velocity + + !! This function calculates the force on a bubble + !! based on the pressure gradient, velocity, and drag model. + !! @param pos Position of the bubble in direction i + !! @param rad Radius of the bubble + !! @param rdot Radial velocity of the bubble + !! @param vel Velocity of the bubble + !! @param mg Mass of the gas in the bubble + !! @param mv Mass of the liquid in the bubble + !! @param Re Reynolds number + !! @param rho Density of the fluid + !! @param cell Computational coordinates of the bubble + !! @param i Direction of the velocity (1: x, 2: y, 3: z) + !! @param q_prim_vf Eulerian field with primitive variables + !! @return a Acceleration of the bubble in direction i + function f_get_bubble_force(pos, rad, rdot, vel, mg, mv, Re, rho, cell, i, q_prim_vf) result(force) + $:GPU_ROUTINE(parallelism='[seq]') + real(wp), intent(in) :: pos, rad, rdot, mg, mv, Re, rho, vel + integer, dimension(3), intent(in) :: cell + integer, intent(in) :: i + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + + real(wp) :: dp, vol, force + real(wp) :: v_rel + + if (fd_order > 1) then + v_rel = vel - f_interpolate_velocity(pos, cell, i, q_prim_vf) + else + v_rel = vel - q_prim_vf(momxb + i - 1)%sf(cell(1), cell(2), cell(3)) + end if + + force = 0._wp + + if (lag_params%drag_model == 1) then ! Free slip Stokes drag + force = force - (4._wp*pi*rad*v_rel)/Re + else if (lag_params%drag_model == 2) then ! No slip Stokes drag + force = force - (6._wp*pi*rad*v_rel)/Re + else if (lag_params%drag_model == 3) then ! Levich drag + force = force - (12._wp*pi*rad*v_rel)/Re + end if + + if (lag_pressure_force) then + ! 
Use precomputed cell-centered pressure gradients + if (i == 1) then + dp = grad_p_x(cell(1), cell(2), cell(3)) + elseif (i == 2) then + dp = grad_p_y(cell(1), cell(2), cell(3)) + elseif (i == 3) then + dp = grad_p_z(cell(1), cell(2), cell(3)) + end if + + vol = (4._wp/3._wp)*pi*(rad**3._wp) + force = force - vol*dp + end if + + if (lag_params%gravity_force) then + force = force + (mg + mv)*accel_bf(i) + end if + + end function f_get_bubble_force + end module m_bubbles_EL_kernels diff --git a/src/simulation/m_compute_levelset.fpp b/src/simulation/m_compute_levelset.fpp index 0663ae73ca..f5d07f9e58 100644 --- a/src/simulation/m_compute_levelset.fpp +++ b/src/simulation/m_compute_levelset.fpp @@ -102,8 +102,8 @@ contains radius = patch_ib(ib_patch_id)%radius - dist_vec(1) = x_cc(i) - patch_ib(ib_patch_id)%x_centroid - real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - dist_vec(2) = y_cc(j) - patch_ib(ib_patch_id)%y_centroid - real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) + dist_vec(1) = x_cc(i) - patch_ib(ib_patch_id)%x_centroid - real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + dist_vec(2) = y_cc(j) - patch_ib(ib_patch_id)%y_centroid - real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) dist_vec(3) = 0._wp dist = sqrt(sum(dist_vec**2)) @@ -137,8 +137,8 @@ contains i = gp%loc(1) j = gp%loc(2) - center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) inverse_rotation(:, :) = patch_ib(ib_patch_id)%rotation_matrix_inverse(:, :) rotation(:, :) = patch_ib(ib_patch_id)%rotation_matrix(:, :) offset(:) = patch_ib(ib_patch_id)%centroid_offset(:) 
@@ -223,9 +223,9 @@ contains j = gp%loc(2) l = gp%loc(3) - center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(ib_patch_id)%z_centroid + real(gp%z_periodicity, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(ib_patch_id)%z_centroid + real(gp%z_periodicity, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) lz = patch_ib(ib_patch_id)%length_z inverse_rotation(:, :) = patch_ib(ib_patch_id)%rotation_matrix_inverse(:, :) rotation(:, :) = patch_ib(ib_patch_id)%rotation_matrix(:, :) @@ -328,8 +328,8 @@ contains length_x = patch_ib(ib_patch_id)%length_x length_y = patch_ib(ib_patch_id)%length_y - center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) inverse_rotation(:, :) = patch_ib(ib_patch_id)%rotation_matrix_inverse(:, :) rotation(:, :) = patch_ib(ib_patch_id)%rotation_matrix(:, :) @@ -399,8 +399,8 @@ contains length_x = patch_ib(ib_patch_id)%length_x length_y = patch_ib(ib_patch_id)%length_y - center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, 
wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) inverse_rotation(:, :) = patch_ib(ib_patch_id)%rotation_matrix_inverse(:, :) rotation(:, :) = patch_ib(ib_patch_id)%rotation_matrix(:, :) @@ -453,9 +453,9 @@ contains length_y = patch_ib(ib_patch_id)%length_y length_z = patch_ib(ib_patch_id)%length_z - center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(ib_patch_id)%z_centroid + real(gp%z_periodicity, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(ib_patch_id)%z_centroid + real(gp%z_periodicity, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) inverse_rotation(:, :) = patch_ib(ib_patch_id)%rotation_matrix_inverse(:, :) rotation(:, :) = patch_ib(ib_patch_id)%rotation_matrix(:, :) @@ -535,9 +535,9 @@ contains k = gp%loc(3) radius = patch_ib(ib_patch_id)%radius - periodicity(1) = real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - periodicity(2) = real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) - periodicity(3) = real(gp%z_periodicity, wp)*(z_domain%end - z_domain%beg) + periodicity(1) = real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + periodicity(2) = real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + periodicity(3) = real(gp%z_periodicity, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) center(1) = patch_ib(ib_patch_id)%x_centroid center(2) = patch_ib(ib_patch_id)%y_centroid center(3) = patch_ib(ib_patch_id)%z_centroid @@ -579,9 +579,9 @@ contains k = gp%loc(3) radius = patch_ib(ib_patch_id)%radius - 
center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(ib_patch_id)%z_centroid + real(gp%z_periodicity, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(ib_patch_id)%x_centroid + real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(ib_patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(ib_patch_id)%z_centroid + real(gp%z_periodicity, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) length(1) = patch_ib(ib_patch_id)%length_x length(2) = patch_ib(ib_patch_id)%length_y length(3) = patch_ib(ib_patch_id)%length_z @@ -658,10 +658,10 @@ contains total_vertices = gpu_total_vertices(patch_id) center = 0._wp - if (.not. f_is_default(patch_ib(patch_id)%x_centroid)) center(1) = patch_ib(patch_id)%x_centroid + real(gp%x_periodicity, wp)*(x_domain%end - x_domain%beg) - if (.not. f_is_default(patch_ib(patch_id)%y_centroid)) center(2) = patch_ib(patch_id)%y_centroid + real(gp%y_periodicity, wp)*(y_domain%end - y_domain%beg) + if (.not. f_is_default(patch_ib(patch_id)%x_centroid)) center(1) = patch_ib(patch_id)%x_centroid + real(gp%x_periodicity, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + if (.not. f_is_default(patch_ib(patch_id)%y_centroid)) center(2) = patch_ib(patch_id)%y_centroid + real(gp%y_periodicity, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) if (p > 0) then - if (.not. f_is_default(patch_ib(patch_id)%z_centroid)) center(3) = patch_ib(patch_id)%z_centroid + real(gp%z_periodicity, wp)*(z_domain%end - z_domain%beg) + if (.not. 
f_is_default(patch_ib(patch_id)%z_centroid)) center(3) = patch_ib(patch_id)%z_centroid + real(gp%z_periodicity, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) end if inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :) diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp index 068fcb38b1..9db92e1637 100644 --- a/src/simulation/m_data_output.fpp +++ b/src/simulation/m_data_output.fpp @@ -49,19 +49,8 @@ module m_data_output s_finalize_data_output_module, & s_write_ib_data_file - real(wp), allocatable, dimension(:, :, :) :: icfl_sf !< ICFL stability criterion - real(wp), allocatable, dimension(:, :, :) :: vcfl_sf !< VCFL stability criterion - real(wp), allocatable, dimension(:, :, :) :: ccfl_sf !< CCFL stability criterion - real(wp), allocatable, dimension(:, :, :) :: Rc_sf !< Rc stability criterion real(wp), public, allocatable, dimension(:, :) :: c_mass - $:GPU_DECLARE(create='[icfl_sf,vcfl_sf,ccfl_sf,Rc_sf,c_mass]') - - real(wp) :: icfl_max_loc, icfl_max_glb !< ICFL stability extrema on local and global grids - real(wp) :: vcfl_max_loc, vcfl_max_glb !< VCFL stability extrema on local and global grids - real(wp) :: ccfl_max_loc, ccfl_max_glb !< CCFL stability extrema on local and global grids - real(wp) :: Rc_min_loc, Rc_min_glb !< Rc stability extrema on local and global grids - $:GPU_DECLARE(create='[icfl_max_loc,icfl_max_glb,vcfl_max_loc,vcfl_max_glb]') - $:GPU_DECLARE(create='[ccfl_max_loc,ccfl_max_glb,Rc_min_loc,Rc_min_glb]') + $:GPU_DECLARE(create='[c_mass]') !> @name ICFL, VCFL, CCFL and Rc stability criteria extrema over all the time-steps !> @{ @@ -165,6 +154,10 @@ contains trim('VCFL Max'), trim('Rc Min') end if + if (bubbles_lagrange) then + write (3, '(13X,A10)', advance="no") trim('N Bubbles') + end if + write (3, *) ! new line end subroutine s_open_run_time_information_file @@ -283,9 +276,18 @@ contains real(wp) :: H !< Cell-avg. enthalpy real(wp), dimension(2) :: Re !< Cell-avg. 
Reynolds numbers integer :: j, k, l - + real(wp) :: icfl_max_loc, icfl_max_glb !< ICFL stability extrema on local and global grids + real(wp) :: vcfl_max_loc, vcfl_max_glb !< VCFL stability extrema on local and global grids + real(wp) :: ccfl_max_loc, ccfl_max_glb !< CCFL stability extrema on local and global grids + real(wp) :: Rc_min_loc, Rc_min_glb !< Rc stability extrema on local and global grids + real(wp) :: icfl, vcfl, Rc + + icfl_max_loc = 0._wp + vcfl_max_loc = 0._wp + Rc_min_loc = huge(1.0_wp) ! Computing Stability Criteria at Current Time-step - $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,vel, alpha, Re, rho, vel_sum, pres, gamma, pi_inf, c, H, qv]') + $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,vel, alpha, Re, rho, vel_sum, pres, gamma, pi_inf, c, H, qv, icfl, vcfl, Rc]', & + & reduction='[[icfl_max_loc,vcfl_max_loc],[Rc_min_loc]]', reductionOp='[max,min]') do l = 0, p do k = 0, n do j = 0, m @@ -294,57 +296,35 @@ contains call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, alpha, vel_sum, 0._wp, c, qv) if (viscous) then - call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl_sf, vcfl_sf, Rc_sf) + call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl, vcfl, Rc) else - call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl_sf) + call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl) end if + icfl_max_loc = max(icfl_max_loc, icfl) + vcfl_max_loc = max(vcfl_max_loc, merge(vcfl, 0.0_wp, viscous)) + Rc_min_loc = min(Rc_min_loc, merge(Rc, huge(1.0_wp), viscous)) end do end do end do $:END_GPU_PARALLEL_LOOP() - ! end: Computing Stability Criteria at Current Time-step - ! 
Determining local stability criteria extrema at current time-step - -#ifdef _CRAYFTN - $:GPU_UPDATE(host='[icfl_sf]') - - if (viscous) then - $:GPU_UPDATE(host='[vcfl_sf,Rc_sf]') - end if - - icfl_max_loc = maxval(icfl_sf) - - if (viscous) then - vcfl_max_loc = maxval(vcfl_sf) - Rc_min_loc = minval(Rc_sf) - end if -#else - #:call GPU_PARALLEL(copyout='[icfl_max_loc]', copyin='[icfl_sf]') - icfl_max_loc = maxval(icfl_sf) - #:endcall GPU_PARALLEL - if (viscous .or. dummy) then - #:call GPU_PARALLEL(copyout='[vcfl_max_loc, Rc_min_loc]', copyin='[vcfl_sf,Rc_sf]') - vcfl_max_loc = maxval(vcfl_sf) - Rc_min_loc = minval(Rc_sf) - #:endcall GPU_PARALLEL - end if -#endif - ! Determining global stability criteria extrema at current time-step if (num_procs > 1) then call s_mpi_reduce_stability_criteria_extrema(icfl_max_loc, & vcfl_max_loc, & Rc_min_loc, & + n_el_bubs_loc, & icfl_max_glb, & vcfl_max_glb, & - Rc_min_glb) + Rc_min_glb, & + n_el_bubs_glb) else icfl_max_glb = icfl_max_loc if (viscous) vcfl_max_glb = vcfl_max_loc if (viscous) Rc_min_glb = Rc_min_loc + if (bubbles_lagrange) n_el_bubs_glb = n_el_bubs_loc end if ! Determining the stability criteria extrema over all the time-steps @@ -366,6 +346,10 @@ contains Rc_min_glb end if + if (bubbles_lagrange) then + write (3, '(13X,I10)', advance="no") n_el_bubs_glb + end if + write (3, *) ! new line if (.not. f_approx_equal(icfl_max_glb, icfl_max_glb)) then @@ -383,6 +367,12 @@ contains call s_mpi_abort('VCFL is greater than 1.0. Exiting.') end if end if + + if (bubbles_lagrange) then + if (n_el_bubs_glb == 0) then + call s_mpi_abort('No Lagrangian bubbles remain in the domain. 
Exiting.') + end if + end if end if call s_mpi_barrier() @@ -1875,7 +1865,7 @@ contains write (3, '(A,F9.6)') 'ICFL Max: ', icfl_max if (viscous) write (3, '(A,F9.6)') 'VCFL Max: ', vcfl_max - if (viscous) write (3, '(A,F10.6)') 'Rc Min: ', Rc_min + if (viscous) write (3, '(A,ES16.6)') 'Rc Min: ', Rc_min call cpu_time(run_time) @@ -1916,15 +1906,10 @@ contains ! Allocating/initializing ICFL, VCFL, CCFL and Rc stability criteria if (run_time_info) then - @:ALLOCATE(icfl_sf(0:m, 0:n, 0:p)) icfl_max = 0._wp - if (viscous) then - @:ALLOCATE(vcfl_sf(0:m, 0:n, 0:p)) - @:ALLOCATE(Rc_sf (0:m, 0:n, 0:p)) - vcfl_max = 0._wp - Rc_min = 1.e3_wp + Rc_min = 1.e12_wp end if end if @@ -1954,14 +1939,6 @@ contains @:DEALLOCATE(c_mass) end if - if (run_time_info) then - ! Deallocating the ICFL, VCFL, CCFL, and Rc stability criteria - @:DEALLOCATE(icfl_sf) - if (viscous) then - @:DEALLOCATE(vcfl_sf, Rc_sf) - end if - end if - if (down_sample) then do i = 1, sys_size deallocate (q_cons_temp_ds(i)%sf) diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index c1be537ecd..6aab245102 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -54,26 +54,23 @@ module m_global_parameters !> @name Cell-boundary (CB) locations in the x-, y- and z-directions, respectively !> @{ - real(wp), target, allocatable, dimension(:) :: x_cb, y_cb, z_cb + type(bounds_info), dimension(3) :: glb_bounds !< !> @} !> @name Cell-center (CC) locations in the x-, y- and z-directions, respectively !> @{ - real(wp), target, allocatable, dimension(:) :: x_cc, y_cc, z_cc !> @} - !type(bounds_info) :: x_domain, y_domain, z_domain !< - !! 
Locations of the domain bounds in the x-, y- and z-coordinate directions + !> @name Cell-width distributions in the x-, y- and z-directions, respectively !> @{ - real(wp), target, allocatable, dimension(:) :: dx, dy, dz !> @} real(wp) :: dt !< Size of the time-step - $:GPU_DECLARE(create='[x_cb,y_cb,z_cb,x_cc,y_cc,z_cc,dx,dy,dz,dt,m,n,p]') + $:GPU_DECLARE(create='[x_cb,y_cb,z_cb,x_cc,y_cc,z_cc,dx,dy,dz,dt,m,n,p,glb_bounds]') !> @name Starting time-step iteration, stopping time-step iteration and the number !! of time-step iterations between successive solution backups, respectively @@ -224,6 +221,7 @@ module m_global_parameters integer :: num_bc_patches logical :: bc_io + logical, dimension(3) :: periodic_bc !> @name Boundary conditions (BC) in the x-, y- and z-directions, respectively !> @{ type(int_bounds_info) :: bc_x, bc_y, bc_z @@ -235,10 +233,6 @@ module m_global_parameters #elif defined(MFC_OpenMP) $:GPU_DECLARE(create='[bc_x, bc_y, bc_z]') #endif - type(bounds_info) :: x_domain, y_domain, z_domain - $:GPU_DECLARE(create='[x_domain, y_domain, z_domain]') - real(wp) :: x_a, y_a, z_a - real(wp) :: x_b, y_b, z_b logical :: parallel_io !< Format of the data files logical :: file_per_process !< shared file or not when using parallel io @@ -249,6 +243,19 @@ module m_global_parameters integer, allocatable, dimension(:) :: proc_coords !< !! Processor coordinates in MPI_CART_COMM + type(bounds_info), allocatable, dimension(:) :: pcomm_coords + $:GPU_DECLARE(create='[pcomm_coords]') + !! Coordinates for EL particle transfer + + type(bounds_info), allocatable, dimension(:) :: pcomm_coords_ghost + $:GPU_DECLARE(create='[pcomm_coords_ghost]') + !! Coordinates for EL particle transfer + + type(int_bounds_info), dimension(3) :: nidx !< Indices for neighboring processors + + integer, allocatable, dimension(:, :, :) :: neighbor_ranks + !! Neighbor ranks + integer, allocatable, dimension(:) :: start_idx !< !! 
Starting cell-center index of local processor in global grid @@ -345,6 +352,10 @@ module m_global_parameters $:GPU_DECLARE(create='[buff_size]') + integer, allocatable :: beta_vars(:) !< Indices of variables to communicate for bubble/particle coupling + + $:GPU_DECLARE(create='[beta_vars]') + integer :: shear_num !! Number of shear stress components integer, dimension(3) :: shear_indices !< !! Indices of the stress components that represent shear stress @@ -368,6 +379,9 @@ module m_global_parameters ! Subgrid Bubble Parameters type(subgrid_bubble_physical_parameters) :: bub_pp + ! Subgrid Particle Parameters + type(subgrid_particle_physical_parameters) :: particle_pp + integer :: fd_order !< !! The order of the finite-difference (fd) approximations of the first-order !! derivatives that need to be evaluated when the CoM or flow probe data @@ -484,6 +498,13 @@ module m_global_parameters gam_v, gam_g, M_v, M_g, cp_v, cp_g, R_v, R_g $:GPU_DECLARE(create='[R0ref, p0ref, rho0ref, T0ref, ss, pv, vd, mu_l, mu_v, mu_g, & gam_v, gam_g, M_v, M_g, cp_v, cp_g, R_v, R_g]') + + !> @} + + !> @name Solid particle physical parameters + !> @{ + real(wp) :: cp_particle, rho0ref_particle + $:GPU_DECLARE(create='[rho0ref_particle, cp_particle]') !> @} !> @name Acoustic acoustic_source parameters @@ -532,8 +553,20 @@ module m_global_parameters !> @name lagrangian subgrid bubble parameters !> @{! 
logical :: bubbles_lagrange !< Lagrangian subgrid bubble model switch + logical :: particles_lagrange !< Lagrangian subgrid particle model switch type(bubbles_lagrange_parameters) :: lag_params !< Lagrange bubbles' parameters - $:GPU_DECLARE(create='[bubbles_lagrange,lag_params]') + integer :: n_el_bubs_loc, n_el_bubs_glb !< Number of Lagrangian bubbles (local and global) + integer :: n_el_particles_loc, n_el_particles_glb !< Number of Lagrangian particles (local and global) + logical :: moving_lag_bubbles + logical :: moving_lag_particles + logical :: lag_pressure_force + logical :: lag_gravity_force + integer :: lag_vel_model, lag_drag_model + $:GPU_DECLARE(create='[bubbles_lagrange,lag_params,n_el_bubs_loc,n_el_bubs_glb]') + $:GPU_DECLARE(create='[particles_lagrange,n_el_particles_loc,n_el_particles_glb]') + $:GPU_DECLARE(create='[moving_lag_particles]') + $:GPU_DECLARE(create='[moving_lag_bubbles, lag_vel_model, lag_drag_model]') + $:GPU_DECLARE(create='[lag_pressure_force,lag_gravity_force]') !> @} real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D) @@ -659,6 +692,7 @@ contains num_bc_patches = 0 bc_io = .false. + periodic_bc = .false. bc_x%beg = dflt_int; bc_x%end = dflt_int bc_y%beg = dflt_int; bc_y%end = dflt_int @@ -671,9 +705,9 @@ contains #:endfor #:endfor - x_domain%beg = dflt_real; x_domain%end = dflt_real - y_domain%beg = dflt_real; y_domain%end = dflt_real - z_domain%beg = dflt_real; z_domain%end = dflt_real + glb_bounds(1)%beg = dflt_real; glb_bounds(1)%end = dflt_real + glb_bounds(2)%beg = dflt_real; glb_bounds(2)%end = dflt_real + glb_bounds(3)%beg = dflt_real; glb_bounds(3)%end = dflt_real ! Fluids physical parameters do i = 1, num_fluids_max @@ -708,6 +742,10 @@ contains bub_pp%R_v = dflt_real; R_v = dflt_real bub_pp%R_g = dflt_real; R_g = dflt_real + ! Subgrid particle parameters + particle_pp%rho0ref_particle = dflt_real + particle_pp%cp_particle = dflt_real + ! 
Tait EOS rhoref = dflt_real pref = dflt_real @@ -836,10 +874,29 @@ contains lag_params%massTransfer_model = .false. lag_params%write_bubbles = .false. lag_params%write_bubbles_stats = .false. + lag_params%write_void_evol = .false. lag_params%nBubs_glb = dflt_int + lag_params%vel_model = dflt_int + lag_params%drag_model = dflt_int + lag_params%pressure_force = .true. + lag_params%gravity_force = .false. lag_params%epsilonb = 1._wp lag_params%charwidth = dflt_real + lag_params%charNz = dflt_int lag_params%valmaxvoid = dflt_real + lag_params%input_path = 'input/lag_bubbles.dat' + lag_params%nParticles_glb = dflt_int + lag_params%qs_drag_model = dflt_int + lag_params%stokes_drag = dflt_int + lag_params%added_mass_model = dflt_int + lag_params%interpolation_order = dflt_int + lag_params%collision_force = .false. + + moving_lag_bubbles = .false. + lag_vel_model = dflt_int + + particles_lagrange = .false. + moving_lag_particles = .false. ! Continuum damage model tau_star = dflt_real @@ -1209,6 +1266,16 @@ contains ! 
END: Volume Fraction Model + if (bubbles_lagrange) then + @:ALLOCATE(beta_vars(1:3)) + beta_vars(1:3) = [1, 2, 5] + $:GPU_UPDATE(device='[beta_vars]') + elseif (particles_lagrange) then + @:ALLOCATE(beta_vars(1:8)) + beta_vars(1:8) = [1, 2, 3, 4, 5, 6, 7, 8] + $:GPU_UPDATE(device='[beta_vars]') + end if + if (chemistry) then species_idx%beg = sys_size + 1 species_idx%end = sys_size + num_species @@ -1221,6 +1288,9 @@ contains elseif (bubbles_lagrange) then allocate (MPI_IO_DATA%view(1:sys_size + 1)) allocate (MPI_IO_DATA%var(1:sys_size + 1)) + elseif (particles_lagrange) then + allocate (MPI_IO_DATA%view(1:sys_size + 1)) + allocate (MPI_IO_DATA%var(1:sys_size + 1)) else allocate (MPI_IO_DATA%view(1:sys_size)) allocate (MPI_IO_DATA%var(1:sys_size)) @@ -1242,6 +1312,11 @@ contains allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p)) MPI_IO_DATA%var(i)%sf => null() end do + elseif (particles_lagrange) then + do i = 1, sys_size + 1 + allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p)) + MPI_IO_DATA%var(i)%sf => null() + end do end if ! Configuring the WENO average flag that will be used to regulate @@ -1260,23 +1335,19 @@ contains if (ib) allocate (MPI_IO_IB_DATA%var%sf(0:m, 0:n, 0:p)) Np = 0 - if (elasticity) then - fd_number = max(1, fd_order/2) - end if - + if (elasticity) fd_number = max(1, fd_order/2) if (mhd) then ! TODO merge with above; waiting for hyperelasticity PR fd_number = max(1, fd_order/2) end if - - if (probe_wrt) then - fd_number = max(1, fd_order/2) - end if + if (probe_wrt) fd_number = max(1, fd_order/2) + if (bubbles_lagrange) fd_number = max(1, fd_order/2) + if (particles_lagrange) fd_number = max(1, fd_order/2) call s_configure_coordinate_bounds(recon_type, weno_polyn, muscl_polyn, & igr_order, buff_size, & idwint, idwbuff, viscous, & - bubbles_lagrange, m, n, p, & - num_dims, igr, ib) + bubbles_lagrange, particles_lagrange, & + m, n, p, num_dims, igr, ib, fd_number) $:GPU_UPDATE(device='[idwint, idwbuff]') ! 
Configuring Coordinate Direction Indexes @@ -1397,6 +1468,8 @@ contains #:endif allocate (proc_coords(1:num_dims)) + @:ALLOCATE(pcomm_coords(1:num_dims)) + @:ALLOCATE(pcomm_coords_ghost(1:num_dims)) if (parallel_io .neqv. .true.) return @@ -1433,6 +1506,14 @@ contains end if deallocate (proc_coords) + + @:DEALLOCATE(pcomm_coords) + @:DEALLOCATE(pcomm_coords_ghost) + + if (bubbles_lagrange .or. particles_lagrange) then + @:DEALLOCATE(beta_vars) + end if + if (parallel_io) then deallocate (start_idx) @@ -1440,6 +1521,10 @@ contains do i = 1, sys_size + 1 MPI_IO_DATA%var(i)%sf => null() end do + elseif (particles_lagrange) then + do i = 1, sys_size + 1 + MPI_IO_DATA%var(i)%sf => null() + end do else do i = 1, sys_size MPI_IO_DATA%var(i)%sf => null() @@ -1461,6 +1546,10 @@ contains if (p == 0) return; @:DEALLOCATE(z_cb, z_cc, dz) + if (allocated(neighbor_ranks)) then + @:DEALLOCATE(neighbor_ranks) + end if + end subroutine s_finalize_global_parameters_module end module m_global_parameters diff --git a/src/simulation/m_ib_patches.fpp b/src/simulation/m_ib_patches.fpp index d725a3e5f3..12fcccbf3c 100644 --- a/src/simulation/m_ib_patches.fpp +++ b/src/simulation/m_ib_patches.fpp @@ -143,8 +143,8 @@ contains ! Transferring the circular patch's radius, centroid, smearing patch ! identity and smearing coefficient information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) radius = patch_ib(patch_id)%radius ! 
encode the periodicity information into the patch_id @@ -197,8 +197,8 @@ contains real(wp), dimension(1:2) :: center !< x and y coordinates in local IB frame real(wp), dimension(1:3, 1:3) :: inverse_rotation - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) ca_in = patch_ib(patch_id)%c pa = patch_ib(patch_id)%p ma = patch_ib(patch_id)%m @@ -360,9 +360,9 @@ contains real(wp), dimension(1:3) :: xyz_local, center, offset !< x, y, z coordinates in local IB frame real(wp), dimension(1:3, 1:3) :: inverse_rotation - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) lz = patch_ib(patch_id)%length_z ca_in = patch_ib(patch_id)%c pa = patch_ib(patch_id)%p @@ -533,8 +533,8 @@ contains real(wp), dimension(1:3, 1:3) :: inverse_rotation ! 
Transferring the rectangle's centroid and length information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) length(1) = patch_ib(patch_id)%length_x length(2) = patch_ib(patch_id)%length_y inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :) @@ -604,9 +604,9 @@ contains ! Transferring spherical patch's radius, centroid, smoothing patch ! identity and smoothing coefficient information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) radius = patch_ib(patch_id)%radius ! encode the periodicity information into the patch_id @@ -675,9 +675,9 @@ contains real(wp) :: corner_distance ! 
Transferring the cuboid's centroid and length information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) length(1) = patch_ib(patch_id)%length_x length(2) = patch_ib(patch_id)%length_y length(3) = patch_ib(patch_id)%length_z @@ -760,9 +760,9 @@ contains real(wp) :: corner_distance ! Transferring the cylindrical patch's centroid, length, radius, - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) length(1) = patch_ib(patch_id)%length_x length(2) = patch_ib(patch_id)%length_y length(3) = patch_ib(patch_id)%length_z @@ -845,8 +845,8 @@ contains real(wp), dimension(1:3, 1:3) :: inverse_rotation ! 
Transferring the ellipse's centroid and length information - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) ellipse_coeffs(1) = 0.5_wp*patch_ib(patch_id)%length_x ellipse_coeffs(2) = 0.5_wp*patch_ib(patch_id)%length_y inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :) @@ -905,8 +905,8 @@ contains real(wp), dimension(1:3, 1:3) :: inverse_rotation, rotation center = 0._wp - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) inverse_rotation(:, :) = patch_ib(patch_id)%rotation_matrix_inverse(:, :) rotation(:, :) = patch_ib(patch_id)%rotation_matrix(:, :) offset(:) = patch_ib(patch_id)%centroid_offset(:) @@ -987,9 +987,9 @@ contains real(wp), dimension(1:3) :: bbox_min, bbox_max, local_corner, world_corner center = 0._wp - center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(x_domain%end - x_domain%beg) - center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(y_domain%end - y_domain%beg) - center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(z_domain%end - z_domain%beg) + center(1) = patch_ib(patch_id)%x_centroid + real(xp, wp)*(glb_bounds(1)%end - glb_bounds(1)%beg) + center(2) = patch_ib(patch_id)%y_centroid + real(yp, wp)*(glb_bounds(2)%end - glb_bounds(2)%beg) + center(3) = patch_ib(patch_id)%z_centroid + real(zp, wp)*(glb_bounds(3)%end - glb_bounds(3)%beg) inverse_rotation(:, :) = 
patch_ib(patch_id)%rotation_matrix_inverse(:, :) offset(:) = patch_ib(patch_id)%centroid_offset(:) spc = patch_ib(patch_id)%model_spc @@ -1232,29 +1232,20 @@ contains integer, intent(out), optional :: zp_lower, zp_upper ! check domain wraps in x, y - #:for X in [('x'), ('y')] - ! check for periodicity - if (bc_${X}$%beg == BC_PERIODIC) then - ${X}$p_lower = -1 - ${X}$p_upper = 1 - else - !if it is not periodic, then both elements are 0 - ${X}$p_lower = 0 - ${X}$p_upper = 0 + #:for X, ID in [('x', 1), ('y', 2), ('z', 3)] + if (num_dims >= ${ID}$) then + ! check for periodicity + if (bc_${X}$%beg == BC_PERIODIC) then + ${X}$p_lower = -1 + ${X}$p_upper = 1 + else + !if it is not periodic, then both elements are 0 + ${X}$p_lower = 0 + ${X}$p_upper = 0 + end if end if #:endfor - ! z only if 3D - if (present(zp_lower) .and. p /= 0) then - if (bc_z%beg == BC_PERIODIC) then - zp_lower = -1 - zp_upper = 1 - else - zp_lower = 0 - zp_upper = 0 - end if - end if - end subroutine s_get_periodicities !> Archimedes spiral function diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp index b87f5a1b19..b8434b584c 100644 --- a/src/simulation/m_ibm.fpp +++ b/src/simulation/m_ibm.fpp @@ -99,10 +99,11 @@ contains $:GPU_UPDATE(device='[patch_ib(1:num_ibs)]') ! GPU routines require updated cell centers - $:GPU_UPDATE(device='[num_ibs, x_cc, y_cc, dx, dy, x_domain, y_domain]') + $:GPU_UPDATE(device='[num_ibs, x_cc, y_cc, dx, dy]') if (p /= 0) then - $:GPU_UPDATE(device='[z_cc, dz, z_domain]') + $:GPU_UPDATE(device='[z_cc, dz]') end if + $:GPU_UPDATE(device='[glb_bounds]') ! 
allocate STL models call s_instantiate_STL_models() @@ -504,7 +505,7 @@ contains print *, [x_cc(i), y_cc(j), z_cc(k)] end if print *, "We are searching in dimension ", dim, " for image point at ", ghost_points_in(q)%ip_loc(:) - print *, "Domain size: ", [x_cc(-buff_size), y_cc(-buff_size), z_cc(-buff_size)] + print *, "Domain size: " print *, "x: ", x_cc(-buff_size), " to: ", x_cc(m + buff_size - 1) print *, "y: ", y_cc(-buff_size), " to: ", y_cc(n + buff_size - 1) if (p /= 0) print *, "z: ", z_cc(-buff_size), " to: ", z_cc(p + buff_size - 1) @@ -599,7 +600,7 @@ contains gp_layers_z = gp_layers if (p == 0) gp_layers_z = 0 - $:GPU_PARALLEL_LOOP(private='[i,j,k,ii,jj,kk,is_gp,local_idx,patch_id,encoded_patch_id,xp,yp,zp]', copyin='[count,count_i, x_domain, y_domain, z_domain]', firstprivate='[gp_layers,gp_layers_z]', collapse=3) + $:GPU_PARALLEL_LOOP(private='[i,j,k,ii,jj,kk,is_gp,local_idx,patch_id,encoded_patch_id,xp,yp,zp]', copyin='[count,count_i,glb_bounds]', firstprivate='[gp_layers,gp_layers_z]', collapse=3) do i = 0, m do j = 0, n do k = 0, p @@ -633,26 +634,26 @@ contains ghost_points_in(local_idx)%z_periodicity = zp ghost_points_in(local_idx)%slip = patch_ib(patch_id)%slip - if ((x_cc(i) - dx(i)) < x_domain%beg) then + if ((x_cc(i) - dx(i)) < glb_bounds(1)%beg) then ghost_points_in(local_idx)%DB(1) = -1 - else if ((x_cc(i) + dx(i)) > x_domain%end) then + else if ((x_cc(i) + dx(i)) > glb_bounds(1)%end) then ghost_points_in(local_idx)%DB(1) = 1 else ghost_points_in(local_idx)%DB(1) = 0 end if - if ((y_cc(j) - dy(j)) < y_domain%beg) then + if ((y_cc(j) - dy(j)) < glb_bounds(2)%beg) then ghost_points_in(local_idx)%DB(2) = -1 - else if ((y_cc(j) + dy(j)) > y_domain%end) then + else if ((y_cc(j) + dy(j)) > glb_bounds(2)%end) then ghost_points_in(local_idx)%DB(2) = 1 else ghost_points_in(local_idx)%DB(2) = 0 end if if (p /= 0) then - if ((z_cc(k) - dz(k)) < z_domain%beg) then + if ((z_cc(k) - dz(k)) < glb_bounds(3)%beg) then ghost_points_in(local_idx)%DB(3) = -1 - 
else if ((z_cc(k) + dz(k)) > z_domain%end) then + else if ((z_cc(k) + dz(k)) > glb_bounds(3)%end) then ghost_points_in(local_idx)%DB(3) = 1 else ghost_points_in(local_idx)%DB(3) = 0 @@ -1293,33 +1294,21 @@ contains do patch_id = 1, num_ibs ! check domain wraps in x, y, - #:for X in [('x'), ('y')] - ! check for periodicity - if (bc_${X}$%beg == BC_PERIODIC) then - ! check if the boundary has left the domain, and then correct - if (patch_ib(patch_id)%${X}$_centroid < ${X}$_domain%beg) then - ! if the boundary exited "left", wrap it back around to the "right" - patch_ib(patch_id)%${X}$_centroid = patch_ib(patch_id)%${X}$_centroid + (${X}$_domain%end - ${X}$_domain%beg) - else if (patch_ib(patch_id)%${X}$_centroid > ${X}$_domain%end) then - ! if the boundary exited "right", wrap it back around to the "left" - patch_ib(patch_id)%${X}$_centroid = patch_ib(patch_id)%${X}$_centroid - (${X}$_domain%end - ${X}$_domain%beg) + #:for X, ID in [('x', 1), ('y', 2), ('z',3)] + if (num_dims >= ${ID}$) then + ! check for periodicity + if (bc_${X}$%beg == BC_PERIODIC) then + ! check if the boundary has left the domain, and then correct + if (patch_ib(patch_id)%${X}$_centroid < glb_bounds(${ID}$)%beg) then + ! if the boundary exited "left", wrap it back around to the "right" + patch_ib(patch_id)%${X}$_centroid = patch_ib(patch_id)%${X}$_centroid + (glb_bounds(${ID}$)%end - glb_bounds(${ID}$)%beg) + else if (patch_ib(patch_id)%${X}$_centroid > glb_bounds(${ID}$)%end) then + ! if the boundary exited "right", wrap it back around to the "left" + patch_ib(patch_id)%${X}$_centroid = patch_ib(patch_id)%${X}$_centroid - (glb_bounds(${ID}$)%end - glb_bounds(${ID}$)%beg) + end if end if end if #:endfor - - if (p /= 0) then - ! check for periodicity - if (bc_z%beg == BC_PERIODIC) then - ! check if the boundary has left the domain, and then correct - if (patch_ib(patch_id)%z_centroid < z_domain%beg) then - ! 
if the boundary exited "left", wrap it back around to the "right" - patch_ib(patch_id)%z_centroid = patch_ib(patch_id)%z_centroid + (z_domain%end - z_domain%beg) - else if (patch_ib(patch_id)%z_centroid > z_domain%end) then - ! if the boundary exited "right", wrap it back around to the "left" - patch_ib(patch_id)%z_centroid = patch_ib(patch_id)%z_centroid - (z_domain%end - z_domain%beg) - end if - end if - end if end do end subroutine s_wrap_periodic_ibs diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index a684c79ec4..b1ae741a72 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -41,6 +41,24 @@ module m_mpi_proxy integer :: i_halo_size $:GPU_DECLARE(create='[i_halo_size]') + integer, dimension(-1:1, -1:1, -1:1) :: p_send_counts, p_recv_counts + integer, dimension(:, :, :, :), allocatable :: p_send_ids + character(len=1), dimension(:), allocatable :: p_send_buff, p_recv_buff + integer :: p_buff_size, p_var_size + !! EL Bubbles communication variables + integer, parameter :: MAX_NEIGHBORS = 27 + integer :: send_requests(MAX_NEIGHBORS), recv_requests(MAX_NEIGHBORS) + integer :: recv_offsets(MAX_NEIGHBORS) + integer :: neighbor_list(MAX_NEIGHBORS, 3) + integer :: n_neighbors + $:GPU_DECLARE(create='[p_send_counts]') + integer, allocatable :: force_send_counts(:), force_recv_counts(:) + integer, allocatable :: force_send_ids(:, :) + integer, allocatable :: flat_send_ids(:) + real(wp), allocatable :: force_send_vals(:, :, :) + real(wp), allocatable :: flat_send_vals(:) + $:GPU_DECLARE(create='[force_send_counts, force_send_ids, force_send_vals]') + contains !> @brief Allocates immersed boundary communication buffers for MPI halo exchanges. @@ -70,6 +88,82 @@ contains end subroutine s_initialize_mpi_proxy_module + !! This subroutine initializes the MPI buffers and variables + !! required for the particle communication. + !! 
@param lag_num_ts Number of stages in time-stepping scheme + subroutine s_initialize_particles_mpi(lag_num_ts) + + integer :: i, j, k + integer :: real_size, int_size, nReal, lag_num_ts + integer :: ierr !< Generic flag used to identify and report MPI errors + +#ifdef MFC_MPI + call MPI_Pack_size(1, mpi_p, MPI_COMM_WORLD, real_size, ierr) + call MPI_Pack_size(1, MPI_INTEGER, MPI_COMM_WORLD, int_size, ierr) + nReal = 7 + 16*2 + 10*lag_num_ts + p_var_size = nReal*real_size + int_size + p_buff_size = lag_params%nBubs_glb*p_var_size + @:ALLOCATE(p_send_buff(0:p_buff_size), p_recv_buff(0:p_buff_size)) + @:ALLOCATE(p_send_ids(nidx(1)%beg:nidx(1)%end, nidx(2)%beg:nidx(2)%end, nidx(3)%beg:nidx(3)%end, 0:lag_params%nBubs_glb)) + ! First, collect all neighbor information + n_neighbors = 0 + do k = nidx(3)%beg, nidx(3)%end + do j = nidx(2)%beg, nidx(2)%end + do i = nidx(1)%beg, nidx(1)%end + if (abs(i) + abs(j) + abs(k) /= 0) then + n_neighbors = n_neighbors + 1 + neighbor_list(n_neighbors, 1) = i + neighbor_list(n_neighbors, 2) = j + neighbor_list(n_neighbors, 3) = k + end if + end do + end do + end do +#endif + + end subroutine s_initialize_particles_mpi + + !! This subroutine initializes the MPI buffers and variables + !! required for the particle communication. + !! 
@param lag_num_ts Number of stages in time-stepping scheme + subroutine s_initialize_solid_particles_mpi(lag_num_ts) + + integer :: i, j, k + integer :: real_size, int_size, nReal, lag_num_ts + integer :: ierr !< Generic flag used to identify and report MPI errors + +#ifdef MFC_MPI + call MPI_Pack_size(1, mpi_p, MPI_COMM_WORLD, real_size, ierr) + call MPI_Pack_size(1, MPI_INTEGER, MPI_COMM_WORLD, int_size, ierr) + nReal = 7 + 13*2 + 7*lag_num_ts + p_var_size = (nReal*real_size + int_size) + p_buff_size = lag_params%nParticles_glb*p_var_size + @:ALLOCATE(p_send_buff(0:p_buff_size), p_recv_buff(0:p_buff_size)) + @:ALLOCATE(p_send_ids(nidx(1)%beg:nidx(1)%end, nidx(2)%beg:nidx(2)%end, nidx(3)%beg:nidx(3)%end, 0:lag_params%nParticles_glb)) + ! First, collect all neighbor information + n_neighbors = 0 + do k = nidx(3)%beg, nidx(3)%end + do j = nidx(2)%beg, nidx(2)%end + do i = nidx(1)%beg, nidx(1)%end + if (abs(i) + abs(j) + abs(k) /= 0) then + n_neighbors = n_neighbors + 1 + neighbor_list(n_neighbors, 1) = i + neighbor_list(n_neighbors, 2) = j + neighbor_list(n_neighbors, 3) = k + end if + end do + end do + end do + @:ALLOCATE(force_send_counts(0:num_procs-1)) + @:ALLOCATE(force_recv_counts(0:num_procs-1)) + @:ALLOCATE(force_send_ids(0:num_procs-1, 1:lag_params%nParticles_glb)) + @:ALLOCATE(force_send_vals(0:num_procs-1, 1:lag_params%nParticles_glb, 1:3)) + @:ALLOCATE(flat_send_ids(1:lag_params%nParticles_glb)) + @:ALLOCATE(flat_send_vals(1:3*lag_params%nParticles_glb)) +#endif + + end subroutine s_initialize_solid_particles_mpi + !> Since only the processor with rank 0 reads and verifies !! the consistency of user inputs, these are initially not !! available to the other processors. 
Then, the purpose of @@ -113,8 +207,8 @@ contains & 'bc_z%grcbc_in', 'bc_z%grcbc_out', 'bc_z%grcbc_vel_out', & & 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', 'surface_tension', & & 'shear_stress', 'bulk_stress', 'bubbles_lagrange', & - & 'hyperelasticity', 'down_sample', 'int_comp','fft_wrt', & - & 'hyper_cleaning' ] + & 'hyperelasticity', 'down_sample', 'int_comp','fft_wrt', & + & 'hyper_cleaning', 'particles_lagrange' ] call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #:endfor @@ -130,17 +224,40 @@ contains if (bubbles_lagrange) then #:for VAR in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector', & - & 'write_bubbles', 'write_bubbles_stats'] + & 'write_bubbles', 'write_bubbles_stats', 'write_void_evol', 'pressure_force', & + & 'gravity_force'] call MPI_BCAST(lag_params%${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #:endfor - #:for VAR in ['solver_approach', 'cluster_type', 'smooth_type', 'nBubs_glb'] + #:for VAR in ['solver_approach', 'cluster_type', 'smooth_type', 'nBubs_glb', 'vel_model', & + & 'drag_model', 'charNz'] call MPI_BCAST(lag_params%${VAR}$, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) #:endfor #:for VAR in ['epsilonb','charwidth','valmaxvoid'] call MPI_BCAST(lag_params%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) #:endfor + + call MPI_BCAST(lag_params%input_path, len(lag_params%input_path), MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) + end if + + if (particles_lagrange) then + #:for VAR in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector', & + & 'write_bubbles', 'write_bubbles_stats', 'write_void_evol', 'pressure_force', & + & 'gravity_force', 'collision_force'] + call MPI_BCAST(lag_params%${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + #:endfor + + #:for VAR in ['solver_approach', 'cluster_type', 'smooth_type', 'nParticles_glb', 'vel_model', & + & 'drag_model', 'qs_drag_model', 'stokes_drag', 'added_mass_model', 'interpolation_order'] + call MPI_BCAST(lag_params%${VAR}$, 1, MPI_INTEGER, 0, 
MPI_COMM_WORLD, ierr) + #:endfor + + #:for VAR in ['epsilonb','charwidth','valmaxvoid'] + call MPI_BCAST(lag_params%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + #:endfor + + call MPI_BCAST(lag_params%input_path, len(lag_params%input_path), MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr) end if #:for VAR in [ 'dt','weno_eps','teno_CT','pref','rhoref','R0ref','Web','Ca', 'sigma', & @@ -149,9 +266,7 @@ contains & 'bc_y%vb1','bc_y%vb2','bc_y%vb3','bc_y%ve1','bc_y%ve2','bc_y%ve3', & & 'bc_z%vb1','bc_z%vb2','bc_z%vb3','bc_z%ve1','bc_z%ve2','bc_z%ve3', & & 'bc_x%pres_in','bc_x%pres_out','bc_y%pres_in','bc_y%pres_out', 'bc_z%pres_in','bc_z%pres_out', & - & 'x_domain%beg', 'x_domain%end', 'y_domain%beg', 'y_domain%end', & - & 'z_domain%beg', 'z_domain%end', 'x_a', 'x_b', 'y_a', 'y_b', 'z_a', & - & 'z_b', 't_stop', 't_save', 'cfl_target', 'Bx0', 'alf_factor', & + & 't_stop', 't_save', 'cfl_target', 'Bx0', 'alf_factor', & & 'tau_star', 'cont_damage_s', 'alpha_bar', 'adap_dt_tol', & & 'ic_eps', 'ic_beta', 'hyper_cleaning_speed', & & 'hyper_cleaning_tau' ] @@ -200,6 +315,12 @@ contains #:endfor end if + if (particles_lagrange) then + #:for VAR in [ 'rho0ref_particle','cp_particle'] + call MPI_BCAST(particle_pp%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) + #:endfor + end if + do i = 1, num_fluids_max #:for VAR in ['bc_x%alpha_rho_in','bc_x%alpha_in','bc_y%alpha_rho_in','bc_y%alpha_in','bc_z%alpha_rho_in','bc_z%alpha_in'] call MPI_BCAST(${VAR}$ (i), 1, mpi_p, 0, MPI_COMM_WORLD, ierr) @@ -254,10 +375,1052 @@ contains call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(nv_uvm_pref_gpu, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + ! Extra BC Variable + call MPI_BCAST(periodic_bc, 3, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + #endif end subroutine s_mpi_bcast_user_inputs + !> @brief Packs, exchanges, and unpacks immersed boundary marker buffers between neighboring MPI ranks. 
+ subroutine s_mpi_sendrecv_ib_buffers(ib_markers, mpi_dir, pbc_loc) + + type(integer_field), intent(inout) :: ib_markers + + integer, intent(in) :: mpi_dir, pbc_loc + + integer :: i, j, k, l, r, q !< Generic loop iterators + + integer :: buffer_counts(1:3), buffer_count + + type(int_bounds_info) :: boundary_conditions(1:3) + integer :: beg_end(1:2), grid_dims(1:3) + integer :: dst_proc, src_proc, recv_tag, send_tag + + logical :: beg_end_geq_0, qbmm_comm + + integer :: pack_offset, unpack_offset + +#ifdef MFC_MPI + integer :: ierr !< Generic flag used to identify and report MPI errors + + call nvtxStartRange("IB-MARKER-COMM-PACKBUF") + + buffer_counts = (/ & + buff_size*(n + 1)*(p + 1), & + buff_size*(m + 2*buff_size + 1)*(p + 1), & + buff_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) & + /) + + buffer_count = buffer_counts(mpi_dir) + boundary_conditions = (/bc_x, bc_y, bc_z/) + beg_end = (/boundary_conditions(mpi_dir)%beg, boundary_conditions(mpi_dir)%end/) + beg_end_geq_0 = beg_end(max(pbc_loc, 0) - pbc_loc + 1) >= 0 + + ! Implements: + ! pbc_loc bc_x >= 0 -> [send/recv]_tag [dst/src]_proc + ! -1 (=0) 0 -> [1,0] [0,0] | 0 0 [1,0] [beg,beg] + ! -1 (=0) 1 -> [0,0] [1,0] | 0 1 [0,0] [end,beg] + ! +1 (=1) 0 -> [0,1] [1,1] | 1 0 [0,1] [end,end] + ! +1 (=1) 1 -> [1,1] [0,1] | 1 1 [1,1] [beg,end] + + send_tag = f_logical_to_int(.not. f_xor(beg_end_geq_0, pbc_loc == 1)) + recv_tag = f_logical_to_int(pbc_loc == 1) + + dst_proc = beg_end(1 + f_logical_to_int(f_xor(pbc_loc == 1, beg_end_geq_0))) + src_proc = beg_end(1 + f_logical_to_int(pbc_loc == 1)) + + grid_dims = (/m, n, p/) + + pack_offset = 0 + if (f_xor(pbc_loc == 1, beg_end_geq_0)) then + pack_offset = grid_dims(mpi_dir) - buff_size + 1 + end if + + unpack_offset = 0 + if (pbc_loc == 1) then + unpack_offset = grid_dims(mpi_dir) + buff_size + 1 + end if + + ! 
Pack Buffer to Send + #:for mpi_dir in [1, 2, 3] + if (mpi_dir == ${mpi_dir}$) then + #:if mpi_dir == 1 + $:GPU_PARALLEL_LOOP(collapse=3,private='[j,k,l,r]') + do l = 0, p + do k = 0, n + do j = 0, buff_size - 1 + r = (j + buff_size*(k + (n + 1)*l)) + ib_buff_send(r) = ib_markers%sf(j + pack_offset, k, l) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:elif mpi_dir == 2 + $:GPU_PARALLEL_LOOP(collapse=3,private='[j,k,l,r]') + do l = 0, p + do k = 0, buff_size - 1 + do j = -buff_size, m + buff_size + r = ((j + buff_size) + (m + 2*buff_size + 1)* & + (k + buff_size*l)) + ib_buff_send(r) = ib_markers%sf(j, k + pack_offset, l) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:else + $:GPU_PARALLEL_LOOP(collapse=3,private='[j,k,l,r]') + do l = 0, buff_size - 1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)*l)) + ib_buff_send(r) = ib_markers%sf(j, k, l + pack_offset) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:endif + end if + #:endfor + call nvtxEndRange ! Packbuf + + #:for rdma_mpi in [False, True] + if (rdma_mpi .eqv. ${'.true.' 
if rdma_mpi else '.false.'}$) then + #:if rdma_mpi + #:call GPU_HOST_DATA(use_device_addr='[ib_buff_send, ib_buff_recv]') + + call nvtxStartRange("IB-MARKER-SENDRECV-RDMA") + call MPI_SENDRECV( & + ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, & + ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + call nvtxEndRange + + #:endcall GPU_HOST_DATA + $:GPU_WAIT() + #:else + call nvtxStartRange("IB-MARKER-DEV2HOST") + $:GPU_UPDATE(host='[ib_buff_send]') + call nvtxEndRange + + call nvtxStartRange("IB-MARKER-SENDRECV-NO-RMDA") + call MPI_SENDRECV( & + ib_buff_send, buffer_count, MPI_INTEGER, dst_proc, send_tag, & + ib_buff_recv, buffer_count, MPI_INTEGER, src_proc, recv_tag, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + call nvtxEndRange + + call nvtxStartRange("IB-MARKER-HOST2DEV") + $:GPU_UPDATE(device='[ib_buff_recv]') + call nvtxEndRange + #:endif + end if + #:endfor + + ! Unpack Received Buffer + call nvtxStartRange("IB-MARKER-COMM-UNPACKBUF") + #:for mpi_dir in [1, 2, 3] + if (mpi_dir == ${mpi_dir}$) then + #:if mpi_dir == 1 + $:GPU_PARALLEL_LOOP(collapse=3,private='[j,k,l,r]') + do l = 0, p + do k = 0, n + do j = -buff_size, -1 + r = (j + buff_size*((k + 1) + (n + 1)*l)) + ib_markers%sf(j + unpack_offset, k, l) = ib_buff_recv(r) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:elif mpi_dir == 2 + $:GPU_PARALLEL_LOOP(collapse=3,private='[j,k,l,r]') + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + r = ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + buff_size*l)) + ib_markers%sf(j, k + unpack_offset, l) = ib_buff_recv(r) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:else + ! 
Unpacking buffer from bc_z%beg + $:GPU_PARALLEL_LOOP(collapse=3,private='[j,k,l,r]') + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)* & + (l + buff_size))) + ib_markers%sf(j, k, l + unpack_offset) = ib_buff_recv(r) + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + #:endif + end if + #:endfor + call nvtxEndRange +#endif + + end subroutine s_mpi_sendrecv_ib_buffers + + !> This subroutine adds particles to the transfer list for the MPI + !! communication. + !! @param nBub Current LOCAL number of bubbles + !! @param pos Current position of each bubble + !! @param posPrev Previous position of each bubble (optional, not used + !! for communication of initial condition) + impure subroutine s_add_particles_to_transfer_list(nBub, pos, posPrev, include_ghost) + + real(wp), dimension(:, :) :: pos, posPrev + integer :: bubID, nbub + integer :: i, j, k + logical, optional, intent(in) :: include_ghost + + do k = nidx(3)%beg, nidx(3)%end + do j = nidx(2)%beg, nidx(2)%end + do i = nidx(1)%beg, nidx(1)%end + p_send_counts(i, j, k) = 0 + end do + end do + end do + + do k = 1, nbub + if (f_crosses_boundary(k, 1, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, 0, 0) + if (n > 0) then + if (f_crosses_boundary(k, 2, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, -1, 0) + call s_add_particle_to_direction(k, 0, -1, 0) + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, -1, -1) + call s_add_particle_to_direction(k, 0, -1, -1) + call s_add_particle_to_direction(k, -1, 0, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + elseif (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, -1, 1) + call s_add_particle_to_direction(k, 0, -1, 1) + call 
s_add_particle_to_direction(k, -1, 0, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + elseif (f_crosses_boundary(k, 2, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, 1, 0) + call s_add_particle_to_direction(k, 0, 1, 0) + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, 1, -1) + call s_add_particle_to_direction(k, 0, 1, -1) + call s_add_particle_to_direction(k, -1, 0, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + elseif (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, 1, 1) + call s_add_particle_to_direction(k, 0, 1, 1) + call s_add_particle_to_direction(k, -1, 0, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + else + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, 0, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + elseif (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, -1, 0, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + end if + end if + elseif (f_crosses_boundary(k, 1, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, 0, 0) + if (n > 0) then + if (f_crosses_boundary(k, 2, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, -1, 0) + call s_add_particle_to_direction(k, 0, -1, 0) + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, -1, -1) + call s_add_particle_to_direction(k, 0, -1, -1) + call s_add_particle_to_direction(k, 1, 0, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + elseif (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, -1, 1) + call s_add_particle_to_direction(k, 0, -1, 1) + call 
s_add_particle_to_direction(k, 1, 0, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + elseif (f_crosses_boundary(k, 2, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, 1, 0) + call s_add_particle_to_direction(k, 0, 1, 0) + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, 1, -1) + call s_add_particle_to_direction(k, 0, 1, -1) + call s_add_particle_to_direction(k, 1, 0, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + elseif (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, 1, 1) + call s_add_particle_to_direction(k, 0, 1, 1) + call s_add_particle_to_direction(k, 1, 0, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + else + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, 0, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + elseif (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 1, 0, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + end if + end if + elseif (f_crosses_boundary(k, 2, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, -1, 0) + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, -1, -1) + call s_add_particle_to_direction(k, 0, 0, -1) + elseif (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, -1, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + elseif (f_crosses_boundary(k, 2, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, 1, 0) + if (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, 1, -1) + call 
s_add_particle_to_direction(k, 0, 0, -1) + elseif (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, 1, 1) + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + elseif (p > 0) then + if (f_crosses_boundary(k, 3, -1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, 0, -1) + elseif (f_crosses_boundary(k, 3, 1, pos, posPrev, include_ghost)) then + call s_add_particle_to_direction(k, 0, 0, 1) + end if + end if + + end do + + contains !f_crosses_boundary(k, 2, -1, pos, posPrev, include_ghost + + logical function f_crosses_boundary(particle_id, dir, loc, pos, posPrev, include_ghost) + + integer, intent(in) :: particle_id, dir, loc + real(wp), dimension(:, :), intent(in) :: pos + real(wp), dimension(:, :), optional, intent(in) :: posPrev + logical, optional, intent(in) :: include_ghost + + if (present(include_ghost) .and. include_ghost) then + + if (loc == -1) then ! Beginning of the domain + if (nidx(dir)%beg == 0) then + f_crosses_boundary = .false. + return + end if + + f_crosses_boundary = pos(particle_id, dir) < pcomm_coords_ghost(dir)%beg + elseif (loc == 1) then ! End of the domain + if (nidx(dir)%end == 0) then + f_crosses_boundary = .false. + return + end if + + f_crosses_boundary = pos(particle_id, dir) > pcomm_coords_ghost(dir)%end + end if + + else + + if (loc == -1) then ! Beginning of the domain + if (nidx(dir)%beg == 0) then + f_crosses_boundary = .false. + return + end if + + f_crosses_boundary = (posPrev(particle_id, dir) >= pcomm_coords(dir)%beg .and. & + pos(particle_id, dir) < pcomm_coords(dir)%beg) + elseif (loc == 1) then ! End of the domain + if (nidx(dir)%end == 0) then + f_crosses_boundary = .false. + return + end if + + f_crosses_boundary = (posPrev(particle_id, dir) <= pcomm_coords(dir)%end .and. 
& + pos(particle_id, dir) > pcomm_coords(dir)%end) + end if + + end if + + end function f_crosses_boundary + + subroutine s_add_particle_to_direction(particle_id, dir_x, dir_y, dir_z) + + integer, intent(in) :: particle_id, dir_x, dir_y, dir_z + + p_send_ids(dir_x, dir_y, dir_z, p_send_counts(dir_x, dir_y, dir_z)) = particle_id + p_send_counts(dir_x, dir_y, dir_z) = p_send_counts(dir_x, dir_y, dir_z) + 1 + + end subroutine s_add_particle_to_direction + + end subroutine s_add_particles_to_transfer_list + + !> This subroutine performs the MPI communication for lagrangian particles/ + !! bubbles. + !! @param bub_R0 Initial radius of each bubble + !! @param Rmax_stats Maximum radius of each bubble + !! @param Rmin_stats Minimum radius of each bubble + !! @param gas_mg Mass of gas in each bubble + !! @param gas_betaT Heat flux model coefficient for each bubble + !! @param gas_betaC mass flux model coefficient for each bubble + !! @param bub_dphidt Subgrid velocity potential for each bubble + !! @param lag_id Global and local ID of each bubble + !! @param gas_p Pressure of the gas in each bubble + !! @param gas_mv Mass of vapor in each bubble + !! @param rad Radius of each bubble + !! @param rvel Radial velocity of each bubble + !! @param pos Position of each bubble + !! @param posPrev Previous position of each bubble + !! @param vel Velocity of each bubble + !! @param scoord Cell index in real format of each bubble + !! @param drad Radial velocity of each bubble + !! @param drvel Radial acceleration of each bubble + !! @param dgasp Time derivative of gas pressure in each bubble + !! @param dgasmv Time derivative of vapor mass in each bubble + !! @param dpos Time derivative of position of each bubble + !! @param dvel Time derivative of velocity of each bubble + !! @param lag_num_ts Number of stages in time-stepping scheme + !! 
@param nBubs Local number of bubbles + impure subroutine s_mpi_sendrecv_particles(bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, & + gas_betaC, bub_dphidt, lag_id, gas_p, gas_mv, rad, & + rvel, pos, posPrev, vel, scoord, drad, drvel, dgasp, & + dgasmv, dpos, dvel, lag_num_ts, nbubs, dest) + + real(wp), dimension(:) :: bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC, bub_dphidt + integer, dimension(:, :) :: lag_id + real(wp), dimension(:, :) :: gas_p, gas_mv, rad, rvel, drad, drvel, dgasp, dgasmv + real(wp), dimension(:, :, :) :: pos, posPrev, vel, scoord, dpos, dvel + integer :: position, bub_id, lag_num_ts, tag, partner, send_tag, recv_tag, nbubs, p_recv_size, dest + + integer :: i, j, k, l, q, r + integer :: req_send, req_recv, ierr !< Generic flag used to identify and report MPI errors + integer :: send_count, send_offset, recv_count, recv_offset + +#ifdef MFC_MPI + ! Phase 1: Exchange particle counts using non-blocking communication + send_count = 0 + recv_count = 0 + + ! Post all receives first + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + partner = neighbor_ranks(i, j, k) + recv_tag = neighbor_tag(i, j, k) + + recv_count = recv_count + 1 + call MPI_Irecv(p_recv_counts(i, j, k), 1, MPI_INTEGER, partner, recv_tag, & + MPI_COMM_WORLD, recv_requests(recv_count), ierr) + end do + + ! Post all sends + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + partner = neighbor_ranks(i, j, k) + send_tag = neighbor_tag(-i, -j, -k) + + send_count = send_count + 1 + call MPI_Isend(p_send_counts(i, j, k), 1, MPI_INTEGER, partner, send_tag, & + MPI_COMM_WORLD, send_requests(send_count), ierr) + end do + + ! 
Wait for all count exchanges to complete + if (recv_count > 0) then + call MPI_Waitall(recv_count, recv_requests(1:recv_count), MPI_STATUSES_IGNORE, ierr) + end if + if (send_count > 0) then + call MPI_Waitall(send_count, send_requests(1:send_count), MPI_STATUSES_IGNORE, ierr) + end if + + ! Phase 2: Exchange particle data using non-blocking communication + send_count = 0 + recv_count = 0 + + ! Post all receives for particle data first + recv_offset = 1 + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + + if (p_recv_counts(i, j, k) > 0) then + partner = neighbor_ranks(i, j, k) + p_recv_size = p_recv_counts(i, j, k)*p_var_size + recv_tag = neighbor_tag(i, j, k) + + recv_count = recv_count + 1 + call MPI_Irecv(p_recv_buff(recv_offset), p_recv_size, MPI_PACKED, partner, recv_tag, & + MPI_COMM_WORLD, recv_requests(recv_count), ierr) + recv_offsets(l) = recv_offset + recv_offset = recv_offset + p_recv_size + end if + end do + + ! Pack and send particle data + send_offset = 0 + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + + if (p_send_counts(i, j, k) > 0 .and. abs(i) + abs(j) + abs(k) /= 0 .and. abs(i) + abs(j) + abs(k) /= 0) then + partner = neighbor_ranks(i, j, k) + send_tag = neighbor_tag(-i, -j, -k) + + ! 
Pack data for sending + position = 0 + do q = 0, p_send_counts(i, j, k) - 1 + bub_id = p_send_ids(i, j, k, q) + + call MPI_Pack(lag_id(bub_id, 1), 1, MPI_INTEGER, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(bub_R0(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(Rmax_stats(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(Rmin_stats(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(gas_mg(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(gas_betaT(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(gas_betaC(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(bub_dphidt(bub_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + do r = 1, 2 + call MPI_Pack(gas_p(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(gas_mv(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(rad(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(rvel(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(pos(bub_id, :, r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(posPrev(bub_id, :, r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(vel(bub_id, :, r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(scoord(bub_id, :, r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + end do + do 
r = 1, lag_num_ts + call MPI_Pack(drad(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(drvel(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(dgasp(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(dgasmv(bub_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(dpos(bub_id, :, r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(dvel(bub_id, :, r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + end do + end do + + send_count = send_count + 1 + call MPI_Isend(p_send_buff(send_offset), position, MPI_PACKED, partner, send_tag, & + MPI_COMM_WORLD, send_requests(send_count), ierr) + send_offset = send_offset + position + end if + end do + + ! Wait for all recvs for contiguous data to complete + call MPI_Waitall(recv_count, recv_requests(1:recv_count), MPI_STATUSES_IGNORE, ierr) + + ! Process received data as it arrives + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + + if (p_recv_counts(i, j, k) > 0 .and. abs(i) + abs(j) + abs(k) /= 0) then + p_recv_size = p_recv_counts(i, j, k)*p_var_size + recv_offset = recv_offsets(l) + + position = 0 + ! 
Unpack received data + do q = 0, p_recv_counts(i, j, k) - 1 + nbubs = nbubs + 1 + bub_id = nbubs + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, lag_id(bub_id, 1), 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, bub_R0(bub_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, Rmax_stats(bub_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, Rmin_stats(bub_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_mg(bub_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_betaT(bub_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_betaC(bub_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, bub_dphidt(bub_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + do r = 1, 2 + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_p(bub_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_mv(bub_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, rad(bub_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, rvel(bub_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, pos(bub_id, :, r), 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, posPrev(bub_id, :, r), 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, vel(bub_id, :, r), 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, scoord(bub_id, :, r), 3, mpi_p, MPI_COMM_WORLD, 
ierr) + end do + do r = 1, lag_num_ts + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, drad(bub_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, drvel(bub_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dgasp(bub_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dgasmv(bub_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dpos(bub_id, :, r), 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dvel(bub_id, :, r), 3, mpi_p, MPI_COMM_WORLD, ierr) + end do + lag_id(bub_id, 2) = bub_id + end do + recv_offset = recv_offset + p_recv_size + end if + + end do + + ! Wait for all sends to complete + if (send_count > 0) then + call MPI_Waitall(send_count, send_requests(1:send_count), MPI_STATUSES_IGNORE, ierr) + end if +#endif + + if (any(periodic_bc)) then + call s_wrap_particle_positions(pos, posPrev, nbubs, dest) + end if + + end subroutine s_mpi_sendrecv_particles + + !> This subroutine performs the MPI communication for lagrangian particles/ + !! particles. + !! @param particle_R0 Initial radius of each particle + !! @param Rmax_stats Maximum radius of each particle + !! @param Rmin_stats Minimum radius of each particle + !! @param particle_mass Mass of gas in each particle + !! @param f_p Force on each particle + !! @param gas_betaT Heat flux model coefficient for each particle + !! @param gas_betaC mass flux model coefficient for each particle + !! @param lag_id Global and local ID of each particle + !! @param rad Radius of each particle + !! @param pos Position of each particle + !! @param posPrev Previous position of each particle + !! @param vel Velocity of each particle + !! @param scoord Cell index in real format of each particle + !! 
@param drad Time derivative of particle's radius + !! @param dpos Time derivative of position of each particle + !! @param dvel Time derivative of velocity of each particle + !! @param lag_num_ts Number of stages in time-stepping scheme + !! @param nParticles Local number of particles + impure subroutine s_mpi_sendrecv_solid_particles(p_owner_rank, particle_R0, Rmax_stats, Rmin_stats, particle_mass, f_p, & + lag_id, rad, pos, & + posPrev, vel, scoord, drad, dpos, & + dvel, lag_num_ts, nParticles, dest) + + integer, dimension(:) :: p_owner_rank + real(wp), dimension(:) :: particle_R0, Rmax_stats, Rmin_stats, particle_mass + real(wp), dimension(:, :) :: f_p + integer, dimension(:, :) :: lag_id + real(wp), dimension(:, :) :: rad, drad + real(wp), dimension(:, :, :) :: pos, posPrev, vel, scoord, dpos, dvel + integer :: position, particle_id, lag_num_ts, tag, partner, send_tag, recv_tag, nParticles, p_recv_size, dest + + integer :: i, j, k, l, q, r + integer :: req_send, req_recv, ierr !< Generic flag used to identify and report MPI errors + integer :: send_count, send_offset, recv_count, recv_offset + +#ifdef MFC_MPI + ! Phase 1: Exchange particle counts using non-blocking communication + send_count = 0 + recv_count = 0 + + ! Post all receives first + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + partner = neighbor_ranks(i, j, k) + recv_tag = neighbor_tag(i, j, k) + + recv_count = recv_count + 1 + call MPI_Irecv(p_recv_counts(i, j, k), 1, MPI_INTEGER, partner, recv_tag, & + MPI_COMM_WORLD, recv_requests(recv_count), ierr) + end do + + ! Post all sends + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + partner = neighbor_ranks(i, j, k) + send_tag = neighbor_tag(-i, -j, -k) + + send_count = send_count + 1 + call MPI_Isend(p_send_counts(i, j, k), 1, MPI_INTEGER, partner, send_tag, & + MPI_COMM_WORLD, send_requests(send_count), ierr) + end do + + ! 
Wait for all count exchanges to complete + if (recv_count > 0) then + call MPI_Waitall(recv_count, recv_requests(1:recv_count), MPI_STATUSES_IGNORE, ierr) + end if + if (send_count > 0) then + call MPI_Waitall(send_count, send_requests(1:send_count), MPI_STATUSES_IGNORE, ierr) + end if + + ! Phase 2: Exchange particle data using non-blocking communication + send_count = 0 + recv_count = 0 + + ! Post all receives for particle data first + recv_offset = 1 + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + + if (p_recv_counts(i, j, k) > 0) then + partner = neighbor_ranks(i, j, k) + p_recv_size = p_recv_counts(i, j, k)*p_var_size + recv_tag = neighbor_tag(i, j, k) + + recv_count = recv_count + 1 + call MPI_Irecv(p_recv_buff(recv_offset), p_recv_size, MPI_PACKED, partner, recv_tag, & + MPI_COMM_WORLD, recv_requests(recv_count), ierr) + recv_offsets(l) = recv_offset + recv_offset = recv_offset + p_recv_size + end if + end do + + ! Pack and send particle data + send_offset = 0 + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + + if (p_send_counts(i, j, k) > 0 .and. abs(i) + abs(j) + abs(k) /= 0 .and. abs(i) + abs(j) + abs(k) /= 0) then + partner = neighbor_ranks(i, j, k) + send_tag = neighbor_tag(-i, -j, -k) + + ! 
Pack data for sending + position = 0 + do q = 0, p_send_counts(i, j, k) - 1 + particle_id = p_send_ids(i, j, k, q) + + call MPI_Pack(lag_id(particle_id, 1), 1, MPI_INTEGER, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(particle_R0(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(Rmax_stats(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(Rmin_stats(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(particle_mass(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(f_p(particle_id, :), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + ! call MPI_Pack(gas_betaT(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + ! call MPI_Pack(gas_betaC(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + ! call MPI_Pack(bub_dphidt(particle_id), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + do r = 1, 2 + ! call MPI_Pack(gas_p(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + ! call MPI_Pack(gas_mv(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(rad(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + ! 
call MPI_Pack(rvel(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(pos(particle_id, :, r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(posPrev(particle_id, :, r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(vel(particle_id, :, r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(scoord(particle_id, :, r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + end do + do r = 1, lag_num_ts + call MPI_Pack(drad(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + ! call MPI_Pack(drvel(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + ! call MPI_Pack(dgasp(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + ! call MPI_Pack(dgasmv(particle_id, r), 1, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(dpos(particle_id, :, r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + call MPI_Pack(dvel(particle_id, :, r), 3, mpi_p, p_send_buff(send_offset), p_buff_size, position, MPI_COMM_WORLD, ierr) + end do + end do + + send_count = send_count + 1 + call MPI_Isend(p_send_buff(send_offset), position, MPI_PACKED, partner, send_tag, & + MPI_COMM_WORLD, send_requests(send_count), ierr) + send_offset = send_offset + position + end if + end do + + ! Wait for all recvs for contiguous data to complete + call MPI_Waitall(recv_count, recv_requests(1:recv_count), MPI_STATUSES_IGNORE, ierr) + + ! Process received data as it arrives + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + + if (p_recv_counts(i, j, k) > 0 .and. 
abs(i) + abs(j) + abs(k) /= 0) then + p_recv_size = p_recv_counts(i, j, k)*p_var_size + recv_offset = recv_offsets(l) + + position = 0 + ! Unpack received data + do q = 0, p_recv_counts(i, j, k) - 1 + nParticles = nParticles + 1 + particle_id = nParticles + + p_owner_rank(particle_id) = neighbor_ranks(i, j, k) + + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, lag_id(particle_id, 1), 1, MPI_INTEGER, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, particle_R0(particle_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, Rmax_stats(particle_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, Rmin_stats(particle_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, particle_mass(particle_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, f_p(particle_id, :), 3, mpi_p, MPI_COMM_WORLD, ierr) + ! call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_betaT(particle_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + ! call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_betaC(particle_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + ! call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, bub_dphidt(particle_id), 1, mpi_p, MPI_COMM_WORLD, ierr) + do r = 1, 2 + ! call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_p(particle_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + ! call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, gas_mv(particle_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, rad(particle_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + ! 
call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, rvel(particle_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, pos(particle_id, :, r), 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, posPrev(particle_id, :, r), 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, vel(particle_id, :, r), 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, scoord(particle_id, :, r), 3, mpi_p, MPI_COMM_WORLD, ierr) + end do + do r = 1, lag_num_ts + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, drad(particle_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + ! call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, drvel(particle_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + ! call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dgasp(particle_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + ! call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dgasmv(particle_id, r), 1, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dpos(particle_id, :, r), 3, mpi_p, MPI_COMM_WORLD, ierr) + call MPI_Unpack(p_recv_buff(recv_offset), p_recv_size, position, dvel(particle_id, :, r), 3, mpi_p, MPI_COMM_WORLD, ierr) + end do + lag_id(particle_id, 2) = particle_id + end do + recv_offset = recv_offset + p_recv_size + end if + + end do + + ! 
Wait for all sends to complete + if (send_count > 0) then + call MPI_Waitall(send_count, send_requests(1:send_count), MPI_STATUSES_IGNORE, ierr) + end if +#endif + + if (any(periodic_bc)) then + call s_wrap_particle_positions(pos, posPrev, nParticles, dest) + end if + + end subroutine s_mpi_sendrecv_solid_particles + + !> This resets the collision force buffers + impure subroutine s_reset_force_buffers() + + force_send_counts = 0 + force_recv_counts = 0 + force_send_ids = 0 + force_send_vals = 0._wp + + $:GPU_UPDATE(device='[force_send_counts, force_send_ids, force_send_vals]') + + end subroutine + + !> This adds the forces to the buffer arrays for mpi transfer + impure subroutine s_add_force_to_send_buffer(dest_rank, gid, force) + $:GPU_ROUTINE(function_name='s_add_force_to_send_buffer', parallelism='[seq]') + + integer, intent(in) :: dest_rank, gid + real(wp), intent(in), dimension(3) :: force + integer :: idx + + $:GPU_ATOMIC(atomic='capture') + force_send_counts(dest_rank) = force_send_counts(dest_rank) + 1 + idx = force_send_counts(dest_rank) + $:END_GPU_ATOMIC_CAPTURE() + + force_send_ids(dest_rank, idx) = gid + force_send_vals(dest_rank, idx, 1) = force(1) + force_send_vals(dest_rank, idx, 2) = force(2) + force_send_vals(dest_rank, idx, 3) = force(3) + + end subroutine + + !> This communicates the collision forces across neighbor mpi ranks + impure subroutine s_transfer_collision_forces(total_recv, force_recv_ids, force_recv_vals) + + integer, intent(inout) :: total_recv + integer, intent(inout) :: force_recv_ids(:) + real(wp), intent(inout) :: force_recv_vals(:) + integer :: ierr !< Generic flag used to identify and report MPI errors + integer :: i, j, k, l, idx, total_send, recv_tag, send_tag, partner, recv_count, send_count + integer :: send_displs(0:num_procs - 1), recv_displs(0:num_procs - 1) + integer :: sendcounts_vals(0:num_procs - 1), recvcounts_vals(0:num_procs - 1) + integer :: senddispls_vals(0:num_procs - 1), recvdispls_vals(0:num_procs - 1) + + 
$:GPU_UPDATE(host='[force_send_counts, force_send_ids, force_send_vals]') + + ! Phase 1: Exchange force counts with neighbors only + send_count = 0 + recv_count = 0 + + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + partner = neighbor_ranks(i, j, k) + recv_tag = neighbor_tag(i, j, k) + send_tag = neighbor_tag(-i, -j, -k) + + recv_count = recv_count + 1 + call MPI_Irecv(force_recv_counts(partner), 1, MPI_INTEGER, partner, recv_tag, & + MPI_COMM_WORLD, recv_requests(recv_count), ierr) + + send_count = send_count + 1 + call MPI_Isend(force_send_counts(partner), 1, MPI_INTEGER, partner, send_tag, & + MPI_COMM_WORLD, send_requests(send_count), ierr) + end do + + call MPI_Waitall(recv_count, recv_requests(1:recv_count), MPI_STATUSES_IGNORE, ierr) + call MPI_Waitall(send_count, send_requests(1:send_count), MPI_STATUSES_IGNORE, ierr) + + ! Compute displacements + send_displs(0) = 0 + recv_displs(0) = 0 + do i = 1, num_procs - 1 + send_displs(i) = send_displs(i - 1) + force_send_counts(i - 1) + recv_displs(i) = recv_displs(i - 1) + force_recv_counts(i - 1) + end do + + do i = 0, num_procs - 1 + sendcounts_vals(i) = 3*force_send_counts(i) + recvcounts_vals(i) = 3*force_recv_counts(i) + senddispls_vals(i) = 3*send_displs(i) + recvdispls_vals(i) = 3*recv_displs(i) + end do + + total_send = sum(force_send_counts) + total_recv = sum(force_recv_counts) + + ! Flatten send buffers + idx = 1 + do i = 0, num_procs - 1 + do j = 1, force_send_counts(i) + flat_send_ids(idx) = force_send_ids(i, j) + flat_send_vals(3*(idx - 1) + 1) = force_send_vals(i, j, 1) + flat_send_vals(3*(idx - 1) + 2) = force_send_vals(i, j, 2) + flat_send_vals(3*(idx - 1) + 3) = force_send_vals(i, j, 3) + idx = idx + 1 + end do + end do + + ! 
Phase 2: Exchange force data with neighbors only + send_count = 0 + recv_count = 0 + + do l = 1, n_neighbors + i = neighbor_list(l, 1) + j = neighbor_list(l, 2) + k = neighbor_list(l, 3) + partner = neighbor_ranks(i, j, k) + recv_tag = neighbor_tag(i, j, k) + send_tag = neighbor_tag(-i, -j, -k) + + if (force_recv_counts(partner) > 0) then + recv_count = recv_count + 1 + call MPI_Irecv(force_recv_ids(recv_displs(partner) + 1), force_recv_counts(partner), MPI_INTEGER, & + partner, recv_tag, MPI_COMM_WORLD, recv_requests(recv_count), ierr) + recv_count = recv_count + 1 + call MPI_Irecv(force_recv_vals(recvdispls_vals(partner) + 1), recvcounts_vals(partner), mpi_p, & + partner, recv_tag + 1, MPI_COMM_WORLD, recv_requests(recv_count), ierr) + end if + + if (force_send_counts(partner) > 0) then + send_count = send_count + 1 + call MPI_Isend(flat_send_ids(send_displs(partner) + 1), force_send_counts(partner), MPI_INTEGER, & + partner, send_tag, MPI_COMM_WORLD, send_requests(send_count), ierr) + send_count = send_count + 1 + call MPI_Isend(flat_send_vals(senddispls_vals(partner) + 1), sendcounts_vals(partner), mpi_p, & + partner, send_tag + 1, MPI_COMM_WORLD, send_requests(send_count), ierr) + end if + end do + + call MPI_Waitall(recv_count, recv_requests(1:recv_count), MPI_STATUSES_IGNORE, ierr) + call MPI_Waitall(send_count, send_requests(1:send_count), MPI_STATUSES_IGNORE, ierr) + + end subroutine s_transfer_collision_forces + + !! This function returns a unique tag for each neighbor based on its position + !! relative to the current process. + !! @param i, j, k Indices of the neighbor in the range [-1, 1] + !! 
@return tag Unique integer tag for the neighbor + integer function neighbor_tag(i, j, k) result(tag) + + integer, intent(in) :: i, j, k + + tag = (k + 1)*9 + (j + 1)*3 + (i + 1) + + end function neighbor_tag + + subroutine s_wrap_particle_positions(pos, posPrev, nbubs, dest) + + real(wp), dimension(:, :, :) :: pos, posPrev + integer :: nbubs, dest + integer :: i, q + real(wp) :: offset + + do i = 1, nbubs + if (periodic_bc(1)) then + offset = glb_bounds(1)%end - glb_bounds(1)%beg + if (pos(i, 1, dest) > x_cb(m + buff_size)) then + do q = 1, 2 + pos(i, 1, q) = pos(i, 1, q) - offset + posPrev(i, 1, q) = posPrev(i, 1, q) - offset + end do + end if + if (pos(i, 1, dest) < x_cb(-1 - buff_size)) then + do q = 1, 2 + pos(i, 1, q) = pos(i, 1, q) + offset + posPrev(i, 1, q) = posPrev(i, 1, q) + offset + end do + end if + end if + + if (periodic_bc(2)) then + offset = glb_bounds(2)%end - glb_bounds(2)%beg + if (pos(i, 2, dest) > y_cb(n + buff_size)) then + do q = 1, 2 + pos(i, 2, q) = pos(i, 2, q) - offset + posPrev(i, 2, q) = posPrev(i, 2, q) - offset + end do + end if + if (pos(i, 2, dest) < y_cb(-buff_size - 1)) then + do q = 1, 2 + pos(i, 2, q) = pos(i, 2, q) + offset + posPrev(i, 2, q) = posPrev(i, 2, q) + offset + end do + end if + end if + + if (periodic_bc(3)) then + offset = glb_bounds(3)%end - glb_bounds(3)%beg + if (pos(i, 3, dest) > z_cb(p + buff_size)) then + do q = 1, 2 + pos(i, 3, q) = pos(i, 3, q) - offset + posPrev(i, 3, q) = posPrev(i, 3, q) - offset + end do + end if + if (pos(i, 3, dest) < z_cb(-1 - buff_size)) then + do q = 1, 2 + pos(i, 3, q) = pos(i, 3, q) + offset + posPrev(i, 3, q) = posPrev(i, 3, q) + offset + end do + end if + end if + end do + + end subroutine s_wrap_particle_positions + !> @brief Broadcasts random phase numbers from rank 0 to all MPI processes. 
impure subroutine s_mpi_send_random_number(phi_rn, num_freq) integer, intent(in) :: num_freq diff --git a/src/simulation/m_particles_EL.fpp b/src/simulation/m_particles_EL.fpp new file mode 100644 index 0000000000..031f5b6375 --- /dev/null +++ b/src/simulation/m_particles_EL.fpp @@ -0,0 +1,2787 @@ +!> +!! @file m_particles_EL.fpp +!! @brief Contains module m_particles_EL + +#:include 'macros.fpp' + +!> @brief This module is used to compute the volume-averaged particle model +module m_particles_EL + + use m_global_parameters !< Definitions of the global parameters + + use m_mpi_proxy !< Message passing interface (MPI) module proxy + + use m_particles_EL_kernels !< Definitions of the kernel functions + + use m_variables_conversion !< State variables type conversion procedures + + use m_compile_specific + + use m_boundary_common + + use m_helper_basic !< Functions to compare floating point numbers + + use m_sim_helpers + + use m_helper + + use m_mpi_common + + use m_ibm + + implicit none + + real(wp), save :: next_write_time = 0._wp + + integer, allocatable, dimension(:, :) :: lag_part_id !< Global and local IDs + integer, allocatable, dimension(:) :: gid_to_local + real(wp), allocatable, dimension(:) :: particle_R0 !< Initial particle radius + real(wp), allocatable, dimension(:) :: Rmax_stats_part !< Maximum radius + real(wp), allocatable, dimension(:) :: Rmin_stats_part !< Minimum radius + $:GPU_DECLARE(create='[lag_part_id, gid_to_local, particle_R0, Rmax_stats_part, Rmin_stats_part]') + + real(wp), allocatable, dimension(:) :: particle_mass !< Particle Mass + $:GPU_DECLARE(create='[particle_mass]') + real(wp), allocatable, dimension(:) :: p_AM !< Particle Added Mass + $:GPU_DECLARE(create='[p_AM]') + + integer, allocatable, dimension(:) :: p_owner_rank !< MPI rank that owns each particle + $:GPU_DECLARE(create='[p_owner_rank]') + + integer, allocatable, dimension(:) :: linked_list !< particle cell linked list + $:GPU_DECLARE(create='[linked_list]') + + integer, allocatable, 
dimension(:, :, :) :: particle_head !< particle heads at each cell + $:GPU_DECLARE(create='[particle_head]') + + !(nPart, 1 -> actual val or 2 -> temp val) + real(wp), allocatable, dimension(:, :) :: particle_rad !< Particle radius + $:GPU_DECLARE(create='[particle_rad]') + + !(nPart, 1-> x or 2->y or 3 ->z, 1 -> actual or 2 -> temporal val) + real(wp), allocatable, dimension(:, :, :) :: particle_pos !< Particle's position + real(wp), allocatable, dimension(:, :, :) :: particle_posPrev !< Particle's previous position + real(wp), allocatable, dimension(:, :, :) :: particle_vel !< Particle's velocity + real(wp), allocatable, dimension(:, :, :) :: particle_s !< Particle's computational cell position in real format + $:GPU_DECLARE(create='[particle_pos, particle_posPrev, particle_vel, particle_s]') + !(nPart, 1-> x or 2->y or 3 ->z, time-stage) + real(wp), allocatable, dimension(:, :) :: particle_draddt !< Time derivative of particle's radius + real(wp), allocatable, dimension(:, :, :) :: particle_dposdt !< Time derivative of the particle's position + real(wp), allocatable, dimension(:, :, :) :: particle_dveldt !< Time derivative of the particle's velocity + $:GPU_DECLARE(create='[particle_draddt, particle_dposdt, particle_dveldt]') + + integer, private :: lag_num_ts !< Number of time stages in the time-stepping scheme + $:GPU_DECLARE(create='[lag_num_ts]') + + real(wp) :: Rmax_glb, Rmin_glb !< Maximum and minimum bubble size in the local domain + !< Projection of the lagrangian particles in the Eulerian framework + type(scalar_field), dimension(:), allocatable :: q_particles + type(scalar_field), dimension(:), allocatable :: kahan_comp !< Kahan compensation for q_beta accumulation + integer :: q_particles_idx !< Size of the q vector field for particle cell (q)uantities + integer, parameter :: alphaf_id = 1 + integer, parameter :: alphaupx_id = 2 !< x particle momentum index + integer, parameter :: alphaupy_id = 3 !< y particle momentum index + integer, parameter :: 
alphaupz_id = 4 !< z particle momentum index + integer, parameter :: Smx_id = 5 + integer, parameter :: Smy_id = 6 + integer, parameter :: Smz_id = 7 + integer, parameter :: SE_id = 8 + + type(scalar_field), dimension(:), allocatable :: field_vars !< For cell quantities (field gradients, etc.) + integer, parameter :: dPx_id = 1 !< Spatial pressure gradient in x, y, and z + integer, parameter :: dPy_id = 2 + integer, parameter :: dPz_id = 3 + integer, parameter :: drhox_id = 4 !< Spatial density gradient in x, y, and z + integer, parameter :: drhoy_id = 5 + integer, parameter :: drhoz_id = 6 + integer, parameter :: dufx_id = 7 !< Spatial velocity gradient in x, y, and z + integer, parameter :: dufy_id = 8 + integer, parameter :: dufz_id = 9 + integer, parameter :: dalphafx_id = 10 !< Spatial fluid volume fraction gradient in x, y, and z + integer, parameter :: dalphafy_id = 11 + integer, parameter :: dalphafz_id = 12 + integer, parameter :: dalphap_upx_id = 13 !< Spatial particle momentum gradient in x, y, and z + integer, parameter :: dalphap_upy_id = 14 + integer, parameter :: dalphap_upz_id = 15 + integer, parameter :: nField_vars = 15 + + type(scalar_field), dimension(:), allocatable :: weights_x_interp !< For precomputing weights + type(scalar_field), dimension(:), allocatable :: weights_y_interp !< For precomputing weights + type(scalar_field), dimension(:), allocatable :: weights_z_interp !< For precomputing weights + integer :: nWeights_interp + + type(scalar_field), dimension(:), allocatable :: weights_x_grad !< For precomputing weights + type(scalar_field), dimension(:), allocatable :: weights_y_grad !< For precomputing weights + type(scalar_field), dimension(:), allocatable :: weights_z_grad !< For precomputing weights + integer :: nWeights_grad + + $:GPU_DECLARE(create='[Rmax_glb,Rmin_glb,q_particles,kahan_comp,q_particles_idx,field_vars]') + $:GPU_DECLARE(create='[weights_x_interp,weights_y_interp,weights_z_interp,nWeights_interp]') + 
$:GPU_DECLARE(create='[weights_x_grad,weights_y_grad,weights_z_grad,nWeights_grad]') + + !Particle Source terms for fluid coupling + real(wp), allocatable, dimension(:, :) :: f_p !< force on each particle + $:GPU_DECLARE(create='[f_p]') + + real(wp), allocatable, dimension(:) :: gSum !< gaussian sum for each particle + $:GPU_DECLARE(create='[gSum]') + + integer, allocatable :: force_recv_ids(:) !< ids of collision forces received from other ranks + real(wp), allocatable :: force_recv_vals(:) !< collision forces received from other ranks + $:GPU_DECLARE(create='[force_recv_ids, force_recv_vals]') + + integer, parameter :: LAG_EVOL_ID = 11 ! File id for lag_bubbles_evol_*.dat + integer, parameter :: LAG_STATS_ID = 12 ! File id for stats_lag_bubbles_*.dat + integer, parameter :: LAG_VOID_ID = 13 ! File id for voidfraction.dat + + integer, allocatable, dimension(:) :: keep_bubble + integer, allocatable, dimension(:, :) :: wrap_bubble_loc, wrap_bubble_dir + $:GPU_DECLARE(create='[keep_bubble]') + $:GPU_DECLARE(create='[wrap_bubble_loc, wrap_bubble_dir]') + + integer :: error_flag !Error flag for collisions + $:GPU_DECLARE(create='[error_flag]') + + integer, parameter :: ncc = 1 !< Number of collisions cells at boundaries + real(wp) :: eps_overlap = 1.e-12 + +contains + + !> Initializes the lagrangian subgrid particle solver + !! 
@param q_cons_vf Initial conservative variables + impure subroutine s_initialize_particles_EL_module(q_cons_vf, bc_type) + + type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf + type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type + + integer :: nParticles_glb, i, j, k, nf, l, npts + + !PRIM TO CONS VARIABLES + real(wp) :: dyn_pres, pi_inf, qv, gamma, pres, T + real(wp) :: rhou, alpharhou, rho_f, alpharho + real(wp), dimension(3) :: fluid_vel + real(wp) :: rhoYks(1:num_species) + integer :: save_count + real(wp) :: qtime + + real(wp) :: myR, func_sum + real(wp), dimension(3) :: myPos, myVel, myForce + integer, dimension(3) :: cell + logical :: only_beta = .true. + + if (cfl_dt) then + save_count = n_start + qtime = n_start*t_save + else + save_count = t_step_start + qtime = t_step_start*dt + end if + + pi_inf = 0._wp + qv = 0._wp + gamma = gammas(1) + + ! Setting number of time-stages for selected time-stepping scheme + lag_num_ts = time_stepper + + ! Allocate space for the Eulerian fields needed to map the effect of the particles + if (lag_params%solver_approach == 1) then + ! 
One-way coupling + q_particles_idx = 1 !For tracking volume fraction + elseif (lag_params%solver_approach == 2) then + !Two-way coupling + q_particles_idx = 8 !For tracking volume fraction(1), x-mom(2), y-mom(3), z-mom(4), and energy(5) sources, and alpha_p u_p (x(6),y(7),z(8)) + else + call s_mpi_abort('Please check the lag_params%solver_approach input') + end if + + nWeights_interp = lag_params%interpolation_order + 1 + nWeights_grad = fd_order + 1 + + pcomm_coords(1)%beg = x_cb(-1) + pcomm_coords(1)%end = x_cb(m) + $:GPU_UPDATE(device='[pcomm_coords(1)]') + if (n > 0) then + pcomm_coords(2)%beg = y_cb(-1) + pcomm_coords(2)%end = y_cb(n) + $:GPU_UPDATE(device='[pcomm_coords(2)]') + if (p > 0) then + pcomm_coords(3)%beg = z_cb(-1) + pcomm_coords(3)%end = z_cb(p) + $:GPU_UPDATE(device='[pcomm_coords(3)]') + end if + end if + + pcomm_coords_ghost(1)%beg = x_cb(-1 + ncc) + pcomm_coords_ghost(1)%end = x_cb(m - ncc) + $:GPU_UPDATE(device='[pcomm_coords_ghost(1)]') + if (n > 0) then + pcomm_coords_ghost(2)%beg = y_cb(-1 + ncc) + pcomm_coords_ghost(2)%end = y_cb(n - ncc) + $:GPU_UPDATE(device='[pcomm_coords_ghost(2)]') + if (p > 0) then + pcomm_coords_ghost(3)%beg = z_cb(-1 + ncc) + pcomm_coords_ghost(3)%end = z_cb(p - ncc) + $:GPU_UPDATE(device='[pcomm_coords_ghost(3)]') + end if + end if + + $:GPU_UPDATE(device='[lag_num_ts, q_particles_idx]') + + @:ALLOCATE(q_particles(1:q_particles_idx)) + @:ALLOCATE(kahan_comp(1:q_particles_idx)) + do i = 1, q_particles_idx + @:ALLOCATE(q_particles(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:ACC_SETUP_SFs(q_particles(i)) + @:ALLOCATE(kahan_comp(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:ACC_SETUP_SFs(kahan_comp(i)) + end do + + @:ALLOCATE(field_vars(1:nField_vars)) + do i = 1, nField_vars + @:ALLOCATE(field_vars(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + 
idwbuff(3)%beg:idwbuff(3)%end)) + @:ACC_SETUP_SFs(field_vars(i)) + end do + + @:ALLOCATE(weights_x_interp(1:nWeights_interp)) + do i = 1, nWeights_interp + @:ALLOCATE(weights_x_interp(i)%sf(idwbuff(1)%beg:idwbuff(1)%end,1:1,1:1)) + @:ACC_SETUP_SFs(weights_x_interp(i)) + end do + + @:ALLOCATE(weights_y_interp(1:nWeights_interp)) + do i = 1, nWeights_interp + @:ALLOCATE(weights_y_interp(i)%sf(idwbuff(2)%beg:idwbuff(2)%end,1:1,1:1)) + @:ACC_SETUP_SFs(weights_y_interp(i)) + end do + + @:ALLOCATE(weights_z_interp(1:nWeights_interp)) + do i = 1, nWeights_interp + @:ALLOCATE(weights_z_interp(i)%sf(idwbuff(3)%beg:idwbuff(3)%end,1:1,1:1)) + @:ACC_SETUP_SFs(weights_z_interp(i)) + end do + + @:ALLOCATE(weights_x_grad(1:nWeights_grad)) + do i = 1, nWeights_grad + @:ALLOCATE(weights_x_grad(i)%sf(idwbuff(1)%beg:idwbuff(1)%end,1:1,1:1)) + @:ACC_SETUP_SFs(weights_x_grad(i)) + end do + + @:ALLOCATE(weights_y_grad(1:nWeights_grad)) + do i = 1, nWeights_grad + @:ALLOCATE(weights_y_grad(i)%sf(idwbuff(2)%beg:idwbuff(2)%end,1:1,1:1)) + @:ACC_SETUP_SFs(weights_y_grad(i)) + end do + + @:ALLOCATE(weights_z_grad(1:nWeights_grad)) + do i = 1, nWeights_grad + @:ALLOCATE(weights_z_grad(i)%sf(idwbuff(3)%beg:idwbuff(3)%end,1:1,1:1)) + @:ACC_SETUP_SFs(weights_z_grad(i)) + end do + + ! 
Allocating space for lagrangian variables + nParticles_glb = lag_params%nParticles_glb + + @:ALLOCATE(lag_part_id(1:nParticles_glb, 1:2)) + @:ALLOCATE(gid_to_local(1:nParticles_glb)) + @:ALLOCATE(particle_R0(1:nParticles_glb)) + @:ALLOCATE(Rmax_stats_part(1:nParticles_glb)) + @:ALLOCATE(Rmin_stats_part(1:nParticles_glb)) + @:ALLOCATE(particle_mass(1:nParticles_glb)) + @:ALLOCATE(p_AM(1:nParticles_glb)) + @:ALLOCATE(p_owner_rank(1:nParticles_glb)) + @:ALLOCATE(particle_rad(1:nParticles_glb, 1:2)) + @:ALLOCATE(particle_pos(1:nParticles_glb, 1:3, 1:2)) + @:ALLOCATE(particle_posPrev(1:nParticles_glb, 1:3, 1:2)) + @:ALLOCATE(particle_vel(1:nParticles_glb, 1:3, 1:2)) + @:ALLOCATE(particle_s(1:nParticles_glb, 1:3, 1:2)) + @:ALLOCATE(particle_draddt(1:nParticles_glb, 1:lag_num_ts)) + @:ALLOCATE(particle_dposdt(1:nParticles_glb, 1:3, 1:lag_num_ts)) + @:ALLOCATE(particle_dveldt(1:nParticles_glb, 1:3, 1:lag_num_ts)) + @:ALLOCATE(f_p(1:nParticles_glb, 1:3)) + @:ALLOCATE(gSum(1:nParticles_glb)) + + @:ALLOCATE(linked_list(1:nParticles_glb)) + + @:ALLOCATE(particle_head(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + + @:ALLOCATE(force_recv_ids(1:lag_params%nParticles_glb)) + @:ALLOCATE(force_recv_vals(1:3*lag_params%nParticles_glb)) + + @:ALLOCATE(keep_bubble(1:nParticles_glb)) + @:ALLOCATE(wrap_bubble_loc(1:nParticles_glb, 1:num_dims), wrap_bubble_dir(1:nParticles_glb, 1:num_dims)) + + if (adap_dt .and. f_is_default(adap_dt_tol)) adap_dt_tol = dflt_adap_dt_tol + + if (num_procs > 1) call s_initialize_solid_particles_mpi(lag_num_ts) + + ! Starting particles + if (lag_params%write_void_evol) call s_open_void_evol + if (lag_params%write_bubbles) call s_open_lag_bubble_evol() + if (lag_params%write_bubbles_stats) call s_open_lag_particle_stats() + + if (lag_params%vel_model > 0) then + moving_lag_particles = .true. 
+ lag_pressure_force = lag_params%pressure_force + lag_gravity_force = lag_params%gravity_force + lag_vel_model = lag_params%vel_model + lag_drag_model = lag_params%drag_model + end if + + $:GPU_UPDATE(device='[moving_lag_particles, lag_pressure_force, & + & lag_gravity_force, lag_vel_model, lag_drag_model]') + + ! Allocate cell list arrays for atomic-free Gaussian smearing + @:ALLOCATE(cell_list_start(0:m, 0:n, 0:p)) + @:ALLOCATE(cell_list_count(0:m, 0:n, 0:p)) + @:ALLOCATE(cell_list_idx(1:lag_params%nParticles_glb)) + + call s_read_input_particles(q_cons_vf, bc_type) + + call s_reset_cell_vars() + + $:GPU_PARALLEL_LOOP(private='[k,cell,myR,myPos,myVel,myForce,func_sum]',copyin='[only_beta]') + do k = 1, n_el_particles_loc + + cell = fd_number - buff_size + call s_locate_cell(particle_pos(k, 1:3, 1), cell, particle_s(k, 1:3, 1)) + + myR = particle_R0(k) + myPos = particle_pos(k, 1:3, 1) + myVel = particle_vel(k, 1:3, 1) + myForce = f_p(k, :) + !Compute the total gaussian contribution for each particle for normalization + call s_compute_gaussian_contribution(myR, myPos, cell, func_sum) + gSum(k) = func_sum + + call s_gaussian_atomic(myR, myVel, myPos, myForce, func_sum, cell, q_particles, only_beta) + + end do + $:END_GPU_PARALLEL_LOOP() + + call s_finalize_beta_field(bc_type, only_beta) + + npts = (nWeights_interp - 1)/2 + call s_compute_barycentric_weights(npts) !For interpolation + + npts = (nWeights_grad - 1)/2 + call s_compute_fornberg_fd_weights(npts) !For finite differences + + if (lag_params%solver_approach == 2) then + if (save_count == 0) then + + !> Correcting initial conditions so they account for particles + $:GPU_PARALLEL_LOOP(private='[i,j,k,dyn_pres,fluid_vel,rho_f,alpharho,rhou,alpharhou]', collapse=3, copyin = '[pi_inf, qv, gamma, rhoYks]') + do k = idwint(3)%beg, idwint(3)%end + do j = idwint(2)%beg, idwint(2)%end + do i = idwint(1)%beg, idwint(1)%end + !!!!!!!!! 
Mass + do l = 1, num_fluids !num_fluid is just 1 right now + rho_f = q_cons_vf(l)%sf(i, j, k) + alpharho = q_particles(alphaf_id)%sf(i, j, k)*rho_f + q_cons_vf(l)%sf(i, j, k) = alpharho + end do + + !!!!!!!!! Momentum + dyn_pres = 0._wp + do l = momxb, momxe + fluid_vel(l - momxb + 1) = q_cons_vf(l)%sf(i, j, k)/rho_f + rhou = q_cons_vf(l)%sf(i, j, k) + alpharhou = q_particles(alphaf_id)%sf(i, j, k)*rhou + q_cons_vf(l)%sf(i, j, k) = alpharhou + dyn_pres = dyn_pres + q_cons_vf(l)%sf(i, j, k)* & + fluid_vel(l - momxb + 1)/2._wp + end do + + !!!!!!!!!Energy + call s_compute_pressure(q_cons_vf(E_idx)%sf(i, j, k), & + q_cons_vf(alf_idx)%sf(i, j, k), & + dyn_pres, pi_inf, gamma, alpharho, & + qv, rhoYks, pres, T) + + q_cons_vf(E_idx)%sf(i, j, k) = & + gamma*pres + dyn_pres + pi_inf + qv !Updating energy in cons + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + + end if + end if + + end subroutine s_initialize_particles_EL_module + + !> The purpose of this procedure is to obtain the initial bubbles' information + !! @param q_cons_vf Conservative variables + impure subroutine s_read_input_particles(q_cons_vf, bc_type) + + type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf + type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type + + real(wp), dimension(8) :: inputParticle + real(wp) :: qtime + integer :: id, particle_id, save_count + integer :: i, ios + logical :: file_exist, indomain + integer, dimension(3) :: cell + + character(LEN=path_len + 2*name_len) :: path_D_dir !< + + ! Initialize number of particles + particle_id = 0 + id = 0 + + ! Read the input lag_bubble file or restart point + if (cfl_dt) then + save_count = n_start + qtime = n_start*t_save + else + save_count = t_step_start + qtime = t_step_start*dt + end if + + if (save_count == 0) then + if (proc_rank == 0) print *, 'Reading lagrange particles input file.' 
+ call my_inquire(trim(lag_params%input_path), file_exist) + if (file_exist) then + open (94, file=trim(lag_params%input_path), form='formatted', iostat=ios) + do while (ios == 0) + read (94, *, iostat=ios) (inputParticle(i), i=1, 8) + if (ios /= 0) cycle + indomain = particle_in_domain_physical(inputParticle(1:3)) + id = id + 1 + if (id > lag_params%nParticles_glb .and. proc_rank == 0) then + call s_mpi_abort("Current number of particles is larger than nParticles_glb") + end if + if (indomain) then + particle_id = particle_id + 1 + call s_add_particles(inputParticle, q_cons_vf, particle_id) + lag_part_id(particle_id, 1) = id !global ID + lag_part_id(particle_id, 2) = particle_id !local ID + n_el_particles_loc = particle_id ! local number of particles + end if + end do + close (94) + else + call s_mpi_abort("Initialize the lagrange particles in "//trim(lag_params%input_path)) + end if + else + if (proc_rank == 0) print *, 'Restarting lagrange particles at save_count: ', save_count + call s_restart_bubbles(particle_id, save_count) + end if + + print *, " Lagrange parrticles running, in proc", proc_rank, "number:", particle_id, "/", id + + if (num_procs > 1) then + call s_mpi_reduce_int_sum(n_el_particles_loc, n_el_particles_glb) + else + n_el_particles_glb = n_el_particles_loc + end if + + if (proc_rank == 0) then + if (n_el_particles_glb == 0) call s_mpi_abort('No particles in the domain. 
Check '//trim(lag_params%input_path)) + end if + + $:GPU_UPDATE(device='[particles_lagrange, lag_params]') + + $:GPU_UPDATE(device='[lag_part_id,particle_R0,Rmax_stats_part,Rmin_stats_part,particle_mass, & + & f_p,p_AM,p_owner_rank,gid_to_local, & + & particle_rad,particle_pos,particle_posPrev,particle_vel, & + & particle_s,particle_draddt, & + & particle_dposdt,particle_dveldt,n_el_particles_loc]') + + Rmax_glb = min(dflt_real, -dflt_real) + Rmin_glb = max(dflt_real, -dflt_real) + $:GPU_UPDATE(device='[Rmax_glb, Rmin_glb]') + + $:GPU_UPDATE(device='[dx,dy,dz,x_cb,x_cc,y_cb,y_cc,z_cb,z_cc]') + + !Populate temporal variables + call s_transfer_data_to_tmp_particles() + + if (save_count == 0) then + ! Create ./D directory + if (proc_rank == 0) then + write (path_D_dir, '(A,I0,A,I0)') trim(case_dir)//'/D' + call my_inquire(trim(path_D_dir), file_exist) + if (.not. file_exist) call s_create_directory(trim(path_D_dir)) + end if + call s_mpi_barrier() + call s_write_restart_lag_particles(save_count) ! Needed for post_processing + if (lag_params%write_void_evol) call s_write_void_evol_particles(qtime) + end if + + if (lag_params%write_bubbles) call s_write_lag_particle_evol(qtime) + + end subroutine s_read_input_particles + + !> The purpose of this procedure is to obtain the information of the particles when starting fresh + !! @param inputPart Particle information + !! @param q_cons_vf Conservative variables + !! 
    !! @param part_id Local id of the particle
    impure subroutine s_add_particles(inputPart, q_cons_vf, part_id)

        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
        real(wp), dimension(8), intent(in) :: inputPart
        integer, intent(in) :: part_id
        integer :: i

        real(wp) :: pliq, volparticle, concvap, totalmass, kparticle, cpparticle
        real(wp) :: omegaN_local, PeG, PeT, rhol, pcrit, qv, gamma, pi_inf, dynP
        integer, dimension(3) :: cell
        real(wp), dimension(2) :: Re
        real(wp) :: massflag, heatflag, Re_trans, Im_trans, myR, func_sum
        real(wp), dimension(3) :: myPos, myVel

        ! Flags become 1 when the corresponding transfer model is active
        massflag = 0._wp
        heatflag = 0._wp
        if (lag_params%massTransfer_model) massflag = 1._wp
        if (lag_params%heatTransfer_model) heatflag = 1._wp

        ! Input record layout: (1:3) position, (4:6) velocity, (7) initial radius
        particle_R0(part_id) = inputPart(7)
        ! dflt_real is a large-magnitude sentinel; min/max with its negation
        ! initializes the per-particle radius stats to "no data yet"
        Rmax_stats_part(part_id) = min(dflt_real, -dflt_real)
        Rmin_stats_part(part_id) = max(dflt_real, -dflt_real)
        particle_rad(part_id, 1) = inputPart(7)
        particle_pos(part_id, 1:3, 1) = inputPart(1:3)
        particle_posPrev(part_id, 1:3, 1) = particle_pos(part_id, 1:3, 1)
        particle_vel(part_id, 1:3, 1) = inputPart(4:6)

        !Initialize Particle Sources
        f_p(part_id, 1:3) = 0._wp
        p_AM(part_id) = 0._wp
        p_owner_rank(part_id) = proc_rank
        gid_to_local(part_id) = -1

        ! 2D axisymmetric: collapse the (y,z) input into a radial coordinate
        if (cyl_coord .and. p == 0) then
            particle_pos(part_id, 2, 1) = sqrt(particle_pos(part_id, 2, 1)**2._wp + &
                                               particle_pos(part_id, 3, 1)**2._wp)
            !Storing azimuthal angle (-Pi to Pi)) into the third coordinate variable
            particle_pos(part_id, 3, 1) = atan2(inputPart(3), inputPart(2))
            particle_posPrev(part_id, 1:3, 1) = particle_pos(part_id, 1:3, 1)
        end if

        cell = fd_number - buff_size
        call s_locate_cell(particle_pos(part_id, 1:3, 1), cell, particle_s(part_id, 1:3, 1))

        ! Check if the particle is located in the ghost cell of a symmetric, or wall boundary
        if ((any(bc_x%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. cell(1) < 0) .or. &
            (any(bc_x%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. cell(1) > m) .or. &
            (any(bc_y%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. cell(2) < 0) .or. &
            (any(bc_y%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. cell(2) > n)) then
            call s_mpi_abort("Lagrange particle is in the ghost cells of a symmetric or wall boundary.")
        end if

        if (p > 0) then
            if ((any(bc_z%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. cell(3) < 0) .or. &
                (any(bc_z%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. cell(3) > p)) then
                call s_mpi_abort("Lagrange particle is in the ghost cells of a symmetric or wall boundary.")
            end if
        end if

        ! Initial particle mass
        volparticle = 4._wp/3._wp*pi*particle_R0(part_id)**3 ! volume
        particle_mass(part_id) = volparticle*rho0ref_particle ! mass
        if (particle_mass(part_id) <= 0._wp) then
            call s_mpi_abort("The initial particle mass is negative or zero. Check the particle file.")
        end if

    end subroutine s_add_particles

    !> The purpose of this procedure is to obtain the information of the bubbles from a restart point.
    !! @param part_id Local ID of the particle
    !!
    !! @param save_count File identifier
    impure subroutine s_restart_bubbles(part_id, save_count)

        integer, intent(inout) :: part_id, save_count

        character(LEN=path_len + 2*name_len) :: file_loc
        real(wp) :: file_time, file_dt
        integer :: file_num_procs, file_tot_part, tot_part

#ifdef MFC_MPI
        real(wp), dimension(20) :: inputvals
        integer, dimension(MPI_STATUS_SIZE) :: status
        integer(kind=MPI_OFFSET_KIND) :: disp
        integer :: view

        integer, dimension(3) :: cell
        logical :: indomain, particle_file, file_exist

        integer, dimension(2) :: gsizes, lsizes, start_idx_part
        integer :: ifile, ierr, tot_data, id
        integer :: i

        integer, dimension(:), allocatable :: proc_particle_counts
        real(wp), dimension(1:1, 1:lag_io_vars) :: dummy
        dummy = 0._wp

        ! Construct file path
        write (file_loc, '(A,I0,A)') 'lag_bubbles_', save_count, '.dat'
        file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)

        ! Check if file exists
        inquire (FILE=trim(file_loc), EXIST=file_exist)
        if (.not. file_exist) then
            call s_mpi_abort('Restart file '//trim(file_loc)//' does not exist!')
        end if

        if (.not. parallel_io) return

        ! Rank 0 reads the scalar header (total particles, time, dt, writer
        ! rank count), then broadcasts it to all ranks
        if (proc_rank == 0) then
            call MPI_FILE_OPEN(MPI_COMM_SELF, file_loc, MPI_MODE_RDONLY, &
                               mpi_info_int, ifile, ierr)

            call MPI_FILE_READ(ifile, file_tot_part, 1, MPI_INTEGER, status, ierr)
            call MPI_FILE_READ(ifile, file_time, 1, mpi_p, status, ierr)
            call MPI_FILE_READ(ifile, file_dt, 1, mpi_p, status, ierr)
            call MPI_FILE_READ(ifile, file_num_procs, 1, MPI_INTEGER, status, ierr)

            call MPI_FILE_CLOSE(ifile, ierr)
        end if

        call MPI_BCAST(file_tot_part, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(file_time, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(file_dt, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(file_num_procs, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)

        allocate (proc_particle_counts(file_num_procs))

        if (proc_rank == 0) then
            call MPI_FILE_OPEN(MPI_COMM_SELF, file_loc, MPI_MODE_RDONLY, &
                               mpi_info_int, ifile, ierr)

            ! Skip to processor counts position
            disp = int(sizeof(file_tot_part) + 2*sizeof(file_time) + sizeof(file_num_procs), &
                       MPI_OFFSET_KIND)
            call MPI_FILE_SEEK(ifile, disp, MPI_SEEK_SET, ierr)
            call MPI_FILE_READ(ifile, proc_particle_counts, file_num_procs, MPI_INTEGER, status, ierr)

            call MPI_FILE_CLOSE(ifile, ierr)
        end if

        call MPI_BCAST(proc_particle_counts, file_num_procs, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)

        ! Set time variables from file
        mytime = file_time
        dt = file_dt

        part_id = proc_particle_counts(proc_rank + 1)

        ! Exclusive prefix sum: this rank's offset into the global particle list
        start_idx_part(1) = 0
        do i = 1, proc_rank
            start_idx_part(1) = start_idx_part(1) + proc_particle_counts(i)
        end do

        start_idx_part(2) = 0
        lsizes(1) = part_id
        lsizes(2) = lag_io_vars

        gsizes(1) = file_tot_part
        gsizes(2) = lag_io_vars

        if (part_id > 0) then

            allocate (MPI_IO_DATA_lag_bubbles(part_id, 1:lag_io_vars))

            call MPI_TYPE_CREATE_SUBARRAY(2, gsizes, lsizes, start_idx_part, &
                                          MPI_ORDER_FORTRAN, mpi_p, view, ierr)
            call MPI_TYPE_COMMIT(view, ierr)

            call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, &
                               mpi_info_int, ifile, ierr)

            ! Skip extended header
            disp = int(sizeof(file_tot_part) + 2*sizeof(file_time) + sizeof(file_num_procs) + &
                       file_num_procs*sizeof(proc_particle_counts(1)), MPI_OFFSET_KIND)
            call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, view, 'native', mpi_info_int, ierr)

            call MPI_FILE_READ_ALL(ifile, MPI_IO_DATA_lag_bubbles, &
                                   lag_io_vars*part_id, mpi_p, status, ierr)

            call MPI_FILE_CLOSE(ifile, ierr)
            call MPI_TYPE_FREE(view, ierr)

            n_el_particles_loc = part_id

            ! Unpack per-particle records; commented columns are bubble-only
            ! fields not used by solid particles
            do i = 1, part_id
                lag_part_id(i, 1) = int(MPI_IO_DATA_lag_bubbles(i, 1))
                particle_pos(i, 1:3, 1) = MPI_IO_DATA_lag_bubbles(i, 2:4)
                particle_posPrev(i, 1:3, 1) = MPI_IO_DATA_lag_bubbles(i, 5:7)
                particle_vel(i, 1:3, 1) = MPI_IO_DATA_lag_bubbles(i, 8:10)
                particle_rad(i, 1) = MPI_IO_DATA_lag_bubbles(i, 11)
                ! intfc_vel(i, 1) = MPI_IO_DATA_lag_bubbles(i, 12)
                particle_R0(i) = MPI_IO_DATA_lag_bubbles(i, 13)
                Rmax_stats_part(i) = MPI_IO_DATA_lag_bubbles(i, 14)
                Rmin_stats_part(i) = MPI_IO_DATA_lag_bubbles(i, 15)
                ! bub_dphidt(i) = MPI_IO_DATA_lag_bubbles(i, 16)
                ! gas_p(i, 1) = MPI_IO_DATA_lag_bubbles(i, 17)
                ! gas_mv(i, 1) = MPI_IO_DATA_lag_bubbles(i, 18)
                particle_mass(i) = MPI_IO_DATA_lag_bubbles(i, 19)
                ! gas_betaT(i) = MPI_IO_DATA_lag_bubbles(i, 20)
                ! gas_betaC(i) = MPI_IO_DATA_lag_bubbles(i, 21)
                ! NOTE(review): other call sites seed the cell search with
                ! fd_number - buff_size; confirm the -buff_size seed is intended
                cell = -buff_size
                call s_locate_cell(particle_pos(i, 1:3, 1), cell, particle_s(i, 1:3, 1))
            end do

            deallocate (MPI_IO_DATA_lag_bubbles)

        else
            n_el_particles_loc = 0

            ! Ranks with no particles still take part in the collective read
            ! using a zero-length datatype
            call MPI_TYPE_CONTIGUOUS(0, mpi_p, view, ierr)
            call MPI_TYPE_COMMIT(view, ierr)

            call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, &
                               mpi_info_int, ifile, ierr)

            ! Skip extended header
            disp = int(sizeof(file_tot_part) + 2*sizeof(file_time) + sizeof(file_num_procs) + &
                       file_num_procs*sizeof(proc_particle_counts(1)), MPI_OFFSET_KIND)
            call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, view, 'native', mpi_info_int, ierr)

            call MPI_FILE_READ_ALL(ifile, dummy, 0, mpi_p, status, ierr)

            call MPI_FILE_CLOSE(ifile, ierr)
            call MPI_TYPE_FREE(view, ierr)
        end if

        if (proc_rank == 0) then
            write (*, '(A,I0,A,I0)') 'Read ', file_tot_part, ' particles from restart file at t_step = ', save_count
            write (*, '(A,E15.7,A,E15.7)') 'Restart time = ', mytime, ', dt = ', dt
        end if

        deallocate (proc_particle_counts)
#endif

    end subroutine s_restart_bubbles

    !> Contains the particle dynamics subroutines.
    !! @param q_cons_vf Conservative variables
    !! @param q_prim_vf Primitive variables
    !! @param rhs_vf Calculated change of conservative variables
    !! @param t_step Current time step
    !!
@param stage Current stage in the time-stepper algorithm + subroutine s_compute_particle_EL_dynamics(q_prim_vf, bc_type, stage, vL_x, vL_y, vL_z, vR_x, vR_y, vR_z, rhs_vf) + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type + type(scalar_field), dimension(sys_size), intent(in) :: rhs_vf + integer, intent(in) :: stage + real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: vL_x, vL_y, vL_z + real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: vR_x, vR_y, vR_z + + integer, dimension(3) :: cell, cellijk + real(wp) :: myMass, myR, myBeta_c, myBeta_t, myR0, myRe, mydrhodt, myVolumeFrac, myGamma, rmass_add, func_sum + real(wp), dimension(3) :: myVel, myPos, force_vec, s_cell + logical :: only_beta = .false. + + integer :: k, l, i, j + + if (lag_params%pressure_force .or. lag_params%added_mass_model > 0) then + do l = 1, num_dims + if (l == 1) then + call s_gradient_field(vL_x, vR_x, field_vars(dPx_id)%sf, l, E_idx) + elseif (l == 2) then + call s_gradient_field(vL_y, vR_y, field_vars(dPy_id)%sf, l, E_idx) + elseif (l == 3) then + call s_gradient_field(vL_z, vR_z, field_vars(dPz_id)%sf, l, E_idx) + end if + end do + end if + + if (lag_params%added_mass_model > 0) then + + do l = 1, num_dims + if (l == 1) then + call s_gradient_field(vL_x, vR_x, field_vars(drhox_id)%sf, l, 1) + ! call s_gradient_field(vL_x, vR_x, field_vars(dufx_id)%sf, l, momxb) + elseif (l == 2) then + call s_gradient_field(vL_y, vR_y, field_vars(drhoy_id)%sf, l, 1) + ! call s_gradient_field(vL_y, vR_y, field_vars(dufy_id)%sf, l, momxb+1) + elseif (l == 3) then + call s_gradient_field(vL_z, vR_z, field_vars(drhoz_id)%sf, l, 1) + ! 
call s_gradient_field(vL_z, vR_z, field_vars(dufz_id)%sf, l, momxb+2) + end if + end do + + end if + + myGamma = (1._wp/fluid_pp(1)%gamma) + 1._wp + myRe = 1.845e-5_wp !fluid_pp(1)%Re(1) !Need a viscosity model for when modeling inviscid eulerian fluid !< Dynamic viscosity + + call nvtxStartRange("LAGRANGE-PARTICLE-DYNAMICS") + + !> Compute Fluid-Particle Forces (drag/pressure/added mass) and convert to particle acceleration + $:GPU_PARALLEL_LOOP(private='[i,k,l,cell,s_cell,myMass,myR,myR0,myPos,myVel,myVolumeFrac,force_vec,rmass_add,func_sum,mydrhodt]',& + & copyin='[stage, myGamma, myRe, only_beta]') + do k = 1, n_el_particles_loc + + f_p(k, :) = 0._wp + p_owner_rank(k) = proc_rank + + s_cell = particle_s(k, 1:3, 2) + cell = int(s_cell(:)) + do i = 1, num_dims + if (s_cell(i) < 0._wp) cell(i) = cell(i) - 1 + end do + + ! Current particle state + myMass = particle_mass(k) + myR = particle_rad(k, 2) + myR0 = particle_R0(k) + myPos = particle_pos(k, :, 2) + myVel = particle_vel(k, :, 2) + myVolumeFrac = 1._wp - q_particles(alphaf_id)%sf(cell(1), cell(2), cell(3)) + mydrhodt = rhs_vf(1)%sf(cell(1), cell(2), cell(3)) + + particle_dposdt(k, :, stage) = 0._wp + particle_dveldt(k, :, stage) = 0._wp + particle_draddt(k, stage) = 0._wp + + call s_get_particle_force(myPos, myR, myVel, myMass, myRe, myGamma, myVolumeFrac, mydrhodt, cell, & + q_prim_vf, field_vars, weights_x_interp, weights_y_interp, weights_z_interp, & + force_vec, rmass_add) + + p_AM(k) = rMass_add + f_p(k, :) = f_p(k, :) + force_vec(:) + + if (.not. 
lag_params%collision_force) then + myMass = particle_mass(k) + p_AM(k) + myVel = particle_vel(k, :, 2) + do l = 1, num_dims + particle_dposdt(k, l, stage) = myVel(l) + particle_dveldt(k, l, stage) = f_p(k, l)/myMass + particle_draddt(k, stage) = 0._wp + end do + end if + + if (lag_params%solver_approach == 2) then + func_sum = gSum(k) + call s_gaussian_atomic(myR, myVel, myPos, force_vec, func_sum, cell, q_particles, only_beta) + end if + + end do + $:END_GPU_PARALLEL_LOOP() + + if (lag_params%solver_approach == 2) then + call s_finalize_beta_field(bc_type, only_beta) + end if + + call nvtxStartRange("LAGRANGE-PARTICLE-COLLISIONS") + if (lag_params%collision_force) then + !> Compute Particle-Particle collision forces + call s_compute_particle_EL_collisions(stage, bc_type) + + $:GPU_PARALLEL_LOOP(private='[k,l,myMass,myVel]') + do k = 1, n_el_particles_loc + myMass = particle_mass(k) + p_AM(k) + myVel = particle_vel(k, :, 2) + do l = 1, num_dims + particle_dposdt(k, l, stage) = myVel(l) + particle_dveldt(k, l, stage) = f_p(k, l)/myMass + particle_draddt(k, stage) = 0._wp + end do + end do + $:END_GPU_PARALLEL_LOOP() + + end if + call nvtxEndRange + + call nvtxEndRange + + end subroutine s_compute_particle_EL_dynamics + + !> Contains the particle collision force computation. 
    subroutine s_compute_particle_EL_collisions(stage, bc_type)

        integer, intent(in) :: stage
        type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type
        integer, dimension(3) :: cell
        real(wp), dimension(3) :: s_cell
        integer, dimension(3) :: cellaux
        integer :: i, k, l, q, ip, jp, kp, ii, jj, kk
        logical :: celloutside
        real(wp) :: pidtksp2, ksp, nu1, nu2, Rp1, Rp2, E1, E2, Estar, cor, rmag, Rstar, dij, eta_n, kappa_n, mp1, mp2, dt_loc
        real(wp), dimension(3) :: xp1, xp2, vp1, vp2, v_rel, rpij, nij, vnij, Fnpp_ij, force_vec
        integer :: kpz
        integer :: total_recv
        integer :: glb_id, count
        integer :: n_el_particles_loc_before_ghost

        ! Temporarily append ghost copies of remote particles so cross-rank
        ! pairs are visible to the local sweep
        if (num_procs > 1) then
            n_el_particles_loc_before_ghost = n_el_particles_loc
            call s_reset_force_buffers()
            call s_add_ghost_particles()
        end if

        ! if (lag_num_ts == 1) then
        !     dt_loc = dt
        ! elseif (lag_num_ts == 2) then
        !     if (stage == 1) then
        !         dt_loc = dt
        !     elseif (stage == 2) then
        !         dt_loc = dt/2._wp
        !     end if
        ! elseif (lag_num_ts == 3) then
        !     if (stage == 1) then
        !         dt_loc = dt
        !     elseif (stage == 2) then
        !         dt_loc = dt/4._wp
        !     elseif (stage == 3) then
        !         dt_loc = (2._wp/3._wp)*dt
        !     end if
        ! end if

        ! Neighbor stencil extends in z only for 3D runs
        kpz = 0
        if (num_dims == 3) kpz = 1

        ! Soft-sphere contact parameters (spring stiffness, Poisson ratios,
        ! Young's moduli, coefficient of restitution)
        ksp = 10._wp
        nu1 = 0.35_wp
        nu2 = 0.35_wp
        E1 = 1.e9_wp
        E2 = 1.e9_wp
        cor = 0.7_wp

        pidtksp2 = (pi**2)/((dt*ksp)**2)

        ! Effective modulus for Hertzian contact
        Estar = 1._wp/(((1._wp - nu1**2)/E1) + ((1._wp - nu2**2)/E2))
        Estar = (4._wp/3._wp)*Estar

        call s_reset_linked_list()

        call nvtxStartRange("LAGRANGE-PARTICLE-COLLISIONS")
        error_flag = 0
        $:GPU_UPDATE(device='[error_flag]')

        $:GPU_PARALLEL_LOOP(private='[i,k,cell,ip,jp,kp,Rp1,xp1,mp1,vp1,kk,jj,ii,cellaux,q,Rp2,xp2,mp2,vp2,v_rel,Rstar,rpij,rmag,nij,vnij,dij,kappa_n,eta_n,Fnpp_ij,force_vec,s_cell,celloutside,count]',&
            & copyin='[ksp,nu1,nu2,E1,E2,cor,pidtksp2,Estar,kpz]')
        do k = 1, n_el_particles_loc

            ! Ghost particles outside the physical domain do not initiate pairs
            if (.not. particle_in_domain_physical(particle_pos(k, 1:3, 2))) then
                cycle
            end if

            ! Fractional cell coordinate -> integer cell index (floor)
            s_cell = particle_s(k, 1:3, 2)
            cell = int(s_cell(:))
            do i = 1, num_dims
                if (s_cell(i) < 0._wp) cell(i) = cell(i) - 1
            end do

            ip = cell(1)
            jp = cell(2)
            kp = cell(3)

            Rp1 = particle_rad(k, 2)
            xp1 = particle_pos(k, :, 2)
            mp1 = particle_mass(k)
            vp1 = particle_vel(k, :, 2)

            ! Sweep the 3x3(x3) cell neighborhood around the particle
            do kk = kp - kpz, kp + kpz
                do jj = jp - 1, jp + 1
                    do ii = ip - 1, ip + 1

                        cellaux(1) = ii
                        cellaux(2) = jj
                        cellaux(3) = kk

                        call s_check_celloutside_wbuff(cellaux, celloutside)

                        if (.not. celloutside) then
                            q = particle_head(ii, jj, kk)
                            ! Traverse linked list in that cell

                            ! Guard against corrupted lists: a chain longer
                            ! than the particle count implies a cycle
                            count = 0
                            do while (q /= -1)

                                count = count + 1
                                if (count > n_el_particles_loc) then
                                    $:GPU_ATOMIC(atomic='write')
                                    error_flag = 1
                                    exit
                                end if

                                ! Global-id ordering ensures each pair is
                                ! processed exactly once
                                if (lag_part_id(q, 1) > lag_part_id(k, 1)) then

                                    Rp2 = particle_rad(q, 2)
                                    xp2 = particle_pos(q, :, 2)
                                    mp2 = particle_mass(q)
                                    vp2 = particle_vel(q, :, 2)
                                    v_rel = vp2 - vp1

                                    Rstar = (Rp1*Rp2)/(Rp1 + Rp2)
                                    rpij = xp2 - xp1
                                    rmag = sqrt(rpij(1)**2 + rpij(2)**2 + rpij(3)**2)
                                    rmag = max(rmag, eps_overlap)
                                    nij = rpij/rmag
                                    vnij = dot_product(v_rel, nij)*nij
                                    dij = (Rp1 + Rp2) - rmag

                                    ! dij > 0 means the spheres overlap by dij
                                    if (dij > 0._wp) then

                                        kappa_n = min((pidtksp2*mp1), (pidtksp2*mp2), (Estar*sqrt(Rstar)*sqrt(abs(dij))))

                                        eta_n = ((-2._wp*sqrt(kappa_n)*log(cor))/sqrt((log(cor))**2 + pi**2))*(1._wp/sqrt((1._wp/mp1) + (1._wp/mp2)))

                                        Fnpp_ij = -kappa_n*dij*nij - eta_n*vnij

                                        f_p(k, :) = f_p(k, :) + Fnpp_ij

                                        ! Newton's third law on the partner:
                                        ! atomics locally, buffered send remotely
                                        if (p_owner_rank(q) == proc_rank) then
                                            ! f_p(q, :) = f_p(q, :) - Fnpp_ij

                                            $:GPU_ATOMIC(atomic='update')
                                            f_p(q, 1) = f_p(q, 1) - Fnpp_ij(1)

                                            $:GPU_ATOMIC(atomic='update')
                                            f_p(q, 2) = f_p(q, 2) - Fnpp_ij(2)

                                            $:GPU_ATOMIC(atomic='update')
                                            f_p(q, 3) = f_p(q, 3) - Fnpp_ij(3)

                                        else
                                            call s_add_force_to_send_buffer(p_owner_rank(q), lag_part_id(q, 1), -Fnpp_ij)
                                        end if
                                    end if
                                end if

                                q = linked_list(q)

                            end do
                        end if

                    end do
                end do
            end do

            !>Check each local particle for wall collisions

            call s_compute_wall_collisions(xp1, vp1, Rp1, mp1, Estar, pidtksp2, cor, force_vec)
            f_p(k, :) = f_p(k, :) + force_vec

        end do
        $:END_GPU_PARALLEL_LOOP()

        call nvtxEndRange

        $:GPU_UPDATE(host='[error_flag]')
        if (error_flag == 1) then
            call s_mpi_abort("Linked list infinite loop detected")
        end if

        if (num_procs > 1) then

            ! Drop the temporary ghosts appended for this sweep
            n_el_particles_loc = n_el_particles_loc_before_ghost
            $:GPU_UPDATE(device='[n_el_particles_loc]')

            total_recv = 0
            force_recv_ids = 0
            force_recv_vals = 0.

            call s_transfer_collision_forces(total_recv, force_recv_ids, force_recv_vals)

            $:GPU_UPDATE(device = '[force_recv_ids,force_recv_vals]')

            ! Accumulate reaction forces received from other ranks
            $:GPU_PARALLEL_LOOP(private='[i,k]',copyin = '[total_recv]')
            do i = 1, total_recv
                k = gid_to_local(force_recv_ids(i))
                if (k > 0) then
                    $:GPU_ATOMIC(atomic='update')
                    f_p(k, 1) = f_p(k, 1) + force_recv_vals(3*(i - 1) + 1)

                    $:GPU_ATOMIC(atomic='update')
                    f_p(k, 2) = f_p(k, 2) + force_recv_vals(3*(i - 1) + 2)

                    $:GPU_ATOMIC(atomic='update')
                    f_p(k, 3) = f_p(k, 3) + force_recv_vals(3*(i - 1) + 3)

                end if
            end do
            $:END_GPU_PARALLEL_LOOP()

        end if

    end subroutine s_compute_particle_EL_collisions

    !> This subroutine checks for particles at solid walls to compute a collision force
    subroutine s_compute_wall_collisions(pos, vel, rad, mass, Es, pidtksp, core, wcol_force)
        $:GPU_ROUTINE(function_name='s_compute_wall_collisions',parallelism='[seq]', &
            & cray_inline=True)

        real(wp), dimension(3), intent(in) :: pos, vel
        real(wp), intent(in) :: rad, mass, Es, pidtksp, core
        real(wp), dimension(3), intent(inout) :: wcol_force

        real(wp) :: dij

        wcol_force = 0._wp

        ! Check for particles at solid boundaries
        ! dij > 0 means the particle overlaps the wall by dij
        if (any(bc_x%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then
            dij = rad - (pos(1) - x_cb(-1))

            if (dij > 0._wp) then
                call s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, 1, 1._wp, wcol_force)
            end if
        end if

        if (any(bc_x%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then

            dij = rad - (x_cb(m) - pos(1))

            if (dij > 0._wp) then
                call s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, 1, -1._wp, wcol_force)
            end if
        end if

        if (any(bc_y%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then

            dij = rad - (pos(2) - y_cb(-1))

            if (dij > 0._wp) then
                call s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, 2, 1._wp, wcol_force)
            end if
        end if

        if (any(bc_y%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then

            dij = rad - (y_cb(n) - pos(2))

            if (dij > 0._wp) then
                call s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, 2, -1._wp, wcol_force)
            end if
        end if

        if (p > 0) then
            if (any(bc_z%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then

                dij = rad - (pos(3) - z_cb(-1))

                if (dij > 0._wp) then
                    call s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, 3, 1._wp, wcol_force)
                end if
            end if

            if (any(bc_z%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then

                dij = rad - (z_cb(p) - pos(3))

                if (dij > 0._wp) then
                    call s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, 3, -1._wp, wcol_force)
                end if
            end if

        end if

    end subroutine s_compute_wall_collisions

    !> This subroutine computes the collision force with a solid wall
a solid wall + subroutine s_compute_wall_collision_force(dij, vel, rad, mass, Es, pidtksp, core, dir, normal, wcol_force) + $:GPU_ROUTINE(function_name='s_compute_wall_collision_force',parallelism='[seq]', & + & cray_inline=True) + + real(wp), dimension(3), intent(in) :: vel + real(wp), intent(in) :: dij, rad, mass, Es, pidtksp, core, normal + integer, intent(in) :: dir + real(wp), dimension(3), intent(inout) :: wcol_force + + real(wp), dimension(3) :: nij, v_rel, vnij + real(wp) :: kappa_n, eta_n + + ! Normal points away from wall (into domain) + nij = 0._wp + nij(dir) = normal + + ! Relative velocity (wall has zero velocity) + v_rel = vel + vnij = dot_product(v_rel, nij)*nij + + ! Wall has infinite mass so use mp1 only + kappa_n = min((pidtksp*mass), (Es*sqrt(rad)*sqrt(abs(dij)))) + + eta_n = ((-2._wp*sqrt(kappa_n)*log(core))/sqrt((log(core))**2 + pi**2)) & + *(1._wp/sqrt(1._wp/mass)) + + wcol_force = wcol_force + (kappa_n*dij*nij - eta_n*vnij) + + end subroutine s_compute_wall_collision_force + + !> This subroutine adds temporary ghost particles for collision purposes + subroutine s_add_ghost_particles() + + integer :: k, i, q + integer :: patch_id, newBubs + integer, dimension(3) :: cell + logical :: inc_ghost = .true. + + call nvtxStartRange("LAG-GHOSTADD") + call nvtxStartRange("LAG-GHOSTADD-DEV2HOST") + $:GPU_UPDATE(host='[p_owner_rank, particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, & + & lag_part_id, particle_rad, & + & particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, & + & particle_dposdt, particle_dveldt, n_el_particles_loc, & + & wrap_bubble_dir, wrap_bubble_loc]') + call nvtxEndRange + + ! 
Handle MPI transfer of particles going to another processor's local domain + if (num_procs > 1) then + call nvtxStartRange("LAG-GHOSTADD-TRANSFER-LIST") + call s_add_particles_to_transfer_list(n_el_particles_loc, particle_pos(:, :, 2), particle_posPrev(:, :, 2), inc_ghost) + call nvtxEndRange + + call nvtxStartRange("LAG-GHOSTADD-SENDRECV") + call s_mpi_sendrecv_solid_particles(p_owner_rank, particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, & + lag_part_id, & + particle_rad, particle_pos, particle_posPrev, particle_vel, & + particle_s, particle_draddt, particle_dposdt, particle_dveldt, lag_num_ts, n_el_particles_loc, & + 2) + call nvtxEndRange + end if + + call nvtxStartRange("LAG-GHOSTADD-HOST2DEV") + $:GPU_UPDATE(device='[p_owner_rank,particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, & + & lag_part_id, particle_rad, & + & particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, & + & particle_dposdt, particle_dveldt, n_el_particles_loc]') + call nvtxEndRange + + call nvtxEndRange ! LAG-GHOSTADD + + $:GPU_PARALLEL_LOOP(private='[k,cell]') + do k = 1, n_el_particles_loc + + cell = fd_number - buff_size + call s_locate_cell(particle_pos(k, 1:3, 2), cell, particle_s(k, 1:3, 2)) + + end do + $:END_GPU_PARALLEL_LOOP() + + end subroutine s_add_ghost_particles + + !> The purpose of this subroutine is to check if the current cell is outside the computational domain or not (including ghost cells). + !! @param cellaux Tested cell to smear the particle effect in. + !! @param celloutside If true, then cellaux is outside the computational domain. + subroutine s_check_celloutside_wbuff(cellaux, celloutside) + $:GPU_ROUTINE(function_name='s_check_celloutside_wbuff',parallelism='[seq]', & + & cray_inline=True) + + integer, dimension(3), intent(inout) :: cellaux + logical, intent(out) :: celloutside + + celloutside = .false. + + if (num_dims == 2) then + if ((cellaux(1) < -buff_size) .or. 
(cellaux(2) < -buff_size)) then
                celloutside = .true.
            end if

            if ((cellaux(1) > m + buff_size) .or. (cellaux(2) > n + buff_size)) then
                celloutside = .true.
            end if
        else
            if ((cellaux(1) < -buff_size) .or. (cellaux(2) < -buff_size) .or. (cellaux(3) < -buff_size)) then
                celloutside = .true.
            end if

            if ((cellaux(1) > m + buff_size) .or. (cellaux(2) > n + buff_size) .or. (cellaux(3) > p + buff_size)) then
                celloutside = .true.
            end if
        end if

    end subroutine s_check_celloutside_wbuff

    !> The purpose of this subroutine is to obtain the bubble source terms based on Maeda and Colonius (2018)
    !! and add them to the RHS scalar field.
    !! @param q_cons_vf Conservative variables
    !! @param q_prim_vf Primitive variables
    !! @param rhs_vf Time derivative of the conservative variables
    !! @param stage Current time-stepper stage (kept for interface uniformity with the other RHS routines)
    subroutine s_compute_particles_EL_source(q_cons_vf, q_prim_vf, rhs_vf, stage)

        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
        type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
        integer, intent(in) :: stage

        integer :: i, j, k, l
        real(wp) :: dalphapdt, alpha_f, udot_gradalpha

        ! Spatial derivative of the fluid volume fraction and eulerian particle momentum fields.
        do l = 1, num_dims
            call s_gradient_dir_fornberg(q_particles(alphaf_id)%sf, field_vars(dalphafx_id + l - 1)%sf, l)
            call s_gradient_dir_fornberg(q_particles(alphaupx_id + l - 1)%sf, field_vars(dalphap_upx_id + l - 1)%sf, l)
        end do

        !> Apply particle sources to the Eulerian RHS
        ! BUG FIX: 'l' is used as an inner loop index inside this parallel region and
        ! must be listed as private; a shared 'l' is a data race across GPU threads.
        $:GPU_PARALLEL_LOOP(private='[i,j,k,l,alpha_f,dalphapdt,udot_gradalpha]', collapse=3)
        do k = idwint(3)%beg, idwint(3)%end
            do j = idwint(2)%beg, idwint(2)%end
                do i = idwint(1)%beg, idwint(1)%end
                    ! Only apply sources where enough liquid remains (alpha_f above the floor).
                    if (q_particles(alphaf_id)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then

                        alpha_f = q_particles(alphaf_id)%sf(i, j, k)

                        ! d(alpha_p)/dt = -div(alpha_p u_p); the divergence terms were
                        ! precomputed above into field_vars(dalphap_upx_id + ...).
                        dalphapdt = 0._wp
                        udot_gradalpha = 0._wp
                        do l = 1, num_dims
                            dalphapdt = dalphapdt + field_vars(dalphap_upx_id + l - 1)%sf(i, j, k)
                            udot_gradalpha = udot_gradalpha + q_prim_vf(momxb + l - 1)%sf(i, j, k)*field_vars(dalphafx_id + l - 1)%sf(i, j, k)
                        end do
                        dalphapdt = -dalphapdt
                        ! Add any contribution to dalphapdt from particles growing or shrinking

                        !> Step 1: Source terms for volume fraction corrections
                        ! cons_var/alpha_f * (dalpha_p/dt - u dot grad(alpha_f))
                        do l = 1, E_idx
                            rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + &
                                (q_cons_vf(l)%sf(i, j, k)/alpha_f)*(dalphapdt - udot_gradalpha)
                        end do

                        ! momentum term -1/alpha_f * (p*grad(alpha_f) - Tau^v dot grad(alpha_f)) !Viscous term not implemented
                        do l = 1, num_dims
                            rhs_vf(momxb + l - 1)%sf(i, j, k) = rhs_vf(momxb + l - 1)%sf(i, j, k) - &
                                ((1._wp/alpha_f)* &
                                 (q_prim_vf(E_idx)%sf(i, j, k)*field_vars(dalphafx_id + l - 1)%sf(i, j, k)))
                        end do

                        ! energy term -1/alpha_f * (p*u dot grad(alpha_f) - (Tau^v dot u) dot grad(alpha_f)) !Viscous term not implemented
                        rhs_vf(E_idx)%sf(i, j, k) = rhs_vf(E_idx)%sf(i, j, k) - &
                            ((1._wp/alpha_f)* &
                             (q_prim_vf(E_idx)%sf(i, j, k)*udot_gradalpha))

                        !> Step 2: Add the drag/pressure/added mass forces to the fluid
                        rhs_vf(momxb)%sf(i, j, k) = rhs_vf(momxb)%sf(i, j, k) + q_particles(Smx_id)%sf(i, j, k)*(1._wp/alpha_f)
                        rhs_vf(momxb + 1)%sf(i, j, k) = rhs_vf(momxb + 1)%sf(i, j, k) + q_particles(Smy_id)%sf(i, j, k)*(1._wp/alpha_f)

                        ! Energy source: work done by the interphase momentum exchange plus direct energy exchange
                        rhs_vf(E_idx)%sf(i, j, k) = rhs_vf(E_idx)%sf(i, j, k) + &
                            (q_particles(Smx_id)%sf(i, j, k)*q_prim_vf(momxb)%sf(i, j, k) &
                             + q_particles(Smy_id)%sf(i, j, k)*q_prim_vf(momxb + 1)%sf(i, j, k) &
                             + q_particles(SE_id)%sf(i, j, k))*(1._wp/alpha_f)

                        if (num_dims == 3) then
                            rhs_vf(momxb + 2)%sf(i, j, k) = rhs_vf(momxb + 2)%sf(i, j, k) + q_particles(Smz_id)%sf(i, j, k)*(1._wp/alpha_f)
                            ! Energy source (z-contribution)
                            rhs_vf(E_idx)%sf(i, j, k) = rhs_vf(E_idx)%sf(i, j, k) + &
                                (q_particles(Smz_id)%sf(i, j, k)*q_prim_vf(momxb + 2)%sf(i, j, k))*(1._wp/alpha_f)
                        end if

                    end if
                end do
            end do
        end do
        $:END_GPU_PARALLEL_LOOP()

    end subroutine s_compute_particles_EL_source

    !> Clears the cell-to-particle linked list (heads and next pointers) and rebuilds it
    !! from the current particle computational coordinates.
    subroutine s_reset_linked_list()

        integer :: j, k, l

        ! -1 marks "empty list" / "end of chain"
        $:GPU_PARALLEL_LOOP(private='[j,k,l]', collapse=3)
        do l = idwbuff(3)%beg, idwbuff(3)%end
            do k = idwbuff(2)%beg, idwbuff(2)%end
                do j = idwbuff(1)%beg, idwbuff(1)%end
                    particle_head(j, k, l) = -1
                end do
            end do
        end do
        $:END_GPU_PARALLEL_LOOP()

        $:GPU_PARALLEL_LOOP(private='[k]')
        do k = 1, n_el_particles_loc
            linked_list(k) = -1
        end do
        $:END_GPU_PARALLEL_LOOP()

        call s_build_linked_list()

    end subroutine s_reset_linked_list

    !> Zeroes all Eulerian particle-coupling fields (field_vars, q_particles and the
    !! Kahan-summation compensation arrays) over the whole buffered domain.
    subroutine s_reset_cell_vars()

        integer :: i, j, k, l

        $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4)
        do i = 1, max(nField_vars, q_particles_idx) ! outermost is largest of the i-like dims
            do l = idwbuff(3)%beg, idwbuff(3)%end
                do k = idwbuff(2)%beg, idwbuff(2)%end
                    do j = idwbuff(1)%beg, idwbuff(1)%end
                        ! Zero field_vars if i <= nField_vars
                        if (i <= nField_vars) field_vars(i)%sf(j, k, l) = 0._wp
                        !
Zero q_particles if i <= q_particles_idx
                        if (i <= q_particles_idx) then
                            q_particles(i)%sf(j, k, l) = 0._wp
                            kahan_comp(i)%sf(j, k, l) = 0._wp
                        end if
                    end do
                end do
            end do
        end do
        $:END_GPU_PARALLEL_LOOP()

    end subroutine s_reset_cell_vars

    !> Fills/communicates the beta (smeared particle volume fraction) buffers and then
    !! converts the accumulated particle volume into the fluid volume fraction
    !! alpha_f = 1 - beta, floored at 1 - valmaxvoid.
    !! @param bc_type Boundary-condition types per direction/side
    !! @param onlyBeta If true, only the first field (beta) is communicated; otherwise all q_particles fields
    subroutine s_finalize_beta_field(bc_type, onlyBeta)

        type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type
        integer :: j, k, l
        logical, intent(in) :: onlyBeta

        call nvtxStartRange("PARTICLES-LAGRANGE-BETA-COMM")
        if (onlyBeta) then
            call s_populate_beta_buffers(q_particles, bc_type, 1)
        else
            call s_populate_beta_buffers(q_particles, bc_type, q_particles_idx)
        end if
        call nvtxEndRange

        ! Store 1 - q_particles(alphaf_id): beta -> fluid volume fraction
        $:GPU_PARALLEL_LOOP(private='[j,k,l]', collapse=3)
        do l = idwbuff(3)%beg, idwbuff(3)%end
            do k = idwbuff(2)%beg, idwbuff(2)%end
                do j = idwbuff(1)%beg, idwbuff(1)%end
                    q_particles(alphaf_id)%sf(j, k, l) = 1._wp - q_particles(alphaf_id)%sf(j, k, l)
                    ! Limiting void fraction given max value (keeps alpha_f >= 1 - valmaxvoid)
                    q_particles(alphaf_id)%sf(j, k, l) = max(q_particles(alphaf_id)%sf(j, k, l), &
                                                             1._wp - lag_params%valmaxvoid)
                end do
            end do
        end do
        $:END_GPU_PARALLEL_LOOP()

    end subroutine s_finalize_beta_field

    !> Builds the cell-to-particle linked list: for each local particle, prepends its
    !! index to the list of the cell it occupies (atomic capture makes the prepend
    !! safe under concurrent GPU threads). Also refreshes the global-id -> local map.
    subroutine s_build_linked_list()
        integer :: k, glb_id, i
        integer, dimension(3) :: cell
        real(wp), dimension(3) :: s_cell
        logical :: celloutside

        $:GPU_PARALLEL_LOOP(private='[i,k,cell,s_cell,glb_id,celloutside]')
        do k = 1, n_el_particles_loc

            glb_id = lag_part_id(k, 1)
            gid_to_local(glb_id) = k

            ! Cell index = floor of the particle's computational coordinate
            s_cell = particle_s(k, 1:3, 2)
            cell = int(s_cell(:))
            do i = 1, num_dims
                if (s_cell(i) < 0._wp) cell(i) = cell(i) - 1
            end do

            call s_check_celloutside_wbuff(cell, celloutside)

            if (.not. celloutside) then
                !!!!! Particle linked list building
                $:GPU_ATOMIC(atomic='capture')
                linked_list(k) = particle_head(cell(1), cell(2), cell(3))
                particle_head(cell(1), cell(2), cell(3)) = k
                $:END_GPU_ATOMIC_CAPTURE()
            end if

        end do
        $:END_GPU_PARALLEL_LOOP()

    end subroutine s_build_linked_list

    !> This subroutine updates the Lagrange variables using the tvd RK time steppers.
    !! The time derivative of the particle variables must be stored at every stage to avoid precision errors.
    !! @param q_prim_vf Primitive variables
    !! @param bc_type Boundary-condition types per direction/side
    !! @param stage Current tvd RK stage
    impure subroutine s_update_lagrange_particles_tdv_rk(q_prim_vf, bc_type, stage)

        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
        type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type
        integer, intent(in) :: stage

        integer :: k

        if (time_stepper == 1) then ! 1st order TVD RK

            $:GPU_PARALLEL_LOOP(private='[k]')
            do k = 1, n_el_particles_loc
                !u{1} = u{n} + dt * RHS{n}
                particle_rad(k, 1) = particle_rad(k, 1) + dt*particle_draddt(k, 1)
                if (moving_lag_particles) then
                    particle_posPrev(k, 1:3, 1) = particle_pos(k, 1:3, 1)
                    particle_pos(k, 1:3, 1) = particle_pos(k, 1:3, 1) + dt*particle_dposdt(k, 1:3, 1)
                    particle_vel(k, 1:3, 1) = particle_vel(k, 1:3, 1) + dt*particle_dveldt(k, 1:3, 1)
                end if
            end do
            $:END_GPU_PARALLEL_LOOP()

            call s_transfer_data_to_tmp_particles()
            if (moving_lag_particles) call s_enforce_EL_particles_boundary_conditions(q_prim_vf, stage, bc_type)
            if (lag_params%write_void_evol) call s_write_void_evol_particles(mytime)
            if (lag_params%write_bubbles_stats) call s_calculate_lag_particle_stats()
            if (lag_params%write_bubbles) then
                ! $:GPU_UPDATE(host='[gas_p,gas_mv,particle_rad,intfc_vel]')
                $:GPU_UPDATE(host='[particle_rad]')
                call s_write_lag_particle_evol(mytime)
            end if

        elseif (time_stepper == 2) then !
2nd order TVD RK
            if (stage == 1) then

                $:GPU_PARALLEL_LOOP(private='[k]')
                do k = 1, n_el_particles_loc
                    !u{1} = u{n} + dt * RHS{n}  (stored in slot 2; slot 1 keeps u{n})
                    particle_rad(k, 2) = particle_rad(k, 1) + dt*particle_draddt(k, 1)
                    if (moving_lag_particles) then
                        particle_posPrev(k, 1:3, 2) = particle_pos(k, 1:3, 1)
                        particle_pos(k, 1:3, 2) = particle_pos(k, 1:3, 1) + dt*particle_dposdt(k, 1:3, 1)
                        particle_vel(k, 1:3, 2) = particle_vel(k, 1:3, 1) + dt*particle_dveldt(k, 1:3, 1)
                    end if
                end do
                $:END_GPU_PARALLEL_LOOP()

                if (moving_lag_particles) call s_enforce_EL_particles_boundary_conditions(q_prim_vf, stage, bc_type)

            elseif (stage == 2) then

                $:GPU_PARALLEL_LOOP(private='[k]')
                do k = 1, n_el_particles_loc
                    !u{n+1} = u{n} + (1/2) * dt * (RHS{n} + RHS{1})
                    particle_rad(k, 1) = particle_rad(k, 1) + dt*(particle_draddt(k, 1) + particle_draddt(k, 2))/2._wp
                    if (moving_lag_particles) then
                        particle_posPrev(k, 1:3, 1) = particle_pos(k, 1:3, 2)
                        particle_pos(k, 1:3, 1) = particle_pos(k, 1:3, 1) + dt*(particle_dposdt(k, 1:3, 1) + particle_dposdt(k, 1:3, 2))/2._wp
                        particle_vel(k, 1:3, 1) = particle_vel(k, 1:3, 1) + dt*(particle_dveldt(k, 1:3, 1) + particle_dveldt(k, 1:3, 2))/2._wp
                    end if

                end do
                $:END_GPU_PARALLEL_LOOP()

                call s_transfer_data_to_tmp_particles()
                if (moving_lag_particles) call s_enforce_EL_particles_boundary_conditions(q_prim_vf, stage, bc_type)
                if (lag_params%write_void_evol) call s_write_void_evol_particles(mytime)
                if (lag_params%write_bubbles_stats) call s_calculate_lag_particle_stats()
                if (lag_params%write_bubbles) then
                    $:GPU_UPDATE(host='[particle_rad]')
                    call s_write_lag_particle_evol(mytime)
                end if

            end if

        elseif (time_stepper == 3) then ! 3rd order TVD RK
            if (stage == 1) then

                $:GPU_PARALLEL_LOOP(private='[k]')
                do k = 1, n_el_particles_loc
                    !u{1} = u{n} + dt * RHS{n}
                    particle_rad(k, 2) = particle_rad(k, 1) + dt*particle_draddt(k, 1)
                    if (moving_lag_particles) then
                        particle_posPrev(k, 1:3, 2) = particle_pos(k, 1:3, 1)
                        particle_pos(k, 1:3, 2) = particle_pos(k, 1:3, 1) + dt*particle_dposdt(k, 1:3, 1)
                        particle_vel(k, 1:3, 2) = particle_vel(k, 1:3, 1) + dt*particle_dveldt(k, 1:3, 1)
                    end if
                end do
                $:END_GPU_PARALLEL_LOOP()

                if (moving_lag_particles) call s_enforce_EL_particles_boundary_conditions(q_prim_vf, stage, bc_type)

            elseif (stage == 2) then

                $:GPU_PARALLEL_LOOP(private='[k]')
                do k = 1, n_el_particles_loc
                    !u{2} = u{n} + (1/4) * dt * [RHS{n} + RHS{1}]
                    particle_rad(k, 2) = particle_rad(k, 1) + dt*(particle_draddt(k, 1) + particle_draddt(k, 2))/4._wp
                    if (moving_lag_particles) then
                        particle_posPrev(k, 1:3, 2) = particle_pos(k, 1:3, 2)
                        particle_pos(k, 1:3, 2) = particle_pos(k, 1:3, 1) + dt*(particle_dposdt(k, 1:3, 1) + particle_dposdt(k, 1:3, 2))/4._wp
                        particle_vel(k, 1:3, 2) = particle_vel(k, 1:3, 1) + dt*(particle_dveldt(k, 1:3, 1) + particle_dveldt(k, 1:3, 2))/4._wp
                    end if
                end do
                $:END_GPU_PARALLEL_LOOP()

                if (moving_lag_particles) call s_enforce_EL_particles_boundary_conditions(q_prim_vf, stage, bc_type)

            elseif (stage == 3) then

                $:GPU_PARALLEL_LOOP(private='[k]')
                do k = 1, n_el_particles_loc
                    !u{n+1} = u{n} + (2/3) * dt * [(1/4)* RHS{n} + (1/4)* RHS{1} + RHS{2}]
                    particle_rad(k, 1) = particle_rad(k, 1) + (2._wp/3._wp)*dt*(particle_draddt(k, 1)/4._wp + particle_draddt(k, 2)/4._wp + particle_draddt(k, 3))
                    if (moving_lag_particles) then
                        particle_posPrev(k, 1:3, 1) = particle_pos(k, 1:3, 2)
                        particle_pos(k, 1:3, 1) = particle_pos(k, 1:3, 1) + (2._wp/3._wp)*dt*(particle_dposdt(k, 1:3, 1)/4._wp + particle_dposdt(k, 1:3, 2)/4._wp + particle_dposdt(k, 1:3, 3))
                        particle_vel(k, 1:3, 1) = particle_vel(k, 1:3, 1) + (2._wp/3._wp)*dt*(particle_dveldt(k, 1:3, 1)/4._wp + particle_dveldt(k, 1:3, 2)/4._wp + particle_dveldt(k, 1:3, 3))
                    end if

                end do
                $:END_GPU_PARALLEL_LOOP()

                call s_transfer_data_to_tmp_particles()
                if (moving_lag_particles) call s_enforce_EL_particles_boundary_conditions(q_prim_vf, stage, bc_type)
                if (lag_params%write_void_evol) call s_write_void_evol_particles(mytime)
                if (lag_params%write_bubbles_stats) call s_calculate_lag_particle_stats()
                ! NOTE(review): unlike the RK1/RK2 branches, this one throttles output by
                ! next_write_time — confirm the asymmetry is intentional.
                if (lag_params%write_bubbles .and. mytime >= next_write_time) then
                    $:GPU_UPDATE(host='[particle_mass,particle_rad]')
                    call s_write_lag_particle_evol(mytime)
                    next_write_time = next_write_time + t_save
                end if

            end if

        end if

    end subroutine s_update_lagrange_particles_tdv_rk

    !> This subroutine enforces reflective and wall boundary conditions for EL particles,
    !! transfers particles between MPI ranks, applies periodic wrapping, removes
    !! particles that left the domain, and re-smears the beta field.
    !! @param q_prim_vf Primitive variables
    !! @param nstage Current RK stage (final-stage updates also write slot 1)
    !! @param bc_type Boundary-condition types per direction/side
    impure subroutine s_enforce_EL_particles_boundary_conditions(q_prim_vf, nstage, bc_type)

        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
        type(integer_field), dimension(1:num_dims, 1:2), intent(in) :: bc_type
        integer, intent(in) :: nstage
        real(wp) :: offset

        integer :: k, i, q
        integer :: newBubs, new_idx
        integer, dimension(3) :: cell
        logical :: inc_ghost
        real(wp) :: myR, func_sum
        real(wp), dimension(3) :: myPos, myVel, myForce
        logical :: only_beta

        ! BUG FIX: initialize by assignment instead of declaration-time initializers;
        ! the latter would give these locals the implicit SAVE attribute.
        inc_ghost = .false.
        only_beta = .true.

        call nvtxStartRange("LAG-BC")
        call nvtxStartRange("LAG-BC-DEV2HOST")
        $:GPU_UPDATE(host='[p_owner_rank, particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, &
            & lag_part_id, particle_rad, &
            & particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, &
            & particle_dposdt, particle_dveldt, keep_bubble, n_el_particles_loc, &
            & wrap_bubble_dir, wrap_bubble_loc]')
        call nvtxEndRange

        !
Handle MPI transfer of particles going to another processor's local domain
        if (num_procs > 1) then
            call nvtxStartRange("LAG-BC-TRANSFER-LIST")
            call s_add_particles_to_transfer_list(n_el_particles_loc, particle_pos(:, :, 2), particle_posPrev(:, :, 2), inc_ghost)
            call nvtxEndRange

            call nvtxStartRange("LAG-BC-SENDRECV")
            call s_mpi_sendrecv_solid_particles(p_owner_rank, particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, &
                                                lag_part_id, &
                                                particle_rad, particle_pos, particle_posPrev, particle_vel, &
                                                particle_s, particle_draddt, particle_dposdt, particle_dveldt, lag_num_ts, n_el_particles_loc, &
                                                2)
            call nvtxEndRange
        end if

        ! Push the (possibly grown/shrunk) particle arrays back to the device
        call nvtxStartRange("LAG-BC-HOST2DEV")
        $:GPU_UPDATE(device='[p_owner_rank, particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, &
            & lag_part_id, particle_rad, &
            & particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, &
            & particle_dposdt, particle_dveldt, n_el_particles_loc]')
        call nvtxEndRange

        ! Per-particle wall relocation / periodic-wrap flagging / out-of-domain deletion
        $:GPU_PARALLEL_LOOP(private='[k, cell]',copyin='[nstage]')
        do k = 1, n_el_particles_loc
            keep_bubble(k) = 1
            wrap_bubble_loc(k, :) = 0
            wrap_bubble_dir(k, :) = 0

            ! Relocate particles at solid boundaries and delete particles that leave
            ! buffer regions. eps_overlap*rad keeps the particle center off the wall.
            if (any(bc_x%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) &
                .and. particle_pos(k, 1, 2) < x_cb(-1) + eps_overlap*particle_rad(k, 2)) then
                particle_pos(k, 1, 2) = x_cb(-1) + eps_overlap*particle_rad(k, 2)
                if (nstage == lag_num_ts) then
                    particle_pos(k, 1, 1) = particle_pos(k, 1, 2)
                end if
            elseif (any(bc_x%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) &
                    .and. particle_pos(k, 1, 2) > x_cb(m) - eps_overlap*particle_rad(k, 2)) then
                particle_pos(k, 1, 2) = x_cb(m) - eps_overlap*particle_rad(k, 2)
                if (nstage == lag_num_ts) then
                    particle_pos(k, 1, 1) = particle_pos(k, 1, 2)
                end if
            elseif (bc_x%beg == BC_PERIODIC .and. particle_pos(k, 1, 2) < pcomm_coords(1)%beg .and. &
                    particle_posPrev(k, 1, 2) >= pcomm_coords(1)%beg) then
                ! Crossed the periodic x-begin boundary this step: mark for wrapping
                wrap_bubble_dir(k, 1) = 1
                wrap_bubble_loc(k, 1) = -1
            elseif (bc_x%end == BC_PERIODIC .and. particle_pos(k, 1, 2) > pcomm_coords(1)%end .and. &
                    particle_posPrev(k, 1, 2) <= pcomm_coords(1)%end) then
                wrap_bubble_dir(k, 1) = 1
                wrap_bubble_loc(k, 1) = 1
            elseif (particle_pos(k, 1, 2) >= x_cb(m)) then
                keep_bubble(k) = 0
            elseif (particle_pos(k, 1, 2) < x_cb(-1)) then
                keep_bubble(k) = 0
            end if

            if (any(bc_y%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) &
                .and. particle_pos(k, 2, 2) < y_cb(-1) + eps_overlap*particle_rad(k, 2)) then
                particle_pos(k, 2, 2) = y_cb(-1) + eps_overlap*particle_rad(k, 2)
                if (nstage == lag_num_ts) then
                    particle_pos(k, 2, 1) = particle_pos(k, 2, 2)
                end if
            else if (any(bc_y%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) &
                     .and. particle_pos(k, 2, 2) > y_cb(n) - eps_overlap*particle_rad(k, 2)) then
                particle_pos(k, 2, 2) = y_cb(n) - eps_overlap*particle_rad(k, 2)
                if (nstage == lag_num_ts) then
                    particle_pos(k, 2, 1) = particle_pos(k, 2, 2)
                end if
            elseif (bc_y%beg == BC_PERIODIC .and. particle_pos(k, 2, 2) < pcomm_coords(2)%beg .and. &
                    particle_posPrev(k, 2, 2) >= pcomm_coords(2)%beg) then
                wrap_bubble_dir(k, 2) = 1
                wrap_bubble_loc(k, 2) = -1
            elseif (bc_y%end == BC_PERIODIC .and. particle_pos(k, 2, 2) > pcomm_coords(2)%end .and. &
                    particle_posPrev(k, 2, 2) <= pcomm_coords(2)%end) then
                wrap_bubble_dir(k, 2) = 1
                wrap_bubble_loc(k, 2) = 1
            elseif (particle_pos(k, 2, 2) >= y_cb(n)) then
                keep_bubble(k) = 0
            elseif (particle_pos(k, 2, 2) < y_cb(-1)) then
                keep_bubble(k) = 0
            end if

            if (p > 0) then
                if (any(bc_z%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) &
                    .and. particle_pos(k, 3, 2) < z_cb(-1) + eps_overlap*particle_rad(k, 2)) then
                    particle_pos(k, 3, 2) = z_cb(-1) + eps_overlap*particle_rad(k, 2)
                    if (nstage == lag_num_ts) then
                        particle_pos(k, 3, 1) = particle_pos(k, 3, 2)
                    end if
                else if (any(bc_z%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) &
                         .and. particle_pos(k, 3, 2) > z_cb(p) - eps_overlap*particle_rad(k, 2)) then
                    particle_pos(k, 3, 2) = z_cb(p) - eps_overlap*particle_rad(k, 2)
                    if (nstage == lag_num_ts) then
                        particle_pos(k, 3, 1) = particle_pos(k, 3, 2)
                    end if
                elseif (bc_z%beg == BC_PERIODIC .and. particle_pos(k, 3, 2) < pcomm_coords(3)%beg .and. &
                        particle_posPrev(k, 3, 2) >= pcomm_coords(3)%beg) then
                    wrap_bubble_dir(k, 3) = 1
                    wrap_bubble_loc(k, 3) = -1
                elseif (bc_z%end == BC_PERIODIC .and. particle_pos(k, 3, 2) > pcomm_coords(3)%end .and. &
                        particle_posPrev(k, 3, 2) <= pcomm_coords(3)%end) then
                    wrap_bubble_dir(k, 3) = 1
                    wrap_bubble_loc(k, 3) = 1
                elseif (particle_pos(k, 3, 2) >= z_cb(p)) then
                    keep_bubble(k) = 0
                elseif (particle_pos(k, 3, 2) < z_cb(-1)) then
                    keep_bubble(k) = 0
                end if
            end if

            if (keep_bubble(k) == 1) then
                ! Remove bubbles that are no longer in a liquid
                cell = fd_number - buff_size
                call s_locate_cell(particle_pos(k, 1:3, 2), cell, particle_s(k, 1:3, 2))

                if (q_prim_vf(advxb)%sf(cell(1), cell(2), cell(3)) < (1._wp - lag_params%valmaxvoid)) then
                    keep_bubble(k) = 0
                end if
            end if
        end do
        $:END_GPU_PARALLEL_LOOP()

        if (n_el_particles_loc > 0) then
            call nvtxStartRange("LAG-BC")
            call nvtxStartRange("LAG-BC-DEV2HOST")
            $:GPU_UPDATE(host='[particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, &
                & lag_part_id, particle_rad, &
                & particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, &
                & particle_dposdt, particle_dveldt, keep_bubble, n_el_particles_loc, &
                & wrap_bubble_dir, wrap_bubble_loc]')
            call nvtxEndRange

            ! Compact the particle arrays on the host, dropping deleted particles
            newBubs = 0
            do k = 1, n_el_particles_loc
                if (keep_bubble(k) == 1) then
                    newBubs = newBubs + 1
                    if (newBubs /= k) then
                        call s_copy_lag_particle(newBubs, k)
                        wrap_bubble_dir(newBubs, :) = wrap_bubble_dir(k, :)
                        wrap_bubble_loc(newBubs, :) = wrap_bubble_loc(k, :)
                    end if
                end if
            end do

            n_el_particles_loc = newBubs

            ! Handle periodic wrapping of bubbles on same processor: duplicate the
            ! particle at the wrapped position (shifted by the global domain extent).
            newBubs = 0
            do k = 1, n_el_particles_loc
                if (any(wrap_bubble_dir(k, :) == 1)) then
                    newBubs = newBubs + 1
                    new_idx = n_el_particles_loc + newBubs
                    call s_copy_lag_particle(new_idx, k)
                    do i = 1, num_dims
                        if (wrap_bubble_dir(k, i) == 1) then
                            offset = glb_bounds(i)%end - glb_bounds(i)%beg
                            if (wrap_bubble_loc(k, i) == 1) then
                                do q = 1, 2
                                    particle_pos(new_idx, i, q) = particle_pos(new_idx, i, q) - offset
                                    particle_posPrev(new_idx, i, q) = particle_posPrev(new_idx, i, q) - offset
                                end do
                            else if (wrap_bubble_loc(k, i) == -1) then
                                do q = 1, 2
                                    particle_pos(new_idx, i, q) = particle_pos(new_idx, i, q) + offset
                                    particle_posPrev(new_idx, i, q) = particle_posPrev(new_idx, i, q) + offset
                                end do
                            end if
                        end if
                    end do
                end if
            end do
            ! NOTE(review): n_el_particles_loc is not incremented by newBubs here before
            ! the device update — confirm wrapped duplicates are accounted for downstream.
            call nvtxStartRange("LAG-BC-HOST2DEV")
            $:GPU_UPDATE(device='[particle_R0, Rmax_stats_part, Rmin_stats_part, particle_mass, f_p, &
                & lag_part_id, particle_rad, &
                & particle_pos, particle_posPrev, particle_vel, particle_s, particle_draddt, &
                & particle_dposdt, particle_dveldt, n_el_particles_loc]')
            call nvtxEndRange
        end if

        call s_reset_cell_vars()

        ! Re-smear each particle onto the grid (beta field only)
        $:GPU_PARALLEL_LOOP(private='[cell,myR,myPos,myVel,myForce,func_sum]',copyin='[only_beta]')
        do k = 1, n_el_particles_loc
            myR = particle_rad(k, 2)
            myPos = particle_pos(k, 1:3, 2)
            myVel = particle_vel(k, 1:3, 2)
            myForce = f_p(k, :)

            cell = fd_number - buff_size
            call s_locate_cell(particle_pos(k, 1:3, 2), cell, particle_s(k, 1:3, 2))

            !Compute the total gaussian contribution for each particle for normalization
            call s_compute_gaussian_contribution(myR, myPos, cell, func_sum)
            gSum(k) = func_sum

            call s_gaussian_atomic(myR, myVel, myPos, myForce, func_sum, cell, q_particles, only_beta)

        end do

        ! Update void fraction and communicate buffers
        call s_finalize_beta_field(bc_type, only_beta)

        call nvtxEndRange ! LAG-BC

    end subroutine s_enforce_EL_particles_boundary_conditions

    !> This subroutine returns the computational coordinate of the cell for the given position.
    !! @param pos Input coordinates
    !! @param cell Computational coordinate of the cell (used as the search starting point on entry)
    !! @param scoord Calculated particle coordinates
    subroutine s_locate_cell(pos, cell, scoord)
        $:GPU_ROUTINE(function_name='s_locate_cell',parallelism='[seq]', &
            & cray_inline=True)

        real(wp), dimension(3), intent(in) :: pos
        real(wp), dimension(3), intent(out) :: scoord
        integer, dimension(3), intent(inout) :: cell

        integer :: i

        ! Walk the cell index until pos falls inside [x_cb(cell-1), x_cb(cell))
        do while (pos(1) < x_cb(cell(1) - 1))
            cell(1) = cell(1) - 1
        end do

        do while (pos(1) >= x_cb(cell(1)))
            cell(1) = cell(1) + 1
        end do

        do while (pos(2) < y_cb(cell(2) - 1))
            cell(2) = cell(2) - 1
        end do

        do while (pos(2) >= y_cb(cell(2)))
            cell(2) = cell(2) + 1
        end do

        if (p > 0) then
            do while (pos(3) < z_cb(cell(3) - 1))
                cell(3) = cell(3) - 1
            end do
            do while (pos(3) >= z_cb(cell(3)))
                cell(3) = cell(3) + 1
            end do
        end if

        ! The numbering of the cell of which left boundary is the domain boundary is 0.
        ! if comp.coord of the pos is s, the real coordinate of s is
        ! (the coordinate of the left boundary of the Floor(s)-th cell)
        ! + (s-(int(s))*(cell-width).
        ! In other words, the coordinate of the center of the cell is x_cc(cell).

        !coordinates in computational space
        scoord(1) = cell(1) + (pos(1) - x_cb(cell(1) - 1))/dx(cell(1))
        scoord(2) = cell(2) + (pos(2) - y_cb(cell(2) - 1))/dy(cell(2))
        scoord(3) = 0._wp
        if (p > 0) scoord(3) = cell(3) + (pos(3) - z_cb(cell(3) - 1))/dz(cell(3))
        ! floor(scoord), correcting int()'s truncation toward zero for negative values
        cell(:) = int(scoord(:))
        do i = 1, num_dims
            if (scoord(i) < 0._wp) cell(i) = cell(i) - 1
        end do

    end subroutine s_locate_cell

    !> Copies the current (slot-1) particle state into the temporary slot-2 storage.
impure subroutine s_transfer_data_to_tmp_particles()

        integer :: k

        ! Slot 2 is the scratch/stage copy used by the RK sub-steps and BC handling
        $:GPU_PARALLEL_LOOP(private='[k]')
        do k = 1, n_el_particles_loc
            particle_rad(k, 2) = particle_rad(k, 1)
            particle_pos(k, 1:3, 2) = particle_pos(k, 1:3, 1)
            particle_posPrev(k, 1:3, 2) = particle_posPrev(k, 1:3, 1)
            particle_vel(k, 1:3, 2) = particle_vel(k, 1:3, 1)
            particle_s(k, 1:3, 2) = particle_s(k, 1:3, 1)
        end do
        $:END_GPU_PARALLEL_LOOP()

    end subroutine s_transfer_data_to_tmp_particles

    !> The purpose of this procedure is to determine if the global coordinates of the bubbles
    !! are present in the current MPI processor (including ghost cells).
    !! @param pos_part Spatial coordinates of the bubble
    function particle_in_domain(pos_part)

        logical :: particle_in_domain
        real(wp), dimension(3), intent(in) :: pos_part

        ! 2D
        if (p == 0 .and. cyl_coord .neqv. .true.) then
            ! Defining a virtual z-axis that has the same dimensions as y-axis
            ! defined in the input file
            particle_in_domain = ((pos_part(1) < x_cb(m + buff_size - fd_number)) .and. &
                                  (pos_part(1) >= x_cb(fd_number - buff_size - 1)) .and. &
                                  (pos_part(2) < y_cb(n + buff_size - fd_number)) .and. &
                                  (pos_part(2) >= y_cb(fd_number - buff_size - 1)) .and. &
                                  (pos_part(3) < lag_params%charwidth/2._wp) .and. (pos_part(3) > -lag_params%charwidth/2._wp))
        else
            ! cyl_coord: radial coordinate uses |y|, clipped below at the axis
            particle_in_domain = ((pos_part(1) < x_cb(m + buff_size - fd_number)) .and. &
                                  (pos_part(1) >= x_cb(fd_number - buff_size - 1)) .and. &
                                  (abs(pos_part(2)) < y_cb(n + buff_size - fd_number)) .and. &
                                  (abs(pos_part(2)) >= max(y_cb(fd_number - buff_size - 1), 0._wp)))
        end if

        ! 3D
        ! NOTE(review): this branch tests p > 1 while every other 3-D guard in this file
        ! uses p > 0 — confirm p == 1 cases are intentionally excluded here.
        if (p > 1) then
            particle_in_domain = ((pos_part(1) < x_cb(m + buff_size - fd_number)) .and. &
                                  (pos_part(1) >= x_cb(fd_number - buff_size - 1)) .and. &
                                  (pos_part(2) < y_cb(n + buff_size - fd_number)) .and. &
                                  (pos_part(2) >= y_cb(fd_number - buff_size - 1)) .and. &
                                  (pos_part(3) < z_cb(p + buff_size - fd_number)) .and. &
                                  (pos_part(3) >= z_cb(fd_number - buff_size - 1)))
        end if

        ! For symmetric and wall boundary condition: particles may not sit in ghost layers
        if (any(bc_x%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then
            particle_in_domain = (particle_in_domain .and. (pos_part(1) >= x_cb(-1)))
        end if
        if (any(bc_x%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then
            particle_in_domain = (particle_in_domain .and. (pos_part(1) < x_cb(m)))
        end if
        if (any(bc_y%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. (.not. cyl_coord)) then
            particle_in_domain = (particle_in_domain .and. (pos_part(2) >= y_cb(-1)))
        end if
        if (any(bc_y%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/)) .and. (.not. cyl_coord)) then
            particle_in_domain = (particle_in_domain .and. (pos_part(2) < y_cb(n)))
        end if
        if (p > 0) then
            if (any(bc_z%beg == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then
                particle_in_domain = (particle_in_domain .and. (pos_part(3) >= z_cb(-1)))
            end if
            if (any(bc_z%end == (/BC_REFLECTIVE, BC_CHAR_SLIP_WALL, BC_SLIP_WALL, BC_NO_SLIP_WALL/))) then
                particle_in_domain = (particle_in_domain .and. (pos_part(3) < z_cb(p)))
            end if
        end if

    end function particle_in_domain

    !> The purpose of this procedure is to determine if the lagrangian bubble is located in the
    !! physical domain. The ghost cells are not part of the physical domain.
    !! @param pos_part Spatial coordinates of the bubble
    function particle_in_domain_physical(pos_part)

        logical :: particle_in_domain_physical
        real(wp), dimension(3), intent(in) :: pos_part

        particle_in_domain_physical = ((pos_part(1) < x_cb(m)) .and. (pos_part(1) >= x_cb(-1)) .and. &
                                       (pos_part(2) < y_cb(n)) .and. (pos_part(2) >= y_cb(-1)))

        if (p > 0) then
            particle_in_domain_physical = (particle_in_domain_physical .and. (pos_part(3) < z_cb(p)) .and. (pos_part(3) >= z_cb(-1)))
        end if

    end function particle_in_domain_physical

    !> The purpose of this procedure is to calculate the gradient from reconstructed states along the x, y and z
    !! @param vL_field left edge reconstructed values
    !! @param vR_field right edge reconstructed values
    !! @param dq Output gradient of q
    !! @param dir Gradient spatial direction
    !! @param field_var variable index for reconstructed states
    subroutine s_gradient_field(vL_field, vR_field, dq, dir, field_var)

        real(stp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:), intent(out) :: dq
        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: vL_field
        real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: vR_field
        integer, intent(in) :: dir, field_var

        integer :: i, j, k
        real(wp) :: mydx

        if (dir == 1) then

            $:GPU_PARALLEL_LOOP(private='[i,j,k,mydx]', collapse=3,copyin='[dir, field_var]')
            do k = idwbuff(3)%beg, idwbuff(3)%end
                do j = idwbuff(2)%beg, idwbuff(2)%end
                    do i = idwbuff(1)%beg, idwbuff(1)%end
                        mydx = dx(i)
                        dq(i, j, k) = (vR_field(i, j, k, field_var) - vL_field(i, j, k, field_var))/mydx
                    end do
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()

        elseif (dir == 2) then

            ! NOTE(review): for dir 2/3 the reconstructed fields appear to be stored with
            ! sweep-swapped indices (read at (i,j,k), written to dq with axes swapped) —
            ! assumed to match the reconstruction layout; confirm against the caller.
            $:GPU_PARALLEL_LOOP(private='[i,j,k,mydx]', collapse=3,copyin='[dir, field_var]')
            do k = idwbuff(3)%beg, idwbuff(3)%end
                do j = idwbuff(1)%beg, idwbuff(1)%end
                    do i = idwbuff(2)%beg, idwbuff(2)%end
                        mydx = dy(i)
                        dq(j, i, k) = (vR_field(i, j, k, field_var) - vL_field(i, j, k, field_var))/mydx
                    end do
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()

        elseif (dir == 3) then

            $:GPU_PARALLEL_LOOP(private='[i,j,k,mydx]', collapse=3,copyin='[dir, field_var]')
            do k = idwbuff(1)%beg, idwbuff(1)%end
                do j = idwbuff(2)%beg, idwbuff(2)%end
                    do i = idwbuff(3)%beg, idwbuff(3)%end
                        mydx = dz(i)
                        dq(k, j, i) = (vR_field(i, j, k, field_var) - vL_field(i, j, k, field_var))/mydx
                    end
do
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()

        end if

    end subroutine s_gradient_field

    !> The purpose of this procedure is to calculate the gradient of a scalar field along the x, y and z directions using Fornberg's method
    !! @param q Input scalar field
    !! @param dq Output gradient of q
    !! @param dir Gradient spatial direction
    subroutine s_gradient_dir_fornberg(q, dq, dir)

        real(stp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:), intent(in) :: q
        real(stp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:), intent(out) :: dq
        integer, intent(in) :: dir

        integer :: i, j, k, a, npts, s_idx

        ! Half-width of the centered stencil; weights were precomputed over
        ! idwbuff%beg+npts .. idwbuff%end-npts in s_compute_fornberg_fd_weights.
        npts = (nWeights_grad - 1)/2

        ! FIX: loop margins previously hard-coded '2' (valid only for nWeights_grad == 5);
        ! use npts so the bounds stay consistent with the precomputed weight range.
        if (dir == 1) then
            $:GPU_PARALLEL_LOOP(private='[i,j,k,s_idx,a]', collapse=3,copyin='[npts]')
            do k = idwbuff(3)%beg, idwbuff(3)%end
                do j = idwbuff(2)%beg, idwbuff(2)%end
                    do i = idwbuff(1)%beg + npts, idwbuff(1)%end - npts
                        dq(i, j, k) = 0._wp
                        do a = -npts, npts
                            s_idx = a + npts + 1
                            dq(i, j, k) = dq(i, j, k) + weights_x_grad(s_idx)%sf(i, 1, 1)*q(i + a, j, k)
                        end do
                    end do
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()
        elseif (dir == 2) then
            $:GPU_PARALLEL_LOOP(private='[i,j,k,s_idx,a]', collapse=3,copyin='[npts]')
            do k = idwbuff(3)%beg, idwbuff(3)%end
                do j = idwbuff(2)%beg + npts, idwbuff(2)%end - npts
                    do i = idwbuff(1)%beg, idwbuff(1)%end
                        dq(i, j, k) = 0._wp
                        do a = -npts, npts
                            s_idx = a + npts + 1
                            dq(i, j, k) = dq(i, j, k) + weights_y_grad(s_idx)%sf(j, 1, 1)*q(i, j + a, k)
                        end do
                    end do
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()
        elseif (dir == 3) then
            $:GPU_PARALLEL_LOOP(private='[i,j,k,s_idx,a]', collapse=3,copyin='[npts]')
            do k = idwbuff(3)%beg + npts, idwbuff(3)%end - npts
                do j = idwbuff(2)%beg, idwbuff(2)%end
                    do i = idwbuff(1)%beg, idwbuff(1)%end
                        dq(i, j, k) = 0._wp
                        do a = -npts, npts
                            s_idx = a + npts + 1
                            dq(i, j, k) = dq(i, j, k) + weights_z_grad(s_idx)%sf(k, 1, 1)*q(i, j, k + a)
                        end do
                    end do
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()
        end if

    end subroutine s_gradient_dir_fornberg

    !> The purpose of this procedure is to compute the Fornberg finite difference weights for derivatives (only done once at start time)
    !! @param npts Half-width of the centered stencil
    impure subroutine s_compute_fornberg_fd_weights(npts)

        integer, intent(in) :: npts
        integer :: i, j, k, a, m_order
        integer :: s_idx
        real(wp) :: x0, y0, z0
        real(wp) :: x_stencil(nWeights_grad)
        real(wp) :: c(nWeights_grad, 0:1)

        m_order = 1 ! first derivative

        ! NOTE(review): these parallel regions call s_fornberg_weights, whose GPU routine
        ! directive is commented out below — confirm device compilation of this call.
        $:GPU_PARALLEL_LOOP(private='[i,a,x_stencil,c,s_idx,x0]', copyin='[npts,m_order]')
        do i = idwbuff(1)%beg + npts, idwbuff(1)%end - npts
            do a = -npts, npts
                s_idx = a + npts + 1
                x_stencil(s_idx) = x_cc(i + a)
            end do
            x0 = x_cc(i)

            call s_fornberg_weights(x0, x_stencil, nWeights_grad, m_order, c)

            do a = -npts, npts
                s_idx = a + npts + 1
                weights_x_grad(s_idx)%sf(i, 1, 1) = c(s_idx, 1)
            end do
        end do
        $:END_GPU_PARALLEL_LOOP()

        $:GPU_PARALLEL_LOOP(private='[j,a,x_stencil,c,s_idx,y0]', copyin='[npts,m_order]')
        do j = idwbuff(2)%beg + npts, idwbuff(2)%end - npts
            do a = -npts, npts
                s_idx = a + npts + 1
                x_stencil(s_idx) = y_cc(j + a)
            end do
            y0 = y_cc(j)

            call s_fornberg_weights(y0, x_stencil, nWeights_grad, m_order, c)

            do a = -npts, npts
                s_idx = a + npts + 1
                weights_y_grad(s_idx)%sf(j, 1, 1) = c(s_idx, 1)
            end do
        end do
        $:END_GPU_PARALLEL_LOOP()

        if (num_dims == 3) then

            $:GPU_PARALLEL_LOOP(private='[k,a,x_stencil,c,s_idx,z0]', copyin='[npts,m_order]')
            do k = idwbuff(3)%beg + npts, idwbuff(3)%end - npts
                do a = -npts, npts
                    s_idx = a + npts + 1
                    x_stencil(s_idx) = z_cc(k + a)
                end do
                z0 = z_cc(k)

                call s_fornberg_weights(z0, x_stencil, nWeights_grad, m_order, c)

                do a = -npts, npts
                    s_idx = a + npts + 1
                    weights_z_grad(s_idx)%sf(k, 1, 1) = c(s_idx, 1)
                end do
            end do
            $:END_GPU_PARALLEL_LOOP()

        end if

    end subroutine s_compute_fornberg_fd_weights

    !> The purpose of this procedure is to compute the Fornberg finite difference weights on a local stencil
    !! (Fornberg 1988, "Generation of finite difference formulas on arbitrarily spaced grids").
    !! @param x0 Evaluation point
    !! @param stencil Stencil coordinates
    !! @param npts Number of stencil points
    !! @param m_order Highest derivative order
    !! @param coeffs Weights; coeffs(:, d) are the weights for the d-th derivative
    subroutine s_fornberg_weights(x0, stencil, npts, m_order, coeffs)
        ! $:GPU_ROUTINE(parallelism='[seq]')

        integer, intent(in) :: npts ! number of stencil points
        integer, intent(in) :: m_order ! highest derivative order
        real(wp), intent(in) :: x0 ! evaluation point
        real(wp), intent(in) :: stencil(npts) ! stencil coordinates
        real(wp), intent(out) :: coeffs(npts, 0:m_order)

        integer :: i, j, k, mn
        real(wp) :: c1, c2, c3, c4, c5

        coeffs = 0.0_wp
        c1 = 1.0_wp
        c4 = stencil(1) - x0
        coeffs(1, 0) = 1.0_wp

        do i = 2, npts
            mn = min(i - 1, m_order)
            c2 = 1.0_wp
            c5 = c4
            c4 = stencil(i) - x0

            do j = 1, i - 1
                c3 = stencil(i) - stencil(j)
                c2 = c2*c3

                if (j == i - 1) then
                    do k = mn, 1, -1
                        coeffs(i, k) = c1*(k*coeffs(i - 1, k - 1) - c5*coeffs(i - 1, k))/c2
                    end do
                    coeffs(i, 0) = -c1*c5*coeffs(i - 1, 0)/c2
                end if

                do k = mn, 1, -1
                    coeffs(j, k) = (c4*coeffs(j, k) - k*coeffs(j, k - 1))/c3
                end do
                coeffs(j, 0) = c4*coeffs(j, 0)/c3
            end do

            c1 = c2
        end do

    end subroutine s_fornberg_weights

    !> The purpose of this procedure is to compute the barycentric weights for interpolation (only done once at start time)
    !! @param npts Half-width of the centered stencil
    impure subroutine s_compute_barycentric_weights(npts)

        integer, intent(in) :: npts
        integer :: i, j, k, l, a, b
        real(wp) :: prod_x, prod_y, prod_z, dx_loc, dy_loc, dz_loc

        ! w_a = 1 / prod_{b /= a} (x_a - x_b) for each point of the local stencil
        $:GPU_PARALLEL_LOOP(private='[i,a,b,prod_x,dx_loc]', copyin = '[npts]')
        do i = idwbuff(1)%beg + npts, idwbuff(1)%end - npts
            do a = -npts, npts
                prod_x = 1._wp
                do b = -npts, npts
                    if (a /= b) then
                        dx_loc = x_cc(i + a) - x_cc(i + b)
                        prod_x = prod_x*dx_loc
                    end if
                end do
                weights_x_interp(a + npts + 1)%sf(i, 1, 1) = 1._wp/prod_x
            end do
        end do
        $:END_GPU_PARALLEL_LOOP()

        $:GPU_PARALLEL_LOOP(private='[j,a,b,prod_y,dy_loc]', copyin = '[npts]')
        do j = idwbuff(2)%beg + npts, idwbuff(2)%end - npts
            do a = -npts, npts
                prod_y = 1._wp
                do b = -npts, npts
                    if (a /= b) then
                        dy_loc =
y_cc(j + a) - y_cc(j + b) + prod_y = prod_y*dy_loc + end if + end do + weights_y_interp(a + npts + 1)%sf(j, 1, 1) = 1._wp/prod_y + end do + end do + $:END_GPU_PARALLEL_LOOP() + + if (num_dims == 3) then + $:GPU_PARALLEL_LOOP(private='[k,a,b,prod_z,dz_loc]', copyin = '[npts]') + do k = idwbuff(3)%beg + npts, idwbuff(3)%end - npts + do a = -npts, npts + prod_z = 1._wp + do b = -npts, npts + if (a /= b) then + dz_loc = z_cc(k + a) - z_cc(k + b) + prod_z = prod_z*dz_loc + end if + end do + weights_z_interp(a + npts + 1)%sf(k, 1, 1) = 1._wp/prod_z + end do + end do + end if + + end subroutine s_compute_barycentric_weights + + impure subroutine s_open_lag_bubble_evol + + character(LEN=path_len + 2*name_len) :: file_loc + logical file_exist + character(LEN=25) :: FMT + + write (file_loc, '(A,I0,A)') 'lag_bubble_evol_', proc_rank, '.dat' + file_loc = trim(case_dir)//'/D/'//trim(file_loc) + call my_inquire(trim(file_loc), file_exist) + + if (precision == 1) then + FMT = "(A16,A14,8A16)" + else + FMT = "(A24,A14,8A24)" + end if + + if (.not. file_exist) then + open (LAG_EVOL_ID, FILE=trim(file_loc), FORM='formatted', position='rewind') + write (LAG_EVOL_ID, FMT) 'currentTime', 'particleID', 'x', 'y', 'z', & + 'Vx', 'Vy', 'Vz', 'Fp_x', 'Fp_y', 'Fp_z', & + 'radius' + else + open (LAG_EVOL_ID, FILE=trim(file_loc), FORM='formatted', position='append') + end if + + end subroutine s_open_lag_bubble_evol + + !> Subroutine that writes on each time step the changes of the lagrangian bubbles. + !! @param q_time Current time + impure subroutine s_write_lag_particle_evol(qtime) + + real(wp), intent(in) :: qtime + integer :: k, ios + character(LEN=25) :: FMT + + character(LEN=path_len + 2*name_len) :: file_loc, path + logical :: file_exist + + if (precision == 1) then + ! FMT = "(F16.8,I14,8F16.8)" + FMT = "(F16.8,I14,10F16.8)" + else + ! FMT = "(F24.16,I14,8F24.16)" + FMT = "(F24.16,I14,10F24.16)" + end if + + ! 
Cycle through list + do k = 1, n_el_particles_loc + write (LAG_EVOL_ID, FMT) & + qtime, & + lag_part_id(k, 1), & + particle_pos(k, 1, 1), & + particle_pos(k, 2, 1), & + particle_pos(k, 3, 1), & + particle_vel(k, 1, 1), & + particle_vel(k, 2, 1), & + particle_vel(k, 3, 1), & + f_p(k, 1), & + f_p(k, 2), & + f_p(k, 3), & + particle_rad(k, 1) + end do + + end subroutine s_write_lag_particle_evol + + impure subroutine s_close_lag_particle_evol + + close (LAG_EVOL_ID) + + end subroutine s_close_lag_particle_evol + + subroutine s_open_void_evol + + character(LEN=path_len + 2*name_len) :: file_loc + logical :: file_exist + + if (proc_rank == 0) then + write (file_loc, '(A)') 'voidfraction.dat' + file_loc = trim(case_dir)//'/D/'//trim(file_loc) + call my_inquire(trim(file_loc), file_exist) + if (.not. file_exist) then + open (LAG_VOID_ID, FILE=trim(file_loc), FORM='formatted', position='rewind') + !write (12, *) 'currentTime, averageVoidFraction, ', & + ! 'maximumVoidFraction, totalParticlesVolume' + !write (12, *) 'The averageVoidFraction value does ', & + ! 'not reflect the real void fraction in the cloud since the ', & + ! 'cells which do not have bubbles are not accounted' + else + open (LAG_VOID_ID, FILE=trim(file_loc), FORM='formatted', position='append') + end if + end if + + end subroutine s_open_void_evol + + !> Subroutine that writes some useful statistics related to the volume fraction + !! of the particles (void fraction) in the computatioational domain + !! on each time step. + !! 
@param q_time Current time + impure subroutine s_write_void_evol_particles(qtime) + + real(wp), intent(in) :: qtime + real(wp) :: volcell, voltot + real(wp) :: lag_void_max, lag_void_avg, lag_vol + real(wp) :: void_max_glb, void_avg_glb, vol_glb + + integer :: i, j, k + + character(LEN=path_len + 2*name_len) :: file_loc + logical :: file_exist + + lag_void_max = 0._wp + lag_void_avg = 0._wp + lag_vol = 0._wp + $:GPU_PARALLEL_LOOP(private='[volcell]', collapse=3, reduction='[[lag_vol, lag_void_avg], [lag_void_max]]', reductionOp='[+, MAX]', copy='[lag_vol, lag_void_avg, lag_void_max]') + do k = 0, p + do j = 0, n + do i = 0, m + lag_void_max = max(lag_void_max, 1._wp - q_particles(alphaf_id)%sf(i, j, k)) + call s_get_char_vol(i, j, k, volcell) + if ((1._wp - q_particles(alphaf_id)%sf(i, j, k)) > 5.0d-11) then + lag_void_avg = lag_void_avg + (1._wp - q_particles(alphaf_id)%sf(i, j, k))*volcell + lag_vol = lag_vol + volcell + end if + end do + end do + end do + $:END_GPU_PARALLEL_LOOP() + +#ifdef MFC_MPI + if (num_procs > 1) then + call s_mpi_allreduce_max(lag_void_max, void_max_glb) + lag_void_max = void_max_glb + call s_mpi_allreduce_sum(lag_vol, vol_glb) + lag_vol = vol_glb + call s_mpi_allreduce_sum(lag_void_avg, void_avg_glb) + lag_void_avg = void_avg_glb + end if +#endif + voltot = lag_void_avg + ! This voidavg value does not reflect the real void fraction in the cloud + ! since the cell which does not have bubbles are not accounted + if (lag_vol > 0._wp) lag_void_avg = lag_void_avg/lag_vol + + if (proc_rank == 0) then + write (LAG_VOID_ID, '(6X,4e24.8)') & + qtime, & + lag_void_avg, & + lag_void_max, & + voltot + end if + + end subroutine s_write_void_evol_particles + + subroutine s_close_void_evol + + if (proc_rank == 0) close (LAG_VOID_ID) + + end subroutine s_close_void_evol + + !> Subroutine that writes the restarting files for the particles in the lagrangian solver. + !! 
@param t_step Current time step + impure subroutine s_write_restart_lag_particles(t_step) + + ! Generic string used to store the address of a particular file + integer, intent(in) :: t_step + + character(LEN=path_len + 2*name_len) :: file_loc + logical :: file_exist + integer :: part_id, tot_part + integer :: i, k + +#ifdef MFC_MPI + ! For Parallel I/O + integer :: ifile, ierr + integer, dimension(MPI_STATUS_SIZE) :: status + integer(KIND=MPI_OFFSET_KIND) :: disp + integer :: view + integer, dimension(2) :: gsizes, lsizes, start_idx_part + integer, dimension(num_procs) :: part_order, part_ord_mpi + integer, dimension(num_procs) :: proc_particle_counts + real(wp), dimension(1:1, 1:lag_io_vars) :: dummy + dummy = 0._wp + + part_id = 0._wp + if (n_el_particles_loc /= 0) then + do k = 1, n_el_particles_loc + if (particle_in_domain_physical(particle_pos(k, 1:3, 1))) then + part_id = part_id + 1 + end if + end do + end if + + if (.not. parallel_io) return + + lsizes(1) = part_id + lsizes(2) = lag_io_vars + + ! Total number of particles + call MPI_ALLREDUCE(part_id, tot_part, 1, MPI_integer, & + MPI_SUM, MPI_COMM_WORLD, ierr) + + call MPI_ALLGATHER(part_id, 1, MPI_INTEGER, proc_particle_counts, 1, MPI_INTEGER, & + MPI_COMM_WORLD, ierr) + + ! Calculate starting index for this processor's particles + call MPI_EXSCAN(lsizes(1), start_idx_part(1), 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr) + if (proc_rank == 0) start_idx_part(1) = 0 + start_idx_part(2) = 0 + + gsizes(1) = tot_part + gsizes(2) = lag_io_vars + + write (file_loc, '(A,I0,A)') 'lag_bubbles_', t_step, '.dat' + file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc) + + ! 
Clean up existing file + if (proc_rank == 0) then + inquire (FILE=trim(file_loc), EXIST=file_exist) + if (file_exist) then + call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr) + end if + end if + + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + + if (proc_rank == 0) then + call MPI_FILE_OPEN(MPI_COMM_SELF, file_loc, & + ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), & + mpi_info_int, ifile, ierr) + + ! Write header using MPI I/O for consistency + call MPI_FILE_WRITE(ifile, tot_part, 1, MPI_INTEGER, status, ierr) + call MPI_FILE_WRITE(ifile, mytime, 1, mpi_p, status, ierr) + call MPI_FILE_WRITE(ifile, dt, 1, mpi_p, status, ierr) + call MPI_FILE_WRITE(ifile, num_procs, 1, MPI_INTEGER, status, ierr) + call MPI_FILE_WRITE(ifile, proc_particle_counts, num_procs, MPI_INTEGER, status, ierr) + + call MPI_FILE_CLOSE(ifile, ierr) + end if + + call MPI_BARRIER(MPI_COMM_WORLD, ierr) + + if (part_id > 0) then + allocate (MPI_IO_DATA_lag_bubbles(max(1, part_id), 1:lag_io_vars)) + + i = 1 + do k = 1, n_el_particles_loc + if (particle_in_domain_physical(particle_pos(k, 1:3, 1))) then + MPI_IO_DATA_lag_bubbles(i, 1) = real(lag_part_id(k, 1)) + MPI_IO_DATA_lag_bubbles(i, 2:4) = particle_pos(k, 1:3, 1) + MPI_IO_DATA_lag_bubbles(i, 5:7) = particle_posPrev(k, 1:3, 1) + MPI_IO_DATA_lag_bubbles(i, 8:10) = particle_vel(k, 1:3, 1) + MPI_IO_DATA_lag_bubbles(i, 11) = particle_rad(k, 1) + ! MPI_IO_DATA_lag_bubbles(i, 12) = intfc_vel(k, 1) + MPI_IO_DATA_lag_bubbles(i, 13) = particle_R0(k) + MPI_IO_DATA_lag_bubbles(i, 14) = Rmax_stats_part(k) + MPI_IO_DATA_lag_bubbles(i, 15) = Rmin_stats_part(k) + ! MPI_IO_DATA_lag_bubbles(i, 16) = bub_dphidt(k) + ! MPI_IO_DATA_lag_bubbles(i, 17) = gas_p(k, 1) + ! MPI_IO_DATA_lag_bubbles(i, 18) = gas_mv(k, 1) + MPI_IO_DATA_lag_bubbles(i, 19) = particle_mass(k) + ! MPI_IO_DATA_lag_bubbles(i, 20) = gas_betaT(k) + ! 
MPI_IO_DATA_lag_bubbles(i, 21) = gas_betaC(k) + i = i + 1 + end if + end do + + call MPI_TYPE_CREATE_SUBARRAY(2, gsizes, lsizes, start_idx_part, & + MPI_ORDER_FORTRAN, mpi_p, view, ierr) + call MPI_TYPE_COMMIT(view, ierr) + + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, & + ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), & + mpi_info_int, ifile, ierr) + + ! Skip header (written by rank 0) + disp = int(sizeof(tot_part) + 2*sizeof(mytime) + sizeof(num_procs) + & + num_procs*sizeof(proc_particle_counts(1)), MPI_OFFSET_KIND) + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, view, 'native', mpi_info_int, ierr) + + call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA_lag_bubbles, & + lag_io_vars*part_id, mpi_p, status, ierr) + + call MPI_FILE_CLOSE(ifile, ierr) + + deallocate (MPI_IO_DATA_lag_bubbles) + + else + call MPI_TYPE_CONTIGUOUS(0, mpi_p, view, ierr) + call MPI_TYPE_COMMIT(view, ierr) + + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, & + ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), & + mpi_info_int, ifile, ierr) + + ! Skip header (written by rank 0) + disp = int(sizeof(tot_part) + 2*sizeof(mytime) + sizeof(num_procs) + & + num_procs*sizeof(proc_particle_counts(1)), MPI_OFFSET_KIND) + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, view, 'native', mpi_info_int, ierr) + + call MPI_FILE_WRITE_ALL(ifile, dummy, 0, mpi_p, status, ierr) + + call MPI_FILE_CLOSE(ifile, ierr) + end if + +#endif + + end subroutine s_write_restart_lag_particles + + !> This procedure calculates the maximum and minimum radius of each bubble. 
+ subroutine s_calculate_lag_particle_stats() + + integer :: k + + $:GPU_PARALLEL_LOOP(private='[k]', reduction='[[Rmax_glb], [Rmin_glb]]', & + & reductionOp='[MAX, MIN]', copy='[Rmax_glb,Rmin_glb]') + do k = 1, n_el_particles_loc + Rmax_glb = max(Rmax_glb, particle_rad(k, 1)/particle_R0(k)) + Rmin_glb = min(Rmin_glb, particle_rad(k, 1)/particle_R0(k)) + Rmax_stats_part(k) = max(Rmax_stats_part(k), particle_rad(k, 1)/particle_R0(k)) + Rmin_stats_part(k) = min(Rmin_stats_part(k), particle_rad(k, 1)/particle_R0(k)) + end do + $:END_GPU_PARALLEL_LOOP() + + end subroutine s_calculate_lag_particle_stats + + impure subroutine s_open_lag_particle_stats() + + character(LEN=path_len + 2*name_len) :: file_loc + character(LEN=20) :: FMT + logical :: file_exist + + write (file_loc, '(A,I0,A)') 'stats_lag_bubbles_', proc_rank, '.dat' + file_loc = trim(case_dir)//'/D/'//trim(file_loc) + call my_inquire(trim(file_loc), file_exist) + + if (precision == 1) then + FMT = "(A10,A14,5A16)" + else + FMT = "(A10,A14,5A24)" + end if + + if (.not. file_exist) then + open (LAG_STATS_ID, FILE=trim(file_loc), FORM='formatted', position='rewind') + write (LAG_STATS_ID, *) 'proc_rank, particleID, x, y, z, Rmax_glb, Rmin_glb' + else + open (LAG_STATS_ID, FILE=trim(file_loc), FORM='formatted', position='append') + end if + + end subroutine s_open_lag_particle_stats + + !> Subroutine that writes the maximum and minimum radius of each bubble. 
+ impure subroutine s_write_lag_particle_stats() + + integer :: k + character(LEN=path_len + 2*name_len) :: file_loc + character(LEN=20) :: FMT + + $:GPU_UPDATE(host='[Rmax_glb,Rmin_glb]') + + if (precision == 1) then + FMT = "(I10,I14,5F16.8)" + else + FMT = "(I10,I14,5F24.16)" + end if + + do k = 1, n_el_particles_loc + write (LAG_STATS_ID, FMT) & + proc_rank, & + lag_part_id(k, 1), & + particle_pos(k, 1, 1), & + particle_pos(k, 2, 1), & + particle_pos(k, 3, 1), & + Rmax_stats_part(k), & + Rmin_stats_part(k) + end do + + end subroutine s_write_lag_particle_stats + + subroutine s_close_lag_particle_stats + + close (LAG_STATS_ID) + + end subroutine s_close_lag_particle_stats + + !> The purpose of this subroutine is to remove one specific particle if dt is too small. + !! @param part_id Particle id + impure subroutine s_copy_lag_particle(dest, src) + + integer, intent(in) :: src, dest + + particle_R0(dest) = particle_R0(src) + Rmax_stats_part(dest) = Rmax_stats_part(src) + Rmin_stats_part(dest) = Rmin_stats_part(src) + particle_mass(dest) = particle_mass(src) + lag_part_id(dest, 1) = lag_part_id(src, 1) + particle_rad(dest, 1:2) = particle_rad(src, 1:2) + particle_vel(dest, 1:3, 1:2) = particle_vel(src, 1:3, 1:2) + particle_s(dest, 1:3, 1:2) = particle_s(src, 1:3, 1:2) + particle_pos(dest, 1:3, 1:2) = particle_pos(src, 1:3, 1:2) + particle_posPrev(dest, 1:3, 1:2) = particle_posPrev(src, 1:3, 1:2) + particle_draddt(dest, 1:lag_num_ts) = particle_draddt(src, 1:lag_num_ts) + f_p(dest, 1:3) = f_p(src, 1:3) + particle_dposdt(dest, 1:3, 1:lag_num_ts) = particle_dposdt(src, 1:3, 1:lag_num_ts) + particle_dveldt(dest, 1:3, 1:lag_num_ts) = particle_dveldt(src, 1:3, 1:lag_num_ts) + + end subroutine s_copy_lag_particle + + !> The purpose of this subroutine is to deallocate variables + impure subroutine s_finalize_particle_lagrangian_solver() + + integer :: i + + if (lag_params%write_void_evol) call s_close_void_evol + if (lag_params%write_bubbles) call 
s_close_lag_particle_evol() + if (lag_params%write_bubbles_stats) call s_close_lag_particle_stats() + + do i = 1, q_particles_idx + @:DEALLOCATE(q_particles(i)%sf) + @:DEALLOCATE(kahan_comp(i)%sf) + end do + @:DEALLOCATE(q_particles) + @:DEALLOCATE(kahan_comp) + + do i = 1, nField_vars + @:DEALLOCATE(field_vars(i)%sf) + end do + @:DEALLOCATE(field_vars) + + do i = 1, nWeights_interp + @:DEALLOCATE(weights_x_interp(i)%sf) + end do + @:DEALLOCATE(weights_x_interp) + + do i = 1, nWeights_interp + @:DEALLOCATE(weights_y_interp(i)%sf) + end do + @:DEALLOCATE(weights_y_interp) + + do i = 1, nWeights_interp + @:DEALLOCATE(weights_z_interp(i)%sf) + end do + @:DEALLOCATE(weights_z_interp) + + !Deallocating space + @:DEALLOCATE(lag_part_id) + @:DEALLOCATE(gid_to_local) + @:DEALLOCATE(particle_R0) + @:DEALLOCATE(Rmax_stats_part) + @:DEALLOCATE(Rmin_stats_part) + @:DEALLOCATE(particle_mass) + @:DEALLOCATE(p_AM) + @:DEALLOCATE(p_owner_rank) + @:DEALLOCATE(particle_rad) + @:DEALLOCATE(particle_pos) + @:DEALLOCATE(particle_posPrev) + @:DEALLOCATE(particle_vel) + @:DEALLOCATE(particle_s) + @:DEALLOCATE(particle_draddt) + @:DEALLOCATE(particle_dposdt) + @:DEALLOCATE(particle_dveldt) + @:DEALLOCATE(f_p) + @:DEALLOCATE(gSum) + + @:DEALLOCATE(force_recv_ids) + @:DEALLOCATE(force_recv_vals) + + @:DEALLOCATE(keep_bubble) + @:DEALLOCATE(wrap_bubble_loc, wrap_bubble_dir) + + @:DEALLOCATE(linked_list) + @:DEALLOCATE(particle_head) + + ! Deallocate cell list arrays + @:DEALLOCATE(cell_list_start) + @:DEALLOCATE(cell_list_count) + @:DEALLOCATE(cell_list_idx) + + end subroutine s_finalize_particle_lagrangian_solver + +end module m_particles_EL diff --git a/src/simulation/m_particles_EL_kernels.fpp b/src/simulation/m_particles_EL_kernels.fpp new file mode 100644 index 0000000000..198e2fcf25 --- /dev/null +++ b/src/simulation/m_particles_EL_kernels.fpp @@ -0,0 +1,979 @@ +!> +!! @file m_particles_EL_kernels.f90 +!! 
@brief Contains module m_particles_EL_kernels + +#:include 'macros.fpp' + +!> @brief This module contains kernel functions used to map the effect of the lagrangian particles +!! in the Eulerian framework. +module m_particles_EL_kernels + + use m_mpi_proxy !< Message passing interface (MPI) module proxy + + use ieee_arithmetic !< For checking NaN + + implicit none + + ! Cell list for particle-to-cell mapping (rebuilt each RK stage before smearing) + integer, allocatable, dimension(:, :, :) :: cell_list_start ! (0:m, 0:n, 0:p) + integer, allocatable, dimension(:, :, :) :: cell_list_count ! (0:m, 0:n, 0:p) + integer, allocatable, dimension(:) :: cell_list_idx ! (1:nParticles_glb) sorted particle indices + $:GPU_DECLARE(create='[cell_list_start, cell_list_count, cell_list_idx]') + +contains + + ! !> The purpose of this subroutine is to compute each particles total contribution to the gaussian for proper normalization + subroutine s_compute_gaussian_contribution(rad, pos, cell, func_s) + $:GPU_ROUTINE(function_name='s_compute_gaussian_contribution',parallelism='[seq]', & + & cray_inline=True) + + real(wp), intent(in) :: rad + real(wp), intent(in), dimension(3) :: pos + integer, intent(in), dimension(3) :: cell + real(wp), intent(out) :: func_s + + real(wp) :: volpart, stddsv, Vol_loc, func + real(wp), dimension(3) :: nodecoord, center + + integer :: ip, jp, kp, di, dj, dk, di_beg, di_end, dj_beg, dj_end, dk_beg, dk_end, mapCells_loc + integer, dimension(3) :: cellijk + + mapCells_loc = 1 + + volpart = (4._wp/3._wp)*pi*rad**3._wp + + call s_compute_stddsv(cell, volpart, stddsv) + + ip = cell(1) + jp = cell(2) + kp = cell(3) + + di_beg = ip - mapCells_loc + di_end = ip + mapCells_loc + dj_beg = jp - mapCells_loc + dj_end = jp + mapCells_loc + dk_beg = kp + dk_end = kp + + if (num_dims == 3) then + dk_beg = kp - mapCells_loc + dk_end = kp + mapCells_loc + end if + + func_s = 0._wp + do dk = dk_beg, dk_end + do dj = dj_beg, dj_end + do di = di_beg, di_end + + nodecoord(1) = 
x_cc(di) + nodecoord(2) = y_cc(dj) + nodecoord(3) = 0._wp + if (p > 0) nodecoord(3) = z_cc(dk) + + cellijk(1) = di + cellijk(2) = dj + cellijk(3) = dk + + center(1:2) = pos(1:2) + center(3) = 0._wp + if (p > 0) center(3) = pos(3) + + Vol_loc = dx(cellijk(1))*dy(cellijk(2)) + if (num_dims == 3) Vol_loc = dx(cellijk(1))*dy(cellijk(2))*dz(cellijk(3)) + + call s_applygaussian(center, cellijk, nodecoord, stddsv, 0._wp, func) + + func_s = func_s + func*Vol_loc + end do + end do + end do + + end subroutine s_compute_gaussian_contribution + + !> The purpose of this subroutine is to compute the gaussian smearing of particle volume fraction and source terms with atomic cell updates + subroutine s_gaussian_atomic(rad, vel, pos, force_p, gauSum, cell, updatedvar, onlyBeta) + $:GPU_ROUTINE(function_name='s_gaussian_atomic',parallelism='[seq]', & + & cray_inline=True) + + real(wp), intent(in) :: rad, gauSum + real(wp), intent(in), dimension(3) :: pos, vel, force_p + integer, intent(in), dimension(3) :: cell + type(scalar_field), dimension(:), intent(inout) :: updatedvar + + real(wp) :: volpart, stddsv, Vol_loc, func, weight + real(wp) :: fp_x, fp_y, fp_z, vp_x, vp_y, vp_z + real(wp) :: addFun_alphap, addFun_alphap_vp_x, addFun_alphap_vp_y, addFun_alphap_vp_z + real(wp) :: addFun2_x, addFun2_y, addFun2_z, addFun_E + real(wp), dimension(3) :: nodecoord, center + + integer :: ip, jp, kp, di, dj, dk, di_beg, di_end, dj_beg, dj_end, dk_beg, dk_end, mapCells_loc + integer, dimension(3) :: cellijk + logical, intent(in) :: onlyBeta + + mapCells_loc = 1 + + volpart = (4._wp/3._wp)*pi*rad**3 + + call s_compute_stddsv(cell, volpart, stddsv) + + ip = cell(1) + jp = cell(2) + kp = cell(3) + + di_beg = ip - mapCells_loc + di_end = ip + mapCells_loc + dj_beg = jp - mapCells_loc + dj_end = jp + mapCells_loc + dk_beg = kp + dk_end = kp + + if (num_dims == 3) then + dk_beg = kp - mapCells_loc + dk_end = kp + mapCells_loc + end if + + fp_x = -force_p(1) + fp_y = -force_p(2) + fp_z = -force_p(3) + 
+ vp_x = vel(1) + vp_y = vel(2) + vp_z = vel(3) + + center(1:2) = pos(1:2) + center(3) = 0._wp + if (p > 0) center(3) = pos(3) + + do dk = dk_beg, dk_end + do dj = dj_beg, dj_end + do di = di_beg, di_end + + nodecoord(1) = x_cc(di) + nodecoord(2) = y_cc(dj) + nodecoord(3) = 0._wp + if (p > 0) nodecoord(3) = z_cc(dk) + + cellijk(1) = di + cellijk(2) = dj + cellijk(3) = dk + + Vol_loc = dx(cellijk(1))*dy(cellijk(2)) + if (num_dims == 3) Vol_loc = Vol_loc*dz(cellijk(3)) + + call s_applygaussian(center, cellijk, nodecoord, stddsv, 0._wp, func) + + weight = func/gauSum + + addFun_alphap = weight*volpart + $:GPU_ATOMIC(atomic='update') + updatedvar(1)%sf(cellijk(1), cellijk(2), cellijk(3)) = & + updatedvar(1)%sf(cellijk(1), cellijk(2), cellijk(3)) & + + real(addFun_alphap, kind=stp) + + if (lag_params%solver_approach == 2 .and. .not. onlyBeta) then + + !Update particle momentum field(x) + addFun_alphap_vp_x = weight*volpart*vp_x + $:GPU_ATOMIC(atomic='update') + updatedvar(2)%sf(cellijk(1), cellijk(2), cellijk(3)) = & + updatedvar(2)%sf(cellijk(1), cellijk(2), cellijk(3)) & + + real(addFun_alphap_vp_x, kind=stp) + + !Update particle momentum field(y) + addFun_alphap_vp_y = weight*volpart*vp_y + $:GPU_ATOMIC(atomic='update') + updatedvar(3)%sf(cellijk(1), cellijk(2), cellijk(3)) = & + updatedvar(3)%sf(cellijk(1), cellijk(2), cellijk(3)) & + + real(addFun_alphap_vp_y, kind=stp) + + if (num_dims == 3) then + !Update particle momentum field(z) + addFun_alphap_vp_z = weight*volpart*vp_z + $:GPU_ATOMIC(atomic='update') + updatedvar(4)%sf(cellijk(1), cellijk(2), cellijk(3)) = & + updatedvar(4)%sf(cellijk(1), cellijk(2), cellijk(3)) & + + real(addFun_alphap_vp_z, kind=stp) + end if + + !Update x-momentum source term + addFun2_x = weight*fp_x + $:GPU_ATOMIC(atomic='update') + updatedvar(5)%sf(cellijk(1), cellijk(2), cellijk(3)) = & + updatedvar(5)%sf(cellijk(1), cellijk(2), cellijk(3)) & + + real(addFun2_x, kind=stp) + + !Update y-momentum source term + addFun2_y = weight*fp_y + 
$:GPU_ATOMIC(atomic='update') + updatedvar(6)%sf(cellijk(1), cellijk(2), cellijk(3)) = & + updatedvar(6)%sf(cellijk(1), cellijk(2), cellijk(3)) & + + real(addFun2_y, kind=stp) + + if (num_dims == 3) then + !Update z-momentum source term + addFun2_z = weight*fp_z + $:GPU_ATOMIC(atomic='update') + updatedvar(7)%sf(cellijk(1), cellijk(2), cellijk(3)) = & + updatedvar(7)%sf(cellijk(1), cellijk(2), cellijk(3)) & + + real(addFun2_z, kind=stp) + end if + + !Update energy source term + addFun_E = 0._wp + $:GPU_ATOMIC(atomic='update') + updatedvar(8)%sf(cellijk(1), cellijk(2), cellijk(3)) = & + updatedvar(8)%sf(cellijk(1), cellijk(2), cellijk(3)) & + + real(addFun_E, kind=stp) + + end if + + end do + end do + end do + + end subroutine s_gaussian_atomic + + !> The purpose of this subroutine is to apply the gaussian kernel function for each particle (Maeda and Colonius, 2018)). + subroutine s_applygaussian(center, cellaux, nodecoord, stddsv, strength_idx, func) + $:GPU_ROUTINE(function_name='s_applygaussian',parallelism='[seq]', & + & cray_inline=True) + + real(wp), dimension(3), intent(in) :: center + integer, dimension(3), intent(in) :: cellaux + real(wp), dimension(3), intent(in) :: nodecoord + real(wp), intent(in) :: stddsv + real(wp), intent(in) :: strength_idx + real(wp), intent(out) :: func + integer :: i + + real(wp) :: distance + real(wp) :: theta, dtheta, L2, dzp, Lz2, zc + real(wp) :: Nr, Nr_count + + distance = sqrt((center(1) - nodecoord(1))**2._wp + (center(2) - nodecoord(2))**2._wp + (center(3) - nodecoord(3))**2._wp) + + if (num_dims == 3) then + !< 3D gaussian function + func = exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**3._wp + else + if (cyl_coord) then + !< 2D cylindrical function: + ! 
We smear particles in the azimuthal direction for given r + theta = 0._wp + Nr = ceiling(2._wp*pi*nodecoord(2)/(y_cb(cellaux(2)) - y_cb(cellaux(2) - 1))) + dtheta = 2._wp*pi/Nr + L2 = center(2)**2._wp + nodecoord(2)**2._wp - 2._wp*center(2)*nodecoord(2)*cos(theta) + distance = sqrt((center(1) - nodecoord(1))**2._wp + L2) + ! Factor 2._wp is for symmetry (upper half of the 2D field (+r) is considered) + func = dtheta/2._wp/pi*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**3._wp + Nr_count = 0._wp + do while (Nr_count < Nr - 1._wp) + Nr_count = Nr_count + 1._wp + theta = Nr_count*dtheta + ! trigonometric relation + L2 = center(2)**2._wp + nodecoord(2)**2._wp - 2._wp*center(2)*nodecoord(2)*cos(theta) + distance = sqrt((center(1) - nodecoord(1))**2._wp + L2) + ! nodecoord(2)*dtheta is the azimuthal width of the cell + func = func + & + dtheta/2._wp/pi*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**(3._wp*(strength_idx + 1._wp)) + end do + else + !< 2D cartesian function: Equation (48) from Madea and Colonius 2018 + ! We smear particles considering a virtual depth (lag_params%charwidth) with lag_params%charNz cells + dzp = (lag_params%charwidth/(lag_params%charNz + 1._wp)) + + func = 0._wp + do i = 0, lag_params%charNz + zc = (-lag_params%charwidth/2._wp + dzp*(0.5_wp + i)) ! Center of virtual cell i in z-direction + Lz2 = (center(3) - zc)**2._wp + distance = sqrt((center(1) - nodecoord(1))**2._wp + (center(2) - nodecoord(2))**2._wp + Lz2) + func = func + dzp/lag_params%charwidth*exp(-0.5_wp*(distance/stddsv)**2._wp)/(sqrt(2._wp*pi)*stddsv)**3._wp + end do + end if + end if + + end subroutine s_applygaussian + + !> Calculates the standard deviation of the particle being smeared in the Eulerian framework. + !! @param cell Cell where the particle is located + !! @param volpart Volume of the particle + !! 
@param stddsv Standard deviation
+ end if + end if + + end subroutine s_check_celloutside + + !> This subroutine transforms the computational coordinates of the particle from + !! real type into integer. + !! @param s Computational coordinates of the particle, real type + !! @param get_cell Computational coordinates of the particle, integer type + subroutine s_get_cell(s_cell, get_cell) + $:GPU_ROUTINE(function_name='s_get_cell',parallelism='[seq]', & + & cray_inline=True) + + real(wp), dimension(3), intent(in) :: s_cell + integer, dimension(3), intent(out) :: get_cell + integer :: i + + get_cell(:) = int(s_cell(:)) + do i = 1, num_dims + if (s_cell(i) < 0._wp) get_cell(i) = get_cell(i) - 1 + end do + + end subroutine s_get_cell + + !> The purpose of this procedure is to calculate the characteristic cell volume + !! @param cell Computational coordinates (x, y, z) + !! @param Charvol Characteristic volume + subroutine s_get_char_vol(cellx, celly, cellz, Charvol) + $:GPU_ROUTINE(function_name='s_get_char_vol',parallelism='[seq]', & + & cray_inline=True) + + integer, intent(in) :: cellx, celly, cellz + real(wp), intent(out) :: Charvol + + if (p > 0) then + Charvol = dx(cellx)*dy(celly)*dz(cellz) + else + if (cyl_coord) then + Charvol = dx(cellx)*dy(celly)*y_cc(celly)*2._wp*pi + else + Charvol = dx(cellx)*dy(celly)*lag_params%charwidth + end if + end if + + end subroutine s_get_char_vol + + !! This function calculates the force on a particle + !! based on the pressure gradient, velocity, and drag model. + !! @param pos Position of the particle + !! @param rad Radius of the particle + !! @param vel_p Velocity of the particle + !! @param mass_p Particle mass + !! @param Re Viscosity! + !! @param rho Density of the fluid + !! @param vol_frac Particle Volume Fraction + !! @param cell Computational coordinates of the particle + !! @param q_prim_vf Eulerian field with primitive variables + !! 
@return force Net force on the particle; rmass_add is the added-mass contribution
    subroutine s_get_particle_force(pos, rad, vel_p, mass_p, Re, gamm, vol_frac, drhodt, cell, &
                                    q_prim_vf, fieldvars, wx, wy, wz, force, rmass_add)
        $:GPU_ROUTINE(parallelism='[seq]')
        real(wp), intent(in) :: rad, mass_p, Re, gamm, vol_frac, drhodt
        real(wp), dimension(3), intent(in) :: pos
        integer, dimension(3), intent(in) :: cell
        real(wp), dimension(3), intent(in) :: vel_p
        type(scalar_field), dimension(:), intent(in) :: fieldvars
        type(scalar_field), dimension(:), intent(in) :: wx, wy, wz
        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf

        real(wp), dimension(3), intent(out) :: force
        real(wp), intent(out) :: rmass_add

        ! NOTE(review): a, xi, eta, L, flux_f, flux_b and p1 are declared but
        ! never used in this routine -- candidates for removal.
        real(wp) :: a, vol, rho_fluid, pressure_fluid
        real(wp), dimension(3) :: v_rel, dp
        real(wp), dimension(fd_order) :: xi, eta, L
        real(wp) :: particle_diam, gas_mu, vmag, cson
        real(wp) :: slip_velocity_x, slip_velocity_y, slip_velocity_z, beta
        real(wp), dimension(3) :: fluid_vel
        integer :: dir

        !Added pass params
        real(wp) :: mach, Cam, flux_f, flux_b, div_u, SDrho, vpgradrho
        real(wp), dimension(3) :: rhoDuDt, grad_rho, fam
        integer, dimension(3) :: p1

        ! Zero accumulators; 3rd components stay zero in 2D so the full
        ! 3-component expressions below remain valid.
        force = 0._wp
        dp = 0._wp
        grad_rho = 0._wp
        fam = 0._wp
        fluid_vel = 0._wp
        v_rel = 0._wp
        rhoDuDt = 0._wp
        SDrho = 0._wp
        ! div_u = 0._wp

        !!Interpolation - either even ordered barycentric or 0th order
        ! fieldvars layout (per the indices used): 1..3 pressure gradient,
        ! 4..6 density gradient, 7..9 velocity-divergence terms (commented out).
        if (lag_params%interpolation_order > 1) then
            rho_fluid = f_interp_barycentric(pos, cell, q_prim_vf, 1, wx, wy, wz)
            pressure_fluid = f_interp_barycentric(pos, cell, q_prim_vf, E_idx, wx, wy, wz)
            do dir = 1, num_dims
                if (lag_params%pressure_force .or. lag_params%added_mass_model > 0) then
                    dp(dir) = f_interp_barycentric(pos, cell, fieldvars, dir, wx, wy, wz)
                end if
                if (lag_params%added_mass_model > 0) then
                    grad_rho(dir) = f_interp_barycentric(pos, cell, fieldvars, 3 + dir, wx, wy, wz)
                    ! div_u = div_u + f_interp_barycentric(pos, cell, fieldvars, 6 + dir, wx, wy, wz)
                end if
                fluid_vel(dir) = f_interp_barycentric(pos, cell, q_prim_vf, momxb + dir - 1, wx, wy, wz)
            end do
        else
            ! 0th order: sample the host cell directly
            rho_fluid = q_prim_vf(1)%sf(cell(1), cell(2), cell(3))
            pressure_fluid = q_prim_vf(E_idx)%sf(cell(1), cell(2), cell(3))
            do dir = 1, num_dims
                if (lag_params%pressure_force .or. lag_params%added_mass_model > 0) then
                    dp(dir) = fieldvars(dir)%sf(cell(1), cell(2), cell(3))
                end if
                if (lag_params%added_mass_model > 0) then
                    grad_rho(dir) = fieldvars(3 + dir)%sf(cell(1), cell(2), cell(3))
                    ! div_u = div_u + fieldvars(6 + dir)%sf(cell(1), cell(2), cell(3))
                end if
                fluid_vel(dir) = q_prim_vf(momxb + dir - 1)%sf(cell(1), cell(2), cell(3))
            end do
        end if

        ! Relative (particle - fluid) velocity
        v_rel = vel_p - fluid_vel

        if (lag_params%qs_drag_model > 0 .or. lag_params%added_mass_model > 0) then
            ! Quasi-steady Drag Force Parameters
            slip_velocity_x = fluid_vel(1) - vel_p(1)
            slip_velocity_y = fluid_vel(2) - vel_p(2)
            if (num_dims == 3) then
                slip_velocity_z = fluid_vel(3) - vel_p(3)
                vmag = sqrt(slip_velocity_x*slip_velocity_x + slip_velocity_y*slip_velocity_y + &
                            slip_velocity_z*slip_velocity_z)
            elseif (num_dims == 2) then
                vmag = sqrt(slip_velocity_x*slip_velocity_x + slip_velocity_y*slip_velocity_y)
            end if
            particle_diam = rad*2._wp
            cson = sqrt((gamm*pressure_fluid)/rho_fluid) !gamma*P/rho
            ! The dummy named Re actually carries the dynamic viscosity
            gas_mu = Re
        end if

        if (lag_params%added_mass_model > 0) then
            ! Pressure-gradient form of rho*Du/Dt, and the material derivative
            ! of density assembled from drhodt and the density gradient
            rhoDuDt = -dp
            vpgradrho = dot_product(vel_p, grad_rho)
            SDrho = drhodt + fluid_vel(1)*grad_rho(1) + fluid_vel(2)*grad_rho(2) + fluid_vel(3)*grad_rho(3)
            mach = vmag/cson
        end if

        ! Step 1: Force component quasi-steady
        ! All models return beta such that F_qs = -beta * v_rel
        if (lag_params%qs_drag_model == 1) then
            beta = QS_Parmar(rho_fluid, cson, gas_mu, gamm, vmag, particle_diam, vol_frac)
            force = force - beta*v_rel
        else if (lag_params%qs_drag_model == 2) then
            beta = QS_Osnes(rho_fluid, cson, gas_mu, gamm, vmag, particle_diam, vol_frac)
            force = force - beta*v_rel
        else if (lag_params%qs_drag_model == 3) then
            beta = QS_ModifiedParmar(rho_fluid, cson, gas_mu, gamm, vmag, particle_diam, vol_frac)
            force = force - beta*v_rel
        else if (lag_params%qs_drag_model == 4) then
            beta = QS_Gidaspow(rho_fluid, cson, gas_mu, gamm, vmag, particle_diam, vol_frac)
            force = force - beta*v_rel
        else
            !No Quasi-Steady drag
        end if

        ! Step 1.1: Stokes drag
        if (lag_params%stokes_drag == 1) then ! Free slip Stokes drag
            force = force - 4._wp*pi*gas_mu*rad*v_rel
        elseif (lag_params%stokes_drag == 2) then ! No slip Stokes drag
            force = force - 6._wp*pi*gas_mu*rad*v_rel
        else
            !No stokes drag
        end if

        ! Step 2: Pressure Gradient Force (buoyancy-like, -V * grad(p))
        if (lag_params%pressure_force) then
            vol = (4._wp/3._wp)*pi*(rad**3._wp)
            force = force - vol*dp
        end if

        ! Step 3: Gravitational Force
        if (lag_params%gravity_force) then
            force = force + (mass_p)*accel_bf
        end if

        ! Step 4: Added Mass Force
        ! Compressibility correction for the added-mass coefficient, capped
        ! at Mach 0.6, plus a volume-fraction correction.
        if (lag_params%added_mass_model == 1) then
            vol = (4._wp/3._wp)*pi*(rad**3._wp)
            if (mach > 0.6_wp) then
                Cam = 1._wp + 1.8_wp*(0.6_wp**2) + 7.6_wp* &
                      (0.6_wp**4)
            else
                Cam = 1._wp + 1.8_wp*mach**2 + 7.6_wp*mach**4
            end if

            Cam = 0.5_wp*Cam*(1._wp + 0.68_wp*vol_frac**2)
            rmass_add = rho_fluid*vol*Cam !(1._wp-vol_frac)*rho_fluid*vol*Cam

            fam = Cam*vol*(vel_p*SDrho + rhoDuDt + &
                           fluid_vel*(vpgradrho))

            ! Guard against NaN/Inf from degenerate interpolation inputs
            do dir = 1, num_dims
                if (.not. ieee_is_finite(fam(dir))) then
                    fam(dir) = 0._wp
                    rmass_add = 0._wp
                end if
            end do
            force = force + fam
        else
            rmass_add = 0._wp
        end if

        ! Final sanitization: never return a non-finite force component
        do dir = 1, num_dims
            if (.not. ieee_is_finite(force(dir))) then
                force(dir) = 0._wp
            end if
        end do

    end subroutine s_get_particle_force

    !> Barycentric (Lagrange) interpolation of an Eulerian field at the
    !! particle position, over a (N+1)-point stencil centred on the host cell.
    !! @param pos Particle position
    !! @param cell Host cell indices of the particle
    !! @param field_vf Field array to interpolate from
    !! @param field_index Component of field_vf to interpolate
    !! @param wx, wy, wz Precomputed barycentric weight fields per direction
    function f_interp_barycentric(pos, cell, field_vf, field_index, wx, wy, wz) result(val)
        $:GPU_ROUTINE(parallelism='[seq]')

        real(wp), dimension(3), intent(in) :: pos
        integer, dimension(3), intent(in) :: cell
        type(scalar_field), dimension(:), intent(in) :: field_vf
        type(scalar_field), dimension(:), intent(in) :: wx, wy, wz
        integer, intent(in) :: field_index

        ! NOTE(review): a, b, local_min, local_max, prod_x, prod_y, prod_z are
        ! declared but never used -- candidates for removal.
        integer :: i, j, k, ix, jy, kz, npts, npts_z, N, a, b
        integer :: ix_count, jy_count, kz_count
        real(wp) :: weight, numerator, denominator, xBar, eps
        real(wp) :: val, local_min, local_max, prod_x, prod_y, prod_z

        i = cell(1)
        j = cell(2)
        k = cell(3)

        N = lag_params%interpolation_order
        npts = N/2
        npts_z = npts
        if (num_dims == 2) npts_z = 0
        eps = 1.e-12_wp
        numerator = 0._wp
        denominator = 0._wp

        ! if (abs(pos(1) - x_cc(i)) <= eps .and. &
        !     abs(pos(2) - y_cc(j)) <= eps .and. &
        !     abs(pos(3) - z_cc(k)) <= eps) then
        !     val = field_vf(field_index)%sf(i, j, k)
        !     return
        ! end if

        ! Accumulate the barycentric sum over the stencil. If pos coincides
        ! with a node, xBar -> 0 and weight becomes non-finite; the
        ! ieee_is_finite fallback below then reverts to the host-cell value.
        ix_count = 0
        do ix = i - npts, i + npts
            ix_count = ix_count + 1
            jy_count = 0
            do jy = j - npts, j + npts
                jy_count = jy_count + 1
                kz_count = 0
                do kz = k - npts_z, k + npts_z
                    kz_count = kz_count + 1
                    if (num_dims == 3) then
                        xBar = (pos(1) - x_cc(ix))*(pos(2) - y_cc(jy))*(pos(3) - z_cc(kz))
                        weight = wx(ix_count)%sf(i, 1, 1)*wy(jy_count)%sf(j, 1, 1)*wz(kz_count)%sf(k, 1, 1)
                    else
                        xBar = (pos(1) - x_cc(ix))*(pos(2) - y_cc(jy))
                        weight = wx(ix_count)%sf(i, 1, 1)*wy(jy_count)%sf(j, 1, 1)
                    end if
                    weight = weight/xBar
                    numerator = numerator + weight*field_vf(field_index)%sf(ix, jy, kz)
                    denominator = denominator + weight
                end do
            end do
        end do

        val = numerator/denominator

        ! Fall back to the 0th-order (host cell) value on NaN/Inf; flush
        ! tiny values to exactly zero.
        if (.not. ieee_is_finite(val)) then
            val = field_vf(field_index)%sf(i, j, k)

        elseif (abs(val) <= eps) then
            val = 0._wp

        end if

    end function

    ! 
Quasi-steady force (Re_p and Ma_p corrections):
    ! Improved Drag Correlation for Spheres and Application
    ! to Shock-Tube Experiments
    ! - Parmar et al. (2010)
    ! - AIAA Journal
    !
    ! Quasi-steady force (phi corrections):
    ! The Added Mass, Basset, and Viscous Drag Coefficients
    ! in Nondilute Bubbly Liquids Undergoing Small-Amplitude
    ! Oscillatory Motion
    ! - Sangani et al. (1991)
    ! - Phys. Fluids A
    !
    ! Returns beta such that F_qs = -beta * v_rel.
    ! All real literals carry the _wp kind suffix so the correlation is
    ! evaluated at working precision (previously many were default real).
    function QS_Parmar(rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction) result(beta)
        $:GPU_ROUTINE(parallelism='[seq]')
        ! NOTE(review): gamma is unused here; the dummy is retained so all
        ! QS_* models share one interface.
        real(wp), intent(in) :: rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction
        real(wp) :: rcd1, rmacr, rcd_mcr, rcd_std, rmach_rat, rcd_M1
        real(wp) :: rcd_M2, C1, C2, C3, f1M, f2M, f3M, lrep, factor, phi_corr
        real(wp) :: beta
        real(wp) :: rmachp, mp, phi, rep, re

        ! Clamp Mach number, volume fraction, and Reynolds number away from
        ! zero to keep the correlations well-defined.
        rmachp = vmag/cson
        mp = max(rmachp, 0.01_wp)
        phi = max(volume_fraction, 0.0001_wp)
        rep = vmag*dp*rho/mu_fluid
        re = max(rep, 0.1_wp)

        if (re < 1.e-14_wp) then
            ! Unreachable given the re >= 0.1 floor above; kept defensively.
            rcd1 = 1.0_wp
        else
            rmacr = 0.6_wp ! Critical Mach number
            ! Drag ratio at the critical Mach number
            rcd_mcr = (1._wp + 0.15_wp*re**0.684_wp) + &
                      (re/24.0_wp)*(0.513_wp/(1._wp + 483._wp/re**0.669_wp))
            if (mp <= rmacr) then
                ! Subcritical: blend the standard (Clift-Gauvin) and critical
                ! drag ratios linearly in Mach
                rcd_std = (1._wp + 0.15_wp*re**0.687_wp) + &
                          (re/24.0_wp)*(0.42_wp/(1._wp + 42500._wp/re**1.16_wp))
                rmach_rat = mp/rmacr
                rcd1 = rcd_std + (rcd_mcr - rcd_std)*rmach_rat
            else if (mp <= 1.0_wp) then
                ! Supercritical subsonic: 3-node Lagrange interpolation in log(Re)
                rcd_M1 = (1.0_wp + 0.118_wp*re**0.813_wp) + &
                         (re/24.0_wp)*0.69_wp/(1.0_wp + 3550.0_wp/re**0.793_wp)
                C1 = 6.48_wp
                C2 = 9.28_wp
                C3 = 12.21_wp
                f1M = -1.884_wp + 8.422_wp*mp - 13.70_wp*mp**2 + 8.162_wp*mp**3
                f2M = -2.228_wp + 10.35_wp*mp - 16.96_wp*mp**2 + 9.840_wp*mp**3
                f3M = 4.362_wp - 16.91_wp*mp + 19.84_wp*mp**2 - 6.296_wp*mp**3
                lrep = log(re)
                factor = f1M*(lrep - C2)*(lrep - C3)/((C1 - C2)*(C1 - C3)) &
                         + f2M*(lrep - C1)*(lrep - C3)/((C2 - C1)*(C2 - C3)) &
                         + f3M*(lrep - C1)*(lrep - C2)/((C3 - C1)*(C3 - C2))
                rcd1 = rcd_mcr + (rcd_M1 - rcd_mcr)*factor
            else if (mp < 1.75_wp) then
                ! Supersonic transition regime
                rcd_M1 = (1.0_wp + 0.118_wp*re**0.813_wp) + &
                         (re/24.0_wp)*0.69_wp/(1.0_wp + 3550.0_wp/re**0.793_wp)
                rcd_M2 = (1.0_wp + 0.107_wp*re**0.867_wp) + &
                         (re/24.0_wp)*0.646_wp/(1.0_wp + 861.0_wp/re**0.634_wp)
                C1 = 6.48_wp
                C2 = 8.93_wp
                C3 = 12.21_wp
                f1M = -2.963_wp + 4.392_wp*mp - 1.169_wp*mp**2 - 0.027_wp*mp**3 &
                      - 0.233_wp*exp((1.0_wp - mp)/0.011_wp)
                f2M = -6.617_wp + 12.11_wp*mp - 6.501_wp*mp**2 + 1.182_wp*mp**3 &
                      - 0.174_wp*exp((1.0_wp - mp)/0.010_wp)
                f3M = -5.866_wp + 11.57_wp*mp - 6.665_wp*mp**2 + 1.312_wp*mp**3 &
                      - 0.350_wp*exp((1.0_wp - mp)/0.012_wp)
                lrep = log(re)
                factor = f1M*(lrep - C2)*(lrep - C3)/((C1 - C2)*(C1 - C3)) &
                         + f2M*(lrep - C1)*(lrep - C3)/((C2 - C1)*(C2 - C3)) &
                         + f3M*(lrep - C1)*(lrep - C2)/((C3 - C1)*(C3 - C2))
                rcd1 = rcd_M1 + (rcd_M2 - rcd_M1)*factor
            else
                ! Fully supersonic limit
                rcd1 = (1.0_wp + 0.107_wp*re**0.867_wp) + &
                       (re/24.0_wp)*0.646_wp/(1.0_wp + 861.0_wp/re**0.634_wp)
            end if ! mp
        end if ! re

        ! Sangani's volume fraction correction for dilute random arrays
        ! Capping volume fraction at 0.5
        phi_corr = (1.0_wp + 5.94_wp*min(phi, 0.5_wp))

        ! (The unused diagnostic cd = (24/re)*rcd1*phi_corr was removed.)
        ! Stokes drag scaling times the Re/Ma drag ratio ...
        beta = rcd1*3.0_wp*pi*mu_fluid*dp

        ! ... times the volume-fraction correction
        beta = beta*phi_corr

    end function QS_Parmar

    ! 
Quasi-steady force (Re_p and Ma_p corrections):
    ! Improved Drag Correlation for Spheres and Application
    ! to Shock-Tube Experiments
    ! - Parmar et al. (2010)
    ! - AIAA Journal
    !
    ! Quasi-steady force (phi corrections):
    ! Sangani et al. (1991) volume fraction correction overshoots
    ! the drag coefficient.
    !
    ! We adopt instead Osnes et al. (2023) volume fraction correction
    ! based on Tenneti et al. with one extra term.
    !
    ! At Mach=0, the drag coefficient from this subroutine matches very
    ! well with the one calculated using the Osnes subroutine, for various
    ! Reynolds numbers and volume fractions.
    !
    ! Returns beta such that F_qs = -beta * v_rel. All real literals carry
    ! the _wp kind suffix (previously many were default real, losing
    ! precision when wp is double precision).
    function QS_ModifiedParmar(rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction) result(beta)
        $:GPU_ROUTINE(parallelism='[seq]')
        ! NOTE(review): gamma is unused here; the dummy is retained so all
        ! QS_* models share one interface.
        real(wp), intent(in) :: rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction
        real(wp) :: rcd1, rmacr, rcd_mcr, rcd_std, rmach_rat, rcd_M1
        real(wp) :: rcd_M2, C1, C2, C3, f1M, f2M, f3M, lrep, factor, cd
        real(wp) :: b1, b2, b3
        real(wp) :: beta
        real(wp) :: rmachp, mp, phi, rep, re

        ! Clamp Mach number, volume fraction, and Reynolds number away from
        ! zero to keep the correlations well-defined.
        rmachp = vmag/cson
        mp = max(rmachp, 0.01_wp)
        phi = max(volume_fraction, 0.0001_wp)
        rep = vmag*dp*rho/mu_fluid
        re = max(rep, 0.1_wp)

        if (re < 1.e-14_wp) then
            ! Unreachable given the re >= 0.1 floor above; kept defensively.
            rcd1 = 1.0_wp
        else
            rmacr = 0.6_wp ! Critical Mach number
            rcd_mcr = (1._wp + 0.15_wp*re**0.684_wp) + &
                      (re/24.0_wp)*(0.513_wp/(1._wp + 483._wp/re**0.669_wp))
            if (mp <= rmacr) then
                ! Subcritical: blend standard (Clift-Gauvin) and critical drag
                rcd_std = (1._wp + 0.15_wp*re**0.687_wp) + &
                          (re/24.0_wp)*(0.42_wp/(1._wp + 42500._wp/re**1.16_wp))
                rmach_rat = mp/rmacr
                rcd1 = rcd_std + (rcd_mcr - rcd_std)*rmach_rat
            else if (mp <= 1.0_wp) then
                ! Supercritical subsonic: 3-node Lagrange interpolation in log(Re)
                rcd_M1 = (1.0_wp + 0.118_wp*re**0.813_wp) + &
                         (re/24.0_wp)*0.69_wp/(1.0_wp + 3550.0_wp/re**0.793_wp)
                C1 = 6.48_wp
                C2 = 9.28_wp
                C3 = 12.21_wp
                f1M = -1.884_wp + 8.422_wp*mp - 13.70_wp*mp**2 + 8.162_wp*mp**3
                f2M = -2.228_wp + 10.35_wp*mp - 16.96_wp*mp**2 + 9.840_wp*mp**3
                f3M = 4.362_wp - 16.91_wp*mp + 19.84_wp*mp**2 - 6.296_wp*mp**3
                lrep = log(re)
                factor = f1M*(lrep - C2)*(lrep - C3)/((C1 - C2)*(C1 - C3)) &
                         + f2M*(lrep - C1)*(lrep - C3)/((C2 - C1)*(C2 - C3)) &
                         + f3M*(lrep - C1)*(lrep - C2)/((C3 - C1)*(C3 - C2))
                rcd1 = rcd_mcr + (rcd_M1 - rcd_mcr)*factor
            else if (mp < 1.75_wp) then
                ! Supersonic transition regime
                rcd_M1 = (1.0_wp + 0.118_wp*re**0.813_wp) + &
                         (re/24.0_wp)*0.69_wp/(1.0_wp + 3550.0_wp/re**0.793_wp)
                rcd_M2 = (1.0_wp + 0.107_wp*re**0.867_wp) + &
                         (re/24.0_wp)*0.646_wp/(1.0_wp + 861.0_wp/re**0.634_wp)
                C1 = 6.48_wp
                C2 = 8.93_wp
                C3 = 12.21_wp
                f1M = -2.963_wp + 4.392_wp*mp - 1.169_wp*mp**2 - 0.027_wp*mp**3 &
                      - 0.233_wp*exp((1.0_wp - mp)/0.011_wp)
                f2M = -6.617_wp + 12.11_wp*mp - 6.501_wp*mp**2 + 1.182_wp*mp**3 &
                      - 0.174_wp*exp((1.0_wp - mp)/0.010_wp)
                f3M = -5.866_wp + 11.57_wp*mp - 6.665_wp*mp**2 + 1.312_wp*mp**3 &
                      - 0.350_wp*exp((1.0_wp - mp)/0.012_wp)
                lrep = log(re)
                factor = f1M*(lrep - C2)*(lrep - C3)/((C1 - C2)*(C1 - C3)) &
                         + f2M*(lrep - C1)*(lrep - C3)/((C2 - C1)*(C2 - C3)) &
                         + f3M*(lrep - C1)*(lrep - C2)/((C3 - C1)*(C3 - C2))
                rcd1 = rcd_M1 + (rcd_M2 - rcd_M1)*factor
            else
                ! Fully supersonic limit
                rcd1 = (1.0_wp + 0.107_wp*re**0.867_wp) + &
                       (re/24.0_wp)*0.646_wp/(1.0_wp + 861.0_wp/re**0.634_wp)
            end if ! mp
        end if ! re

        ! Osnes's volume fraction correction
        b1 = 5.81_wp*phi/((1.0_wp - phi)**2) + &
             0.48_wp*(phi**(1._wp/3._wp))/((1.0_wp - phi)**3)

        ! NOTE(review): '(1.0 - phi)*2' appears both here and in QS_Osnes;
        ! the published correlation suggests '(1.0 - phi)**2' may be intended.
        ! Kept as-is pending verification against Osnes et al. (2023).
        b2 = ((1.0_wp - phi)**2)*(phi**3)* &
             re*(0.95_wp + 0.61_wp*(phi**3)/((1.0_wp - phi)*2._wp))

        b3 = min(sqrt(20.0_wp*mp), 1.0_wp)* &
             (5.65_wp*phi - 22.0_wp*(phi**2) + 23.4_wp*(phi**3))* &
             (1._wp + tanh((mp - (0.65_wp - 0.24_wp*phi))/0.35_wp))

        cd = (24.0_wp/re)*rcd1

        cd = cd/(1.0_wp - phi) + b3 + (24.0_wp/re)*(1.0_wp - phi)*(b1 + b2)

        beta = 3.0_wp*pi*mu_fluid*dp*(re/24.0_wp)*cd

    end function QS_ModifiedParmar

    ! QS Force calculated as a function of Re, Ma and phi
    !
    ! Use Osnes etal (2023) correlations
    ! A.N. Osnes, M. Vartdal, M. Khalloufi,
    ! J. Capecelatro, and S. Balachandar.
    ! Comprehensive quasi-steady force correlations for compressible flow
    ! through random particle suspensions.
    ! International Journal of Multiphase Flow, Vol. 165, 104485, (2023).
    ! doi: https://doi.org/10.1016/j.imultiphaseflow.2023.104485.
    !
    ! E. Loth, J.T. Daspit, M. Jeong, T. Nagata, and T. Nonomura.
    ! Supersonic and hypersonic drag coefficients for a sphere.
    ! AIAA Journal, Vol. 59(8), pp. 3261-3274, (2021).
    ! doi: https://doi.org/10.2514/1.J060153.
    !
    ! NOTE: Re<45 Rarefied formula of Loth et al has been redefined by Balachandar
    ! to avoid singularity as Ma -> 0.
    function QS_Osnes(rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction) result(beta)
        $:GPU_ROUTINE(parallelism='[seq]')
        real(wp), intent(in) :: rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction
        real(wp) :: rmachp, mp, phi, rep, re
        real(wp) :: Knp, fKn, CD1, s, CD2, cd_loth, CM, GM, HM, b1, b2, b3, cd, sgby2, JMt
        real(wp) :: beta

        ! Clamp Mach number, volume fraction, and Reynolds number away from
        ! zero to keep the correlations well-defined.
        rmachp = vmag/cson
        mp = max(rmachp, 0.01_wp)
        phi = max(volume_fraction, 0.0001_wp)
        rep = vmag*dp*rho/mu_fluid
        re = max(rep, 0.1_wp)

        ! Loth's correlation
        if (re <= 45.0_wp) then
            ! Rarefied-dominated regime
            Knp = sqrt(0.5_wp*pi*gamma)*mp/re
            if (Knp > 0.01_wp) then
                fKn = 1.0_wp/(1.0_wp + Knp*(2.514_wp + 0.8_wp*exp(-0.55_wp/Knp)))
            else
                ! Freeze the Cunningham-type correction below Kn = 0.01
                fKn = 1.0_wp/(1.0_wp + Knp*(2.514_wp + 0.8_wp*exp(-0.55_wp/0.01_wp)))

            end if
            CD1 = (24.0_wp/re)*(1.0_wp + 0.15_wp*re**(0.687_wp))*fKn
            s = mp*sqrt(0.5_wp*gamma)
            sgby2 = sqrt(0.5_wp*gamma)
            if (mp <= 1.0_wp) then
                !JMt = 2.26_wp*(mp**4) - 0.1_wp*(mp**3) + 0.14_wp*mp
                JMt = 2.26_wp*(mp**4) + 0.14_wp*mp
            else
                JMt = 1.6_wp*(mp**4) + 0.25_wp*(mp**3) + 0.11_wp*(mp**2) + 0.44_wp*mp
            end if
            !
            ! Reformulated version of Loth et al. to avoid singularity at mp = 0
            !
            CD2 = (1.0_wp + 2.0_wp*(s**2))*exp(-s**2)*mp &
                  /((sgby2**3)*sqrt(pi)) &
                  + (4.0_wp*(s**4) + 4.0_wp*(s**2) - 1.0_wp) &
                  *erf(s)/(2.0_wp*(sgby2**4)) &
                  + (2.0_wp*(mp**3)/(3.0_wp*sgby2))*sqrt(pi)

            CD2 = CD2/(1.0_wp + (((CD2/JMt) - 1.0_wp)*sqrt(re/45.0_wp)))
            ! NOTE(review): Loth's blend is usually CD1/(1+Ma^4) +
            ! Ma^4*CD2/(1+Ma^4); the mp**4 factor on the CD2 term is absent
            ! here. Kept as-is (may be part of the Balachandar reformulation);
            ! verify against the reference.
            cd_loth = CD1/(1.0_wp + (mp**4)) &
                      + CD2/(1.0_wp + (mp**4))
        else
            ! Compression-dominated regime
            ! TLJ: coefficients tweaked to get continuous values
            ! on the two branches at the critical points
            if (mp < 1.5_wp) then
                CM = 1.65_wp + 0.65_wp*tanh(4._wp*mp - 3.4_wp)
            else
                !CM = 2.18_wp - 0.13_wp*tanh(0.9_wp*mp - 2.7_wp)
                CM = 2.18_wp - 0.12913149918318745_wp*tanh(0.9_wp*mp - 2.7_wp)
            end if
            if (mp < 0.8_wp) then
                GM = 166.0_wp*(mp**3) + 3.29_wp*(mp**2) - 10.9_wp*mp + 20._wp
            else
                !GM = 5.0_wp + 40._wp*(mp**(-3))
                GM = 5.0_wp + 47.809331200000017_wp*(mp**(-3))
            end if
            if (mp < 1.0_wp) then
                HM = 0.0239_wp*(mp**3) + 0.212_wp*(mp**2) &
                     - 0.074_wp*mp + 1._wp
            else
                !HM = 0.93_wp + 1.0_wp / (3.5_wp + (mp**5))
                HM = 0.93967777777777772_wp + 1.0_wp/(3.5_wp + (mp**5))
            end if

            cd_loth = (24.0_wp/re)*(1._wp + 0.15_wp*(re**(0.687_wp)))*HM + &
                      0.42_wp*CM/(1._wp + 42500._wp/re**(1.16_wp*CM) + GM/sqrt(re))

        end if

        ! Osnes's volume fraction correction (same b1/b2/b3 terms as in
        ! QS_ModifiedParmar; see the '(1.0 - phi)*2' note there)
        b1 = 5.81_wp*phi/((1.0_wp - phi)**2) + &
             0.48_wp*(phi**(1._wp/3._wp))/((1.0_wp - phi)**3)

        b2 = ((1.0_wp - phi)**2)*(phi**3)* &
             re*(0.95_wp + 0.61_wp*(phi**3)/((1.0_wp - phi)*2._wp))

        b3 = min(sqrt(20.0_wp*mp), 1.0_wp)* &
             (5.65_wp*phi - 22.0_wp*(phi**2) + 23.4_wp*(phi**3))* &
             (1._wp + tanh((mp - (0.65_wp - 0.24_wp*phi))/0.35_wp))

        cd = cd_loth/(1.0_wp - phi) + b3 + (24.0_wp/re)*(1.0_wp - phi)*(b1 + b2)

        beta = 3.0_wp*pi*mu_fluid*dp*(re/24.0_wp)*cd

    end function QS_Osnes

! Subroutine for Quasi-Steady Drag Model of Gidaspow
!
! D. Gidaspow, Multiphase Flow and Fluidization (Academic Press, 1994)
!
! Note: Model is provided per cell volume. We convert that to per
! 
particle using the particle volume fraction and volume + function QS_Gidaspow(rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction) result(beta) + $:GPU_ROUTINE(parallelism='[seq]') + real(wp), intent(in) :: rho, cson, mu_fluid, gamma, vmag, dp, volume_fraction + real(wp) :: cd, phifRep, phif + real(wp) :: phi, rep, re + real(wp) :: beta + + rep = vmag*dp*rho/mu_fluid + phi = max(volume_fraction, 0.0001_wp) + phif = max(1._wp - volume_fraction, 0.0001_wp) + re = max(rep, 0.1_wp) + + phifRep = phif*re + + if (phifRep < 1000.0) then + cd = 24.0/phifRep*(1.0 + 0.15*(phifRep)**0.687) + else + cd = 0.44 + end if + + if (phif < 0.8) then + beta = 150.0*((phi**2)*mu_fluid)/(phif*dp**2) & + + 1.75*(rho*phi*vmag/dp) + else + beta = 0.75*cd*phi*rho*vmag/(dp*phif**1.65) + end if + + beta = beta*(pi*dp**3)/(6.0*phi) + + end function QS_Gidaspow + +end module m_particles_EL_kernels diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp index e6fd6348ed..8901a2c8bf 100644 --- a/src/simulation/m_rhs.fpp +++ b/src/simulation/m_rhs.fpp @@ -30,6 +30,8 @@ module m_rhs use m_bubbles_EL + use m_particles_EL + use m_qbmm !< Moment inversion use m_hypoelastic @@ -1033,21 +1035,46 @@ contains end if if (bubbles_lagrange) then - ! RHS additions for sub-grid bubbles_lagrange - call nvtxStartRange("RHS-EL-BUBBLES-SRC") - call s_compute_bubbles_EL_source( & - q_cons_qp%vf(1:sys_size), & - q_prim_qp%vf(1:sys_size), & - rhs_vf) - call nvtxEndRange ! Compute bubble dynamics if (.not. adap_dt) then call nvtxStartRange("RHS-EL-BUBBLES-DYN") call s_compute_bubble_EL_dynamics( & q_prim_qp%vf(1:sys_size), & + bc_type, & stage) call nvtxEndRange end if + + ! RHS additions for sub-grid bubbles_lagrange + if (lag_params%solver_approach == 2) then + call nvtxStartRange("RHS-EL-BUBBLES-SRC") + call s_compute_bubbles_EL_source( & + q_cons_qp%vf(1:sys_size), & + q_prim_qp%vf(1:sys_size), & + rhs_vf) + call nvtxEndRange + end if + end if + + if (particles_lagrange) then + ! 
Compute particle dynamics, forces, dvdt + call nvtxStartRange("RHS-EL-PARTICLES-DYN") + call s_compute_particle_EL_dynamics( & + q_prim_qp%vf(1:sys_size), & + bc_type, stage, & + qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, & + qR_rsx_vf, qR_rsy_vf, qR_rsz_vf, rhs_vf) + call nvtxEndRange + + ! RHS additions for sub-grid particles_lagrange + if (lag_params%solver_approach == 2) then + call nvtxStartRange("RHS-EL-PARTICLES-SRC") + call s_compute_particles_EL_source( & + q_cons_qp%vf(1:sys_size), & + q_prim_qp%vf(1:sys_size), & + rhs_vf, stage) + call nvtxEndRange + end if end if if (chemistry .and. chem_params%reactions) then @@ -1060,7 +1087,7 @@ contains ! END: Additional pphysics and source terms - if (run_time_info .or. probe_wrt .or. ib .or. bubbles_lagrange) then + if (run_time_info .or. probe_wrt .or. ib .or. bubbles_lagrange .or. particles_lagrange) then if (.not. igr .or. dummy) then $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4) do i = 1, sys_size diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp index a7680f84e9..f8e8526ced 100644 --- a/src/simulation/m_sim_helpers.fpp +++ b/src/simulation/m_sim_helpers.fpp @@ -175,61 +175,56 @@ contains !! @param j x index !! @param k y index !! @param l z index - !! @param icfl_sf cell-centered inviscid cfl number - !! @param vcfl_sf (optional) cell-centered viscous CFL number - !! @param Rc_sf (optional) cell centered Rc - subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf) + !! @param icfl cell-centered inviscid cfl number + !! @param vcfl (optional) cell-centered viscous CFL number + !! 
@param Rc (optional) cell centered Rc + subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl, vcfl, Rc) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in), dimension(num_vels) :: vel real(wp), intent(in) :: c, rho - real(wp), dimension(0:m, 0:n, 0:p), intent(inout) :: icfl_sf - real(wp), dimension(0:m, 0:n, 0:p), intent(inout), optional :: vcfl_sf, Rc_sf + real(wp), intent(inout) :: icfl + real(wp), intent(inout), optional :: vcfl, Rc real(wp), dimension(2), intent(in) :: Re_l integer, intent(in) :: j, k, l real(wp) :: fltr_dtheta ! Inviscid CFL calculation - if (p > 0 .or. n > 0) then - ! 2D/3D - icfl_sf(j, k, l) = dt/f_compute_multidim_cfl_terms(vel, c, j, k, l) - else - ! 1D - icfl_sf(j, k, l) = (dt/dx(j))*(abs(vel(1)) + c) + if (p > 0 .or. n > 0) then ! 2D/3D + icfl = dt/f_compute_multidim_cfl_terms(vel, c, j, k, l) + else ! 1D + icfl = (dt/dx(j))*(abs(vel(1)) + c) end if ! Viscous calculations if (viscous) then - if (p > 0) then + if (p > 0) then !3D #:if not MFC_CASE_OPTIMIZATION or num_dims > 2 - !3D if (grid_geometry == 3) then fltr_dtheta = f_compute_filtered_dtheta(k, l) - vcfl_sf(j, k, l) = maxval(dt/Re_l/rho) & - /min(dx(j), dy(k), fltr_dtheta)**2._wp - Rc_sf(j, k, l) = min(dx(j)*(abs(vel(1)) + c), & - dy(k)*(abs(vel(2)) + c), & - fltr_dtheta*(abs(vel(3)) + c)) & - /maxval(1._wp/Re_l) + vcfl = maxval(dt/Re_l/rho) & + /min(dx(j), dy(k), fltr_dtheta)**2._wp + Rc = min(dx(j)*(abs(vel(1)) + c), & + dy(k)*(abs(vel(2)) + c), & + fltr_dtheta*(abs(vel(3)) + c)) & + /maxval(1._wp/Re_l) else - vcfl_sf(j, k, l) = maxval(dt/Re_l/rho) & - /min(dx(j), dy(k), dz(l))**2._wp - Rc_sf(j, k, l) = min(dx(j)*(abs(vel(1)) + c), & - dy(k)*(abs(vel(2)) + c), & - dz(l)*(abs(vel(3)) + c)) & - /maxval(1._wp/Re_l) + vcfl = maxval(dt/Re_l/rho) & + /min(dx(j), dy(k), dz(l))**2._wp + Rc = min(dx(j)*(abs(vel(1)) + c), & + dy(k)*(abs(vel(2)) + c), & + dz(l)*(abs(vel(3)) + c)) & + /maxval(1._wp/Re_l) end if #:endif - elseif (n > 0) then - !2D - vcfl_sf(j, k, l) = 
maxval(dt/Re_l/rho)/min(dx(j), dy(k))**2._wp - Rc_sf(j, k, l) = min(dx(j)*(abs(vel(1)) + c), & - dy(k)*(abs(vel(2)) + c)) & - /maxval(1._wp/Re_l) - else - !1D - vcfl_sf(j, k, l) = maxval(dt/Re_l/rho)/dx(j)**2._wp - Rc_sf(j, k, l) = dx(j)*(abs(vel(1)) + c)/maxval(1._wp/Re_l) + elseif (n > 0) then !2D + vcfl = maxval(dt/Re_l/rho)/min(dx(j), dy(k))**2._wp + Rc = min(dx(j)*(abs(vel(1)) + c), & + dy(k)*(abs(vel(2)) + c)) & + /maxval(1._wp/Re_l) + else !1D + vcfl = maxval(dt/Re_l/rho)/dx(j)**2._wp + Rc = dx(j)*(abs(vel(1)) + c)/maxval(1._wp/Re_l) end if end if @@ -248,7 +243,7 @@ contains $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(num_vels), intent(in) :: vel real(wp), intent(in) :: c, rho - real(wp), dimension(0:m, 0:n, 0:p), intent(inout) :: max_dt + real(wp), intent(inout) :: max_dt real(wp), dimension(2), intent(in) :: Re_l integer, intent(in) :: j, k, l @@ -256,18 +251,15 @@ contains real(wp) :: fltr_dtheta ! Inviscid CFL calculation - if (p > 0 .or. n > 0) then - ! 2D/3D cases + if (p > 0 .or. n > 0) then ! 2D/3D cases icfl_dt = cfl_target*f_compute_multidim_cfl_terms(vel, c, j, k, l) - else - ! 1D case + else ! 1D cases icfl_dt = cfl_target*(dx(j)/(abs(vel(1)) + c)) end if ! 
Viscous calculations if (viscous) then - if (p > 0) then - !3D + if (p > 0) then !3D if (grid_geometry == 3) then fltr_dtheta = f_compute_filtered_dtheta(k, l) vcfl_dt = cfl_target*(min(dx(j), dy(k), fltr_dtheta)**2._wp) & @@ -276,19 +268,17 @@ contains vcfl_dt = cfl_target*(min(dx(j), dy(k), dz(l))**2._wp) & /maxval(1/(rho*Re_l)) end if - elseif (n > 0) then - !2D + elseif (n > 0) then !2D vcfl_dt = cfl_target*(min(dx(j), dy(k))**2._wp)/maxval((1/Re_l)/rho) - else - !1D + else !1D vcfl_dt = cfl_target*(dx(j)**2._wp)/maxval(1/(rho*Re_l)) end if end if - if (any(Re_size > 0)) then - max_dt(j, k, l) = min(icfl_dt, vcfl_dt) + if (viscous) then + max_dt = min(icfl_dt, vcfl_dt) else - max_dt(j, k, l) = icfl_dt + max_dt = icfl_dt end if end subroutine s_compute_dt_from_cfl diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index 925ad55db0..ae6755db6d 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -56,6 +56,8 @@ module m_start_up use m_bubbles_EL !< Lagrange bubble dynamics routines + use m_particles_EL !< Lagrange particle dynamics routines + use ieee_arithmetic use m_helper_basic !< Functions to compare floating point numbers @@ -140,8 +142,6 @@ contains rdma_mpi, teno_CT, mp_weno, weno_avg, & riemann_solver, low_Mach, wave_speeds, avg_state, & bc_x, bc_y, bc_z, & - x_a, y_a, z_a, x_b, y_b, z_b, & - x_domain, y_domain, z_domain, & hypoelasticity, & ib, num_ibs, patch_ib, & fluid_pp, bub_pp, probe_wrt, prim_vars_wrt, & @@ -175,7 +175,8 @@ contains hyper_cleaning, hyper_cleaning_speed, hyper_cleaning_tau, & alf_factor, num_igr_iters, num_igr_warm_start_iters, & int_comp, ic_eps, ic_beta, nv_uvm_out_of_core, & - nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample, fft_wrt + nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample, fft_wrt, & + particles_lagrange, particle_pp ! Checking that an input file has been provided by the user. If it ! has, then the input file is read in, otherwise, simulation exits. 
@@ -211,11 +212,15 @@ contains if (cfl_adap_dt .or. cfl_const_dt) cfl_dt = .true. - if (any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == -17) .or. & + if (any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == BC_DIRICHLET) .or. & num_bc_patches > 0) then bc_io = .true. end if + if (bc_x%beg == BC_PERIODIC .and. bc_x%end == BC_PERIODIC) periodic_bc(1) = .true. + if (bc_y%beg == BC_PERIODIC .and. bc_y%end == BC_PERIODIC) periodic_bc(2) = .true. + if (bc_z%beg == BC_PERIODIC .and. bc_z%end == BC_PERIODIC) periodic_bc(3) = .true. + else call s_mpi_abort(trim(file_path)//' is missing. Exiting.') end if @@ -987,7 +992,7 @@ contains $:GPU_UPDATE(host='[lag_id, mtn_pos, mtn_posPrev, mtn_vel, intfc_rad, & & intfc_vel, bub_R0, Rmax_stats, Rmin_stats, bub_dphidt, gas_p, & & gas_mv, gas_mg, gas_betaT, gas_betaC]') - do i = 1, nBubs + do i = 1, n_el_bubs_loc if (ieee_is_nan(intfc_rad(i, 1)) .or. intfc_rad(i, 1) <= 0._wp) then call s_mpi_abort("Bubble radius is negative or NaN, please reduce dt.") end if @@ -998,6 +1003,23 @@ contains $:GPU_UPDATE(host='[Rmax_stats,Rmin_stats,gas_p,gas_mv,intfc_vel]') call s_write_restart_lag_bubbles(save_count) !parallel if (lag_params%write_bubbles_stats) call s_write_lag_bubble_stats() + + elseif (particles_lagrange) then + $:GPU_UPDATE(host='[lag_part_id, particle_pos, particle_posPrev, particle_vel, particle_rad, & + & particle_R0, Rmax_stats_part, Rmin_stats_part, & + & particle_mass]') + ! do i = 1, n_el_particles_loc + ! if (ieee_is_nan(particle_rad(i, 1)) .or. particle_rad(i, 1) <= 0._wp) then + ! call s_mpi_abort("Particle radius is negative or NaN, please reduce dt.") + ! end if + ! 
end do + + $:GPU_UPDATE(host='[q_particles(1)%sf]') + call s_write_data_files(q_cons_ts(stor)%vf, q_T_sf, q_prim_vf, save_count, bc_type, q_particles(1)) + $:GPU_UPDATE(host='[Rmax_stats_part,Rmin_stats_part]') + call s_write_restart_lag_particles(save_count) !parallel + if (lag_params%write_bubbles_stats) call s_write_lag_particle_stats() + else call s_write_data_files(q_cons_ts(stor)%vf, q_T_sf, q_prim_vf, save_count, bc_type) end if @@ -1035,6 +1057,11 @@ contains if (bubbles_euler .or. bubbles_lagrange) then call s_initialize_bubbles_model() end if + + if (particles_lagrange) then + call s_initialize_particles_model() + end if + call s_initialize_mpi_common_module() call s_initialize_mpi_proxy_module() call s_initialize_variables_conversion_module() @@ -1121,7 +1148,9 @@ contains end if call s_initialize_derived_variables() - if (bubbles_lagrange) call s_initialize_bubbles_EL_module(q_cons_ts(1)%vf) + + if (bubbles_lagrange) call s_initialize_bubbles_EL_module(q_cons_ts(1)%vf, bc_type) + if (particles_lagrange) call s_initialize_particles_EL_module(q_cons_ts(1)%vf, bc_type) if (hypoelasticity) call s_initialize_hypoelastic_module() if (hyperelasticity) call s_initialize_hyperelastic_module() @@ -1254,6 +1283,10 @@ contains $:GPU_UPDATE(device='[bc_y%vb1,bc_y%vb2,bc_y%vb3,bc_y%ve1,bc_y%ve2,bc_y%ve3]') $:GPU_UPDATE(device='[bc_z%vb1,bc_z%vb2,bc_z%vb3,bc_z%ve1,bc_z%ve2,bc_z%ve3]') + $:GPU_UPDATE(device='[bc_x%beg, bc_x%end]') + $:GPU_UPDATE(device='[bc_y%beg, bc_y%end]') + $:GPU_UPDATE(device='[bc_z%beg, bc_z%end]') + $:GPU_UPDATE(device='[bc_x%grcbc_in,bc_x%grcbc_out,bc_x%grcbc_vel_out]') $:GPU_UPDATE(device='[bc_y%grcbc_in,bc_y%grcbc_out,bc_y%grcbc_vel_out]') $:GPU_UPDATE(device='[bc_z%grcbc_in,bc_z%grcbc_out,bc_z%grcbc_vel_out]') @@ -1307,6 +1340,7 @@ contains call s_finalize_boundary_common_module() if (relax) call s_finalize_relaxation_solver_module() if (bubbles_lagrange) call s_finalize_lagrangian_solver() + if (particles_lagrange) call 
s_finalize_particle_lagrangian_solver() if (viscous .and. (.not. igr)) then call s_finalize_viscous_module() end if diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index 2835055667..62f3821b2d 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -22,6 +22,8 @@ module m_time_steppers use m_bubbles_EL !< Lagrange bubble dynamics routines + use m_particles_EL !< Lagrange particle dynamics routines + use m_ibm use m_hyperelastic @@ -68,8 +70,6 @@ module m_time_steppers real(wp), allocatable, dimension(:, :, :, :, :) :: rhs_mv - real(wp), allocatable, dimension(:, :, :) :: max_dt - integer, private :: num_ts !< !! Number of time stages in the time-stepping scheme @@ -77,7 +77,7 @@ module m_time_steppers real(wp), allocatable, dimension(:, :) :: rk_coef integer, private :: num_probe_ts - $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts1,q_prim_ts2,rhs_mv,rhs_pb,max_dt,rk_coef,stor,bc_type]') + $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts1,q_prim_ts2,rhs_mv,rhs_pb,rk_coef,stor,bc_type]') !> @cond #if defined(__NVCOMPILER_GPU_UNIFIED_MEM) @@ -463,10 +463,6 @@ contains call s_open_run_time_information_file() end if - if (cfl_dt) then - @:ALLOCATE(max_dt(0:m, 0:n, 0:p)) - end if - ! Allocating arrays to store the bc types @:ALLOCATE(bc_type(1:num_dims,1:2)) @@ -560,7 +556,12 @@ contains end if end if - if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=s) + if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(q_prim_vf, bc_type, stage=s) + + if (particles_lagrange) then + call s_update_lagrange_particles_tdv_rk(q_prim_vf, bc_type, stage=s) + end if + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -586,6 +587,7 @@ contains end do end do $:END_GPU_PARALLEL_LOOP() + !Evolve pb and mv for non-polytropic qbmm if (qbmm .and. (.not. 
polytropic)) then $:GPU_PARALLEL_LOOP(collapse=5) @@ -672,7 +674,7 @@ contains q_cons_ts(1)%vf, & q_T_sf, & q_prim_vf, & - idwint) + idwbuff) if (bubbles_euler) then @@ -682,16 +684,15 @@ contains elseif (bubbles_lagrange) then call s_populate_variables_buffers(bc_type, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf) - call s_compute_bubble_EL_dynamics(q_prim_vf, stage) - call s_transfer_data_to_tmp() - call s_smear_voidfraction() + call s_compute_bubble_EL_dynamics(q_prim_vf, bc_type, stage) + if (stage == 3) then if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats() if (lag_params%write_bubbles) then $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]') - call s_write_lag_particles(mytime) + call s_write_lag_bubble_evol(mytime) end if - call s_write_void_evol(mytime) + if (lag_params%write_void_evol) call s_write_void_evol(mytime) end if end if @@ -718,7 +719,7 @@ contains real(wp) :: H !< Cell-avg. enthalpy real(wp), dimension(2) :: Re !< Cell-avg. Reynolds numbers type(vector_field) :: gm_alpha_qp - + real(wp) :: max_dt real(wp) :: dt_local integer :: j, k, l !< Generic loop iterators @@ -730,7 +731,9 @@ contains idwint) end if - $:GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re, rho, vel_sum, pres, gamma, pi_inf, c, H, qv]') + dt_local = huge(1.0_wp) + $:GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re, rho, vel_sum, pres, gamma, pi_inf, c, H, qv]', & + & reduction='[[dt_local]]', reductionOp='[min]') do l = 0, p do k = 0, n do j = 0, m @@ -744,15 +747,12 @@ contains call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, alpha, vel_sum, 0._wp, c, qv) call s_compute_dt_from_cfl(vel, c, max_dt, rho, Re, j, k, l) + dt_local = min(dt_local, max_dt) end do end do end do $:END_GPU_PARALLEL_LOOP() - #:call GPU_PARALLEL(copyout='[dt_local]', copyin='[max_dt]') - dt_local = minval(max_dt) - #:endcall GPU_PARALLEL - if (num_procs == 1) then dt = dt_local else diff --git a/toolchain/mfc/params/definitions.py b/toolchain/mfc/params/definitions.py 
index 7c456b36fc..5236cb27de 100644 --- a/toolchain/mfc/params/definitions.py +++ b/toolchain/mfc/params/definitions.py @@ -639,8 +639,8 @@ def get_value_label(param_name: str, value: int) -> str: # Bubbles "bubble_model": { - "choices": [1, 2, 3], - "value_labels": {1: "Gilmore", 2: "Keller-Miksis", 3: "Rayleigh-Plesset"}, + "choices": [0, 1, 2, 3], + "value_labels": {0: "Particle", 1: "Gilmore", 2: "Keller-Miksis", 3: "Rayleigh-Plesset"}, }, # Output @@ -898,6 +898,10 @@ def _load(): # pylint: disable=too-many-locals,too-many-statements for n in ["polytropic", "bubbles_euler", "polydisperse", "qbmm", "bubbles_lagrange"]: _r(n, LOG, {"bubbles"}) + # --- Particles --- + for n in ["particles_lagrange"]: + _r(n, LOG, {"particles"}) + # --- Viscosity --- _r("viscous", LOG, {"viscosity"}) @@ -992,6 +996,13 @@ def _load(): # pylint: disable=too-many-locals,too-many-statements _r(f"p_{d}", REAL, math=r"\f$\phi_" + d + r"\f$") _r(f"bf_{d}", LOG) + # Interfacial flow inputs + _r("normMag", REAL) + _r("p0_ic", REAL) + _r("g0_ic", REAL) + _r("normFac", REAL) + _r("interface_file", STR) + # ========================================================================== # INDEXED PARAMETERS # ========================================================================== @@ -1072,6 +1083,10 @@ def _load(): # pylint: disable=too-many-locals,too-many-statements ("R_v", r"\f$R_v\f$"), ("R_g", r"\f$R_g\f$")]: _r(f"bub_pp%{a}", REAL, {"bubbles"}, math=sym) + # --- particle_pp (particle properties) --- + for a in ["rho0ref_particle", "cp_particle"]: + _r(f"particle_pp%{a}", REAL, {"particles"}) + # --- patch_ib (10 immersed boundaries) --- for i in range(1, NI + 1): px = f"patch_ib({i})%" @@ -1160,12 +1175,23 @@ def _load(): # pylint: disable=too-many-locals,too-many-statements # --- lag_params (Lagrangian bubbles) --- for a in ["heatTransfer_model", "massTransfer_model", "pressure_corrector", - "write_bubbles", "write_bubbles_stats"]: + "write_bubbles", "write_bubbles_stats", 
"pressure_force", "gravity_force", + "write_void_evol" ]: _r(f"lag_params%{a}", LOG, {"bubbles"}) - for a in ["solver_approach", "cluster_type", "smooth_type", "nBubs_glb"]: + for a in ["solver_approach", "cluster_type", "smooth_type", "nBubs_glb", "drag_model", + "vel_model", "charNz"]: _r(f"lag_params%{a}", INT, {"bubbles"}) for a in ["epsilonb", "valmaxvoid", "charwidth", "c0", "rho0", "T0", "x0", "Thost"]: _r(f"lag_params%{a}", REAL, {"bubbles"}) + _r(f"lag_params%input_path", STR, {"bubbles"}) + + # --- lag_params (Lagrangian particles) --- + for a in ["nParticles_glb", "stokes_drag", "qs_drag_model", "added_mass_model", + "interpolation_order"]: + _r(f"lag_params%{a}", INT, {'particles'}) + + for a in ["collision_force"]: + _r(f"lag_params%{a}", LOG, {'particles'}) # --- chem_params --- for a in ["diffusion", "reactions"]: