From 6a259ea92af78e8001323ea8b5a5546ac41989a9 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Thu, 13 Nov 2025 08:21:47 -0700 Subject: [PATCH 01/21] Fixed a temperature initialization bug For temp_init = 1, 2 or 3, the initial ice temperature profile depends on artm, which therefore needs to be set correctly before initializing the temperature. For artm_input_function = ARTM_INPUT_FUNCTION_XY = 0 (the default), artm is read from the input file at startup with the correct value. But for the other artm_input_function options (XY_GRADZ, XYZ, XY_LAPSE), artm is derived from input fields such as artm_ref, artm_gradz and artm_3d. With this commit, CISM computes artm (if needed) at initialization, before computing the ice temperature profile. Without this computation, the temperature was initialized based on an erroneous artm = 0 and was too warm. I also added an SMB computation at initialization. This is not strictly needed but can be a helpful diagnostic. I moved the downscaling computations to new subroutines called downscale_artm and downscale_smb to avoid repeating code. --- libglissade/glissade_mass_balance.F90 | 567 +++++++++++++++++--------- 1 file changed, 366 insertions(+), 201 deletions(-) diff --git a/libglissade/glissade_mass_balance.F90 b/libglissade/glissade_mass_balance.F90 index 09c1b3ce..ea4c5fac 100644 --- a/libglissade/glissade_mass_balance.F90 +++ b/libglissade/glissade_mass_balance.F90 @@ -68,8 +68,8 @@ subroutine glissade_mass_balance_init(model) ! Initialize some fields related to the surface mass balance - use glimmer_paramets, only: eps11 use glimmer_physcon, only: rhow, rhoi, scyr + use cism_parallel, only: parallel_is_zero ! input/output arguments @@ -77,9 +77,23 @@ subroutine glissade_mass_balance_init(model) ! local variables - real(dp) :: local_maxval, global_maxval character(len=100) :: message - + + ! Initialize artm for the case that we are reading in artm_ref or artm_3d. + ! 
For some temp_init options, this is needed for correct interior temperatures. + if (model%options%artm_input_function /= ARTM_INPUT_FUNCTION_XY) then + call downscale_artm(model) + endif + + ! Initialize smb for the case that we are reading in smb_ref or smb_3d. + ! This is not strictly needed, since the SMB will be recomputed before it is used, + ! but can be a helpful diagnostic. + !TODO - Do this also for the PDD option? + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ .or. & + model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + call downscale_smb(model) + endif + ! Initialize acab, if SMB (with different units) was read in if (model%options%smb_input == SMB_INPUT_MMYR_WE) then ! Convert units from mm/yr w.e. to m/s ice @@ -87,6 +101,7 @@ subroutine glissade_mass_balance_init(model) endif ! Initialize artm_corrected. This is equal to artm, plus any prescribed temperature anomaly. + !TODO - Not sure this is needed model%climate%artm_corrected(:,:) = model%climate%artm(:,:) if (model%options%enable_artm_anomaly) then @@ -96,9 +111,7 @@ subroutine glissade_mass_balance_init(model) ! Note: The artm_anomaly field does not change during the run, ! but it is possible to ramp up the anomaly using artm_anomaly_timescale. - local_maxval = maxval(abs(model%climate%artm_anomaly)) - global_maxval = parallel_reduce_max(local_maxval) - if (global_maxval < eps11) then + if (parallel_is_zero(model%climate%artm_anomaly)) then model%climate%artm_anomaly = model%climate%artm_anomaly_const write(message,*) & 'Setting artm_anomaly = constant value (degC):', model%climate%artm_anomaly_const @@ -109,7 +122,6 @@ subroutine glissade_mass_balance_init(model) endif endif endif - !TODO - Write a short utility function to compute global_maxval of any field. !TODO - Repeat for snow and precip anomalies ! If acab is to be overwritten for some cells, then set overwrite_acab_mask = 1 for these cells. 
@@ -206,52 +218,9 @@ subroutine glissade_prepare_climate_forcing(model) ! Downscaling of artm_ref to artm (at the ice surface) happens below, followed by the SMB calculation. !------------------------------------------------------- - ! Downscale artm to the current surface elevation if needed. - ! The downscaling options are: - ! (0) artm(x,y); no dependence on surface elevation - ! (1) artm(x,y) + d(artm)/dz(x,y) * dz; artm depends on input field at reference elevation, plus vertical correction - ! (2) artm(x,y,z); artm obtained by linear interpolation between values prescribed at adjacent vertical levels - ! (3) artm(x,y) adjusted with a uniform lapse rate - ! For options (1) - (3), the elevation-dependent artm is computed here. - - if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then - - ! compute artm by a lapse-rate correction to the reference value - model%climate%artm(:,:) = model%climate%artm_ref(:,:) + & - (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%artm_gradz(:,:) - - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then - - if (parallel_is_zero(model%climate%artm_3d)) then - write(message,*) 'Error: artm_3d = 0 everywhere with artm_input_function =', model%options%artm_input_function - call write_log(trim(message), GM_FATAL) - endif - - ! Note: With linear_extrapolate_in = T, the values outside the range are obtained by linear extrapolation - ! from the top two or bottom two values. - ! For temperature, which varies roughly linearly with elevation, this is more accurate - ! than simply extending the top and bottom values. - ! This call includes a halo update. - - call glissade_vertical_interpolate(& - ewn, nsn, & - nzatm, model%climate%zatm, & - model%geometry%usrf, & - model%climate%artm_3d, & - model%climate%artm, & - linear_extrapolate_in = .true.) - - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then - - ! 
compute artm by a lapse-rate correction to artm_ref - ! T_lapse is defined as positive for T decreasing with height - - model%climate%artm(:,:) = model%climate%artm_ref(:,:) - & - (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%t_lapse - - endif ! artm_input_function - - call parallel_halo(model%climate%artm, parallel) + if (model%options%artm_input_function /= ARTM_INPUT_FUNCTION_XY) then + call downscale_artm(model) + endif ! Optionally, add an anomaly to the surface air temperature ! Typically, artm_corrected = artm, but sometimes (e.g., for ISMIP6 forcing experiments), @@ -348,155 +317,10 @@ subroutine glissade_prepare_climate_forcing(model) ! which is passed to the main mass balance driver. !------------------------------------------------------------------------- - ! ------------------------------------------------------------------------ - ! Depending on the SMB input options, compute model%climate%acab at the ice surface. - ! The options are: - ! (0) SMB(x,y); no dependence on surface elevation - ! (1) SMB(x,y) + dSMB/dz(x,y) * dz; SMB depends on input field at reference elevation, plus vertical correction - ! (2) SMB(x,y,z); SMB obtained by linear interpolation between values prescribed at adjacent vertical levels - ! (3) SMB obtained from precip and artm using a positive-degree scheme - ! - ! Options (1) and (2) require input fields with SMB units of mm/yr w.e. (SMB_INPUT_MMYR_WE) - ! For these options, the elevation-dependent SMB is computed here. - ! ------------------------------------------------------------------------ - - if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then - - ! downscale SMB to the local surface elevation - model%climate%smb(:,:) = model%climate%smb_ref(:,:) + & - (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%smb_gradz(:,:) - - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then - - ! downscale SMB to the local surface elevation - ! 
Note: With linear_extrapolate_in = F, the values at top and bottom levels are simply extended upward and downward. - ! For SMB, this is safer than linear extrapolation (especially when extrapolating upward). - - if (parallel_is_zero(model%climate%smb_3d)) then - write(message,*) 'Error: smb_3d = 0 everywhere with smb_input_function =', model%options%smb_input_function - call write_log(trim(message), GM_FATAL) - endif - - call glissade_vertical_interpolate(& - ewn, nsn, & - nzatm, model%climate%zatm, & - model%geometry%usrf, & - model%climate%smb_3d, & - model%climate%smb, & - linear_extrapolate_in = .false.) - - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then - - ! Compute SMB using a simple PDD scheme: - ! (1) Partition precip as rain or snow based on the downscaled artm - ! (2) Compute ablation based on artm and a degree factor - ! Assume that artm has already been downscaled, if needed, based on artm_input_function. - - ! Note: This is similar to the SMB calculation for glaciers, but that calculation is done in the glacier module. - ! TODO: Put the glacier values of snow_threshold_min and snow_threshold_max in the climate derived type. - - ! compute snow accumulation (mm/yr w.e.) - where (model%climate%artm > model%climate%snow_threshold_max) - model%climate%snow = 0.0d0 ! all precip falls as rain - elsewhere (model%climate%artm < model%climate%snow_threshold_min) - model%climate%snow = model%climate%precip ! all precip falls as snow - elsewhere (model%climate%artm > model%climate%snow_threshold_min) - model%climate%snow = model%climate%precip * (model%climate%snow_threshold_max - model%climate%artm) & - / (model%climate%snow_threshold_max - model%climate%snow_threshold_min) - endwhere - - ! compute ablation (mm/yr w.e.) - ! Note: degree_factor has units of mm/yr w.e./degC to be consistent with other mass-balance variables. - ! It is like mu_star for glaciers. 
- model%climate%ablation = model%climate%degree_factor * max(model%climate%artm - model%climate%tmlt, 0.0d0) - - ! compute smb (mm/yr w.e.) - model%climate%smb = model%climate%snow - model%climate%ablation - - ! set smb = 0 for open ocean - where (model%geometry%thck == 0.0d0 .and. (model%geometry%topg - model%climate%eus) < 0.0d0) - model%climate%smb = 0.0d0 - endwhere - - endif ! smb_input_function - - ! For the non-default smb_input_function options, make sure that model%climate%smb is nonzero somewhere; else abort. - ! For the default option, do not abort, since idealized tests often have a zero SMB. - - call parallel_halo(model%climate%smb, parallel) - - if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ .or. & - model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ .or. & - model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then - if (parallel_is_zero(model%climate%smb)) then - write(message,*) 'Error: smb = 0 everywhere with smb_input_function =', model%options%smb_input_function - call write_log(trim(message), GM_FATAL) - endif + if (model%options%smb_input_function /= SMB_INPUT_FUNCTION_XY) then + call downscale_smb(model) endif - ! 
optional diagnostics - if (verbose_smb) then - - if (this_rank == rtest) then - write(iulog,*) 'Computing runtime smb with smb_input_function =', model%options%smb_input_function - endif - call point_diag(model%geometry%usrf, 'usrf (m)', itest, jtest, rtest, 7, 7) - - if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY) then - call point_diag(model%climate%smb, 'smb (mm/yr)', itest, jtest, rtest, 7, 7) - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then - call point_diag(model%geometry%usrf - model%climate%usrf_ref, 'usrf - usrf_ref (m)', & - itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb_ref, 'reference smb (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb_gradz, 'smb_gradz (mm/yr per m)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'smb_3d at each level:' - endif - do k = 1, nzatm - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'k =', k - endif - call point_diag(model%climate%smb_3d(k,:,:), 'smb_3d (mm/yr)', itest, jtest, rtest, 7, 7) - enddo - call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then - call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%precip, 'precip (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%snow, 'snow (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%ablation, 'ablation (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb,'smb (mm/yr)', itest, jtest, rtest, 7, 7) - endif ! 
smb_input_function - - if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY) then - call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then - call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%artm_gradz*1000.d0, 'artm_gradz (deg C per km)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'artm_3d at each level:' - endif - do k = 1, nzatm - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'k =', k - endif - call point_diag(model%climate%artm_3d(k,:,:), 'artm_3d (deg C)', itest, jtest, rtest, 7, 7) - enddo - call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then - call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) - endif ! artm_input_function - - endif ! verbose_smb - ! Compute a corrected smb field that includes any anomalies or correction factors. ! initialize @@ -614,13 +438,354 @@ subroutine glissade_prepare_climate_forcing(model) end subroutine glissade_prepare_climate_forcing +!======================================================================= + + subroutine downscale_artm(model) + + use glissade_grid_operators, only: glissade_vertical_interpolate + use cism_parallel, only: parallel_is_zero + + ! input/output arguments + + type(glide_global_type), intent(inout) :: model ! model instance + + ! local variables + + integer :: itest, jtest, rtest ! 
coordinates of diagnostic cell + integer :: i, j, k + integer :: ewn, nsn + integer :: nzatm ! number of atmosphere levels at which smb_3d and artm_3d are provided + + type(parallel_type) :: parallel ! info for parallel communication + + character(len=100) :: message + + ! initialize + + rtest = -999 + itest = 1 + jtest = 1 + if (this_rank == model%numerics%rdiag_local) then + rtest = model%numerics%rdiag_local + itest = model%numerics%idiag_local + jtest = model%numerics%jdiag_local + endif + + ewn = model%general%ewn + nsn = model%general%nsn + nzatm = model%climate%nzatm + parallel = model%parallel + + ! Downscale artm to the current surface elevation if needed. + ! The downscaling options are: + ! (0) artm(x,y); no dependence on surface elevation + ! (1) artm(x,y) + d(artm)/dz(x,y) * dz; artm depends on input field at reference elevation, plus vertical correction + ! (2) artm(x,y,z); artm obtained by linear interpolation between values prescribed at adjacent vertical levels + ! (3) artm(x,y) adjusted with a uniform lapse rate + ! For options (1) - (3), the elevation-dependent artm is computed here. + + ! Make sure the required input fields are present with nonzero values + + if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then + + if (parallel_is_zero(model%climate%artm_ref) .or. & + parallel_is_zero(model%climate%usrf_ref) .or. 
& + parallel_is_zero(model%climate%artm_gradz)) then + write(message,*) & + 'Error: Must have nonzero artm_ref, artm_gradz and usrf_ref with artm_input_function =', & + model%options%artm_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then + + if (parallel_is_zero(model%climate%artm_3d)) then + write(message,*) & + 'Error: Must have nonzero artm_3d with artm_input_function =', & + model%options%artm_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then + + if (parallel_is_zero(model%climate%artm_ref) .or. & + parallel_is_zero(model%climate%usrf_ref) .or. & + model%climate%t_lapse <= 0.0d0) then + write(message,*) & + 'Error: Must have t_lapse > 0 and nonzero artm_ref, usrf_ref with artm_input_function =', & + model%options%artm_input_function + call write_log(trim(message), GM_FATAL) + endif + + endif + + ! Do the downscaling + + if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then + + ! compute artm by a lapse-rate correction to the reference value + model%climate%artm(:,:) = model%climate%artm_ref(:,:) + & + (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%artm_gradz(:,:) + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then + + ! Note: With linear_extrapolate_in = T, the values outside the range are obtained + ! by linear extrapolation from the top two or bottom two values. + ! For temperature, which varies roughly linearly with elevation, this is more accurate + ! than simply extending the top and bottom values. + + call glissade_vertical_interpolate(& + ewn, nsn, & + nzatm, model%climate%zatm, & + model%geometry%usrf, & + model%climate%artm_3d, & + model%climate%artm, & + linear_extrapolate_in = .true.) + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then + + ! 
compute artm by a lapse-rate correction to artm_ref + ! T_lapse is defined as positive for T decreasing with height + + model%climate%artm(:,:) = model%climate%artm_ref(:,:) - & + (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%t_lapse + + endif ! artm_input_function + + call parallel_halo(model%climate%artm, parallel) + + ! optional diagnostics + + if (verbose_smb) then + if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY) then + call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then + call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%artm_gradz*1000.d0, 'artm_gradz (deg C per km)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'artm_3d at each level:' + endif + do k = 1, nzatm + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'k =', k + endif + call point_diag(model%climate%artm_3d(k,:,:), 'artm_3d (deg C)', itest, jtest, rtest, 7, 7) + enddo + call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then + call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) + endif ! artm_input_function + endif + + end subroutine downscale_artm + +!======================================================================= + + subroutine downscale_smb(model) + + use glissade_grid_operators, only: glissade_vertical_interpolate + use cism_parallel, only: parallel_is_zero + + ! 
input/output arguments + + type(glide_global_type), intent(inout) :: model ! model instance + + ! local variables + + integer :: itest, jtest, rtest ! coordinates of diagnostic cell + integer :: i, j, k + integer :: ewn, nsn + integer :: nzatm ! number of atmosphere levels at which smb_3d and artm_3d are provided + + type(parallel_type) :: parallel ! info for parallel communication + + character(len=100) :: message + + ! initialize + + rtest = -999 + itest = 1 + jtest = 1 + if (this_rank == model%numerics%rdiag_local) then + rtest = model%numerics%rdiag_local + itest = model%numerics%idiag_local + jtest = model%numerics%jdiag_local + endif + + ewn = model%general%ewn + nsn = model%general%nsn + nzatm = model%climate%nzatm + parallel = model%parallel + + ! ------------------------------------------------------------------------ + ! Depending on the SMB input options, compute model%climate%acab at the ice surface. + ! The options are: + ! (0) SMB(x,y); no dependence on surface elevation + ! (1) SMB(x,y) + dSMB/dz(x,y) * dz; SMB depends on input field at reference elevation, plus vertical correction + ! (2) SMB(x,y,z); SMB obtained by linear interpolation between values prescribed at adjacent vertical levels + ! (3) SMB obtained from precip and artm using a positive-degree scheme + ! + ! Options (1) and (2) require input fields with SMB units of mm/yr w.e. (SMB_INPUT_MMYR_WE) + ! For these options, the elevation-dependent SMB is computed here. + ! ------------------------------------------------------------------------ + + ! Make sure the required input fields are present with nonzero values + + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then + + if (parallel_is_zero(model%climate%smb_ref) .or. & + parallel_is_zero(model%climate%usrf_ref) .or. 
& + parallel_is_zero(model%climate%smb_gradz)) then + write(message,*) & + 'Error: Must have nonzero smb_ref, smb_gradz and usrf_ref with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + + if (parallel_is_zero(model%climate%smb_3d)) then + write(message,*) & + 'Error: Must have nonzero smb_3d with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then + + if (parallel_is_zero(model%climate%artm) .or. & + parallel_is_zero(model%climate%precip)) then + write(message,*) & + 'Error: Must have nonzero artm and precip with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + + endif + + ! Do the downscaling + + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then + + ! downscale SMB to the local surface elevation + model%climate%smb(:,:) = model%climate%smb_ref(:,:) + & + (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%smb_gradz(:,:) + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + + ! downscale SMB to the local surface elevation + ! Note: With linear_extrapolate_in = F, the values at top and bottom levels are simply extended upward and downward. + ! For SMB, this is safer than linear extrapolation (especially when extrapolating upward). + + call glissade_vertical_interpolate(& + ewn, nsn, & + nzatm, model%climate%zatm, & + model%geometry%usrf, & + model%climate%smb_3d, & + model%climate%smb, & + linear_extrapolate_in = .false.) + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then + + ! Compute SMB using a simple PDD scheme: + ! (1) Partition precip as rain or snow based on the downscaled artm + ! 
(2) Compute ablation based on artm and a degree factor + ! Assume that artm has already been downscaled, if needed, based on artm_input_function. + + ! Note: This is similar to the SMB calculation for glaciers, but that calculation is done in the glacier module. + ! TODO: Put the glacier values of snow_threshold_min and snow_threshold_max in the climate derived type. + + ! compute snow accumulation (mm/yr w.e.) + where (model%climate%artm > model%climate%snow_threshold_max) + model%climate%snow = 0.0d0 ! all precip falls as rain + elsewhere (model%climate%artm < model%climate%snow_threshold_min) + model%climate%snow = model%climate%precip ! all precip falls as snow + elsewhere (model%climate%artm > model%climate%snow_threshold_min) + model%climate%snow = model%climate%precip * (model%climate%snow_threshold_max - model%climate%artm) & + / (model%climate%snow_threshold_max - model%climate%snow_threshold_min) + endwhere + + ! compute ablation (mm/yr w.e.) + ! Note: degree_factor has units of mm/yr w.e./degC to be consistent with other mass-balance variables. + ! It is like mu_star for glaciers. + model%climate%ablation = model%climate%degree_factor * max(model%climate%artm - model%climate%tmlt, 0.0d0) + + ! compute smb (mm/yr w.e.) + model%climate%smb = model%climate%snow - model%climate%ablation + + ! set smb = 0 for open ocean + where (model%geometry%thck == 0.0d0 .and. (model%geometry%topg - model%climate%eus) < 0.0d0) + model%climate%smb = 0.0d0 + endwhere + + endif ! smb_input_function + + call parallel_halo(model%climate%smb, parallel) + + ! For the non-default smb_input_function options, make sure the SMB is nonzero somewhere. + ! For the default option, do not abort, since idealized tests often have a zero SMB. 
+ + if (model%options%smb_input_function /= SMB_INPUT_FUNCTION_XY) then + if (parallel_is_zero(model%climate%smb)) then + write(message,*) 'Error: smb = 0 everywhere with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + endif + + ! optional diagnostics + + if (verbose_smb) then + + if (this_rank == rtest) then + write(iulog,*) 'Computing runtime smb with smb_input_function =', model%options%smb_input_function + endif + call point_diag(model%geometry%usrf, 'usrf (m)', itest, jtest, rtest, 7, 7) + + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY) then + call point_diag(model%climate%smb, 'smb (mm/yr)', itest, jtest, rtest, 7, 7) + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then + call point_diag(model%geometry%usrf - model%climate%usrf_ref, 'usrf - usrf_ref (m)', & + itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb_ref, 'reference smb (mm/yr)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb_gradz, 'smb_gradz (mm/yr per m)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'smb_3d at each level:' + endif + do k = 1, nzatm + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'k =', k + endif + call point_diag(model%climate%smb_3d(k,:,:), 'smb_3d (mm/yr)', itest, jtest, rtest, 7, 7) + enddo + call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then + call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%precip, 'precip (mm/yr)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%snow, 'snow (mm/yr)', itest, jtest, rtest, 7, 7) + call 
point_diag(model%climate%ablation, 'ablation (mm/yr)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb,'smb (mm/yr)', itest, jtest, rtest, 7, 7) + endif ! smb_input_function + + endif ! verbose_smb + + end subroutine downscale_smb + !======================================================================= subroutine glissade_apply_smb(model) ! Apply the SMB at the upper and lower surfaces, and recompute tracer values. - use glimmer_paramets, only: eps11 use glimmer_physcon, only: rhow, rhoi, scyr use glissade_masks, only: glissade_get_masks use glissade_calving, only: verbose_calving From 6db094c75ef2119c50a0efd7fe3e44ee151dc33a Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Sun, 30 Nov 2025 10:34:18 -0700 Subject: [PATCH 02/21] Work toward reproducible global sums Until now, CISM results have varied with the number of processors. This is because CISM computes many global sums, not just for diagnostics but also for prognostic calculations (e.g., in the PCG velocity solver). Two runs with different processor counts will have roundoff-level differences in the results of global sums, which propagate to the rest of the code. We want CISM to be able to compute global sums in a reproducible way, independent of processor count, like other CESM components. Reproducible sums are not yet operational, but this commit adds most of the infrastructure: * There is a new config option called 'reproducible_sums', which is false by default. To activate reproducible sums, the user simply sets this to true. * The parallel derived type now includes a logical variable called reprosum, whose value is set at initialization based on the config option 'reproducible_sums'. * There are two new files in libglimmer: cism_reprosum_mod.F90 and cism_infnan_mod.F90. These are adapted, with minor changes, from the files shr_reprosum_mod and shr_infnan_mod.F90 in the CESM shared code written by Pat Worley. 
* Many global sums now pass through an interface that supports reproducible sums: parallel_global_sum, parallel_global_sum_stagger, or parallel_global_sum_patch. These interfaces contain functions with reprosum logic. If parallel%reprosum is true, these functions will call subroutine cism_reprosum_calcs, which converts sums of floating-point variables to sums involving integers only, and in this way computes global sums independently of processor count. If parallel%reprosum is false, these subroutines do a local sum and then call parallel_reduce_sum, as before. Note the following changes: * Replaced the interface parallel_global_sum_staggered (consisting of four subroutines) with parallel_global_sum_stagger (consisting of four functions). The new functions yield the same results as the old subroutines if reprosum = F. * Replaced many calls to parallel_reduce_sum with calls to function parallel_global_sum, which can call either cism_reprosum_calcs or parallel_reduce_sum. The remaining direct calls to parallel_reduce_sum are mostly sums over integers, which are independent of processor count. * Added an interface called parallel_global_sum_patch. The parallel_global_sum_patch functions compute the global sum of a given field on an arbitrary number of 'patches', each of which has a patch ID denoted by a positive integer. These functions are useful for computing sums over (1) glaciers and (2) ocean basins, where each glacier or basin is considered a patch. * Added a subroutine parallel_reprosum_calc in the cism_parallel module; still to be tested. * Removed subroutines get_area_vol and calc_iareaf_iareag in glide_mask.F90 Collectively, these commits are answer-changing at the roundoff level for both prognostic and diagnostic variables. A future commit will add the calls to cism_reprosum_calc and verify that the sums are reproducible. 
--- libglide/glide.F90 | 25 +- libglide/glide_diagnostics.F90 | 190 +-- libglide/glide_mask.F90 | 111 +- libglide/glide_setup.F90 | 5 + libglide/glide_types.F90 | 4 + libglimmer/cism_infnan_mod.F90 | 423 +++++ libglimmer/cism_reprosum_mod.F90 | 1789 ++++++++++++++++++++++ libglimmer/glimmer_global.F90 | 14 +- libglimmer/ncdf_template.F90.in | 64 +- libglimmer/parallel_mpi.F90 | 666 +++++--- libglissade/glissade.F90 | 70 +- libglissade/glissade_basal_water.F90 | 3 +- libglissade/glissade_bmlt_float.F90 | 3 + libglissade/glissade_calving.F90 | 22 +- libglissade/glissade_glacier.F90 | 456 +++--- libglissade/glissade_inversion.F90 | 25 + libglissade/glissade_mass_balance.F90 | 26 +- libglissade/glissade_transport.F90 | 44 +- libglissade/glissade_utils.F90 | 56 +- libglissade/glissade_velo_higher.F90 | 11 +- libglissade/glissade_velo_higher_pcg.F90 | 152 +- 21 files changed, 3176 insertions(+), 983 deletions(-) create mode 100644 libglimmer/cism_infnan_mod.F90 create mode 100644 libglimmer/cism_reprosum_mod.F90 diff --git a/libglide/glide.F90 b/libglide/glide.F90 index 69f7f5e3..aca02953 100644 --- a/libglide/glide.F90 +++ b/libglide/glide.F90 @@ -384,8 +384,7 @@ subroutine glide_initialise(model) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) ! calculate lower and upper ice surface call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus,model%geometry%lsrf) @@ -503,16 +502,10 @@ subroutine glide_init_state_diagnostic(model, evolve_ice) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) endif ! calving_init - ! 
Compute total areas of grounded and floating ice - call calc_iareaf_iareag(model%numerics%dew, model%numerics%dns, & - model%geometry%thkmask, & - model%geometry%iareaf, model%geometry%iareag) - ! ------------------------------------------------------------------------ ! ***Part 2: Calculate geometry related fields ! ------------------------------------------------------------------------ @@ -874,13 +867,10 @@ subroutine glide_tstep_p2(model) call glide_prof_start(model,model%glide_prof%ice_mask2) - !TODO - Calculate area and vol separately from glide_set_mask? - call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) call glide_prof_stop(model,model%glide_prof%ice_mask2) @@ -909,14 +899,7 @@ subroutine glide_tstep_p2(model) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) - endif ! oldglide = F - - if (.not. oldglide) then ! calculate area of floating and grounded ice - call calc_iareaf_iareag(model%numerics%dew, model%numerics%dns, & - model%geometry%thkmask, & - model%geometry%iareaf, model%geometry%iareag) + model%climate%eus, model%geometry%thkmask) endif ! oldglide = F ! 
------------------------------------------------------------------------ diff --git a/libglide/glide_diagnostics.F90 b/libglide/glide_diagnostics.F90 index e49ebb2f..cd72557c 100644 --- a/libglide/glide_diagnostics.F90 +++ b/libglide/glide_diagnostics.F90 @@ -39,7 +39,7 @@ module glide_diagnostics use cism_parallel, only: this_rank, main_task, lhalo, uhalo, nhalo, & parallel_type, broadcast, & parallel_localindex, parallel_globalindex, & - parallel_reduce_sum, parallel_reduce_max, & + parallel_global_sum, parallel_reduce_max, & parallel_reduce_maxloc, parallel_reduce_minloc, & parallel_is_zero @@ -231,6 +231,7 @@ subroutine glide_write_diag (model, time) integer, dimension(model%general%ewn,model%general%nsn) :: & ice_mask, & ! = 1 where ice is present with thck > minthck, else = 0 floating_mask, & ! = 1 where ice is present and floating, else = 0 + grounded_mask, & ! = 1 where ice is present and grounded, else = 0 glacier_ice_mask ! = 1 where glacier ice is present, initially and/or currently real(dp), dimension(model%general%upn) :: & @@ -273,8 +274,12 @@ subroutine glide_write_diag (model, time) velo_ew_ubound, velo_ns_ubound ! upper bounds for velocity variables real(dp), dimension(model%general%ewn, model%general%nsn) :: & - velo_sfc, & ! surface ice speed - thck_obs ! observed ice thickness, derived from usrf_obs and topg + mass_above_flotation,& ! ice mass above flotation (kg) + velo_sfc, & ! surface ice speed (m/s) + thck_obs ! observed ice thickness (m), derived from usrf_obs and topg + + real(dp), dimension(:,:,:), allocatable :: & + local_energy ! 
internal energy (J) per layer in a column character(len=100) :: message @@ -337,12 +342,15 @@ subroutine glide_write_diag (model, time) ice_mask(i,j) = 1 if (model%geometry%topg(i,j) - model%climate%eus < (-rhoi/rhoo)*model%geometry%thck(i,j)) then floating_mask(i,j) = 1 + grounded_mask(i,j) = 0 else floating_mask(i,j) = 0 + grounded_mask(i,j) = 1 endif else ice_mask(i,j) = 0 floating_mask(i,j) = 0 + grounded_mask(i,j) = 0 endif enddo enddo @@ -358,109 +366,75 @@ subroutine glide_write_diag (model, time) call write_log(' ') ! total ice area (m^2) - - tot_area = 0.d0 - tot_area_ground = 0.d0 - tot_area_float = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - if (ice_mask(i,j) == 1) then - tot_area = tot_area + cell_area(i,j) - if (floating_mask(i,j) == 1) then - tot_area_float = tot_area_float + cell_area(i,j) - else - tot_area_ground = tot_area_ground + cell_area(i,j) - endif - endif - enddo - enddo - - tot_area = parallel_reduce_sum(tot_area) - tot_area_ground = parallel_reduce_sum(tot_area_ground) - tot_area_float = parallel_reduce_sum(tot_area_float) + tot_area = parallel_global_sum(cell_area, parallel, ice_mask) + tot_area_float = parallel_global_sum(cell_area, parallel, floating_mask) + tot_area_ground = parallel_global_sum(cell_area, parallel, grounded_mask) ! total ice volume (m^3) - - tot_volume = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - if (ice_mask(i,j) == 1) then - tot_volume = tot_volume + model%geometry%thck(i,j) * cell_area(i,j) - endif - enddo - enddo - tot_volume = parallel_reduce_sum(tot_volume) + tot_volume = parallel_global_sum(model%geometry%thck*cell_area, parallel, ice_mask) ! total ice mass (kg) tot_mass = tot_volume * rhoi ! total ice mass above flotation (kg) - tot_mass_above_flotation = 0.d0 - - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - if (ice_mask(i,j) == 1) then - if (floating_mask(i,j) == 0) then ! grounded ice - if (model%geometry%topg(i,j) - model%climate%eus < 0.0d0) then ! 
grounded below sea level - thck_floating = (-rhoo/rhoi) * (model%geometry%topg(i,j) - model%climate%eus) ! exactly floating - thck_above_flotation = model%geometry%thck(i,j) - thck_floating - tot_mass_above_flotation = tot_mass_above_flotation & - + thck_above_flotation * cell_area(i,j) - else ! grounded above sea level - tot_mass_above_flotation = tot_mass_above_flotation & - + model%geometry%thck(i,j) * cell_area(i,j) - endif + mass_above_flotation = 0.0d0 + do j = 1, nsn + do i = 1, ewn + if (ice_mask(i,j) == 1 .and. floating_mask(i,j) == 0) then + if (model%geometry%topg(i,j) - model%climate%eus < 0.0d0) then ! grounded below sea level + thck_floating = (-rhoo/rhoi) * (model%geometry%topg(i,j) - model%climate%eus) ! exactly floating + mass_above_flotation(i,j) = (model%geometry%thck(i,j) - thck_floating) * cell_area(i,j) + else ! grounded above sea level + mass_above_flotation(i,j) = model%geometry%thck(i,j) * cell_area(i,j) endif endif enddo enddo - - tot_mass_above_flotation = tot_mass_above_flotation * rhoi ! convert from m^3 to kg - tot_mass_above_flotation = parallel_reduce_sum(tot_mass_above_flotation) + mass_above_flotation = mass_above_flotation * rhoi + tot_mass_above_flotation = parallel_global_sum(mass_above_flotation, parallel) ! total ice energy relative to T = 0 deg C (J) - - tot_energy = 0.d0 - if (size(model%temper%temp,1) == upn+1) then ! temps are staggered in vertical - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo + local_energy = 0.0d0 + if (size(model%temper%temp,1) == upn+1) then ! temps are staggered in vertical, located at layer centers + allocate(local_energy(model%general%upn-1, model%general%ewn, model%general%nsn)) + do j = 1, nsn + do i = 1, ewn if (ice_mask(i,j) == 1) then - do k = 1, upn-1 - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & - *(model%numerics%sigma(k+1) - model%numerics%sigma(k)) + do k = 1, upn-1 ! 
(upn-1) layers + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * (model%numerics%sigma(k+1) - model%numerics%sigma(k)) enddo endif enddo enddo - - else ! temps are unstaggered in vertical - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo + else ! temps are unstaggered in vertical, located at layer interfaces + allocate(local_energy(model%general%upn, model%general%ewn, model%general%nsn)) + do j = 1, nsn + do i = 1, ewn if (ice_mask(i,j) == 1) then - ! upper half-layer, T = upper sfc temp - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(1,i,j) * cell_area(i,j) & - * 0.5d0 * model%numerics%sigma(2) - do k = 2, upn-1 - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & - * 0.5d0*(model%numerics%sigma(k+1) - model%numerics%sigma(k-1)) + k = 1 ! top interface; assign this temperature to the top half layer + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * 0.5d0 * model%numerics%sigma(k+1) + do k = 2, upn-1 ! interior layers + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * 0.5d0*(model%numerics%sigma(k+1) - model%numerics%sigma(k-1)) enddo - ! lower half-layer, T = lower sfc temp - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(upn,i,j) * cell_area(i,j) & - * 0.5d0 * (1.0d0 - model%numerics%sigma(upn-1)) + k = upn ! bottom interface; assign this temperature to the bottom half layer + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * 0.5d0 * (1.0d0 - model%numerics%sigma(k-1)) endif enddo enddo endif - - tot_energy = tot_energy * rhoi * shci - tot_energy = parallel_reduce_sum(tot_energy) + local_energy = local_energy * rhoi * shci + tot_energy = parallel_global_sum(local_energy, parallel, ice_mask) + deallocate(local_energy) ! 
mean thickness - if (tot_area > eps) then mean_thck = tot_volume/tot_area else @@ -468,7 +442,6 @@ subroutine glide_write_diag (model, time) endif ! mean temperature - if (tot_volume > eps) then mean_temp = tot_energy/ (rhoi*shci*tot_volume) else @@ -490,19 +463,11 @@ subroutine glide_write_diag (model, time) if (model%options%whichdycore == DYCORE_GLISSADE) then ! total surface accumulation/ablation rate (m^3/yr ice) - - tot_acab = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_acab = tot_acab + model%climate%acab_applied(i,j) * cell_area(i,j) - enddo - enddo - - tot_acab = tot_acab * scyr ! convert to m^3/yr - tot_acab = parallel_reduce_sum(tot_acab) + tot_acab = parallel_global_sum(model%climate%acab_applied*cell_area, parallel) + tot_acab = tot_acab * scyr ! convert from m^3/s to m^3/yr ! total surface mass balance flux (kg/s) - tot_smb_flux = tot_acab * rhoi / scyr ! convert m^3/yr to kg/s + tot_smb_flux = tot_acab * rhoi / scyr ! convert m^3/yr to kg/s ! mean accumulation/ablation rate (m/yr) ! Note: This will be only approximate if some ice has melted completely during the time step @@ -513,15 +478,8 @@ subroutine glide_write_diag (model, time) endif ! total basal melting rate (positive for ice loss) - tot_bmlt = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_bmlt = tot_bmlt + model%basal_melt%bmlt_applied(i,j) * cell_area(i,j) - enddo - enddo - - tot_bmlt = tot_bmlt * scyr ! convert to m^3/yr - tot_bmlt = parallel_reduce_sum(tot_bmlt) + tot_bmlt = parallel_global_sum(model%basal_melt%bmlt_applied*cell_area, parallel) + tot_bmlt = tot_bmlt * scyr ! convert from m^3/s to m^3/yr ! total basal mass balance (kg/s, positive for freeze-on, negative for melt) tot_bmb_flux = -tot_bmlt * rhoi / scyr ! convert m^3/yr to kg/s @@ -536,14 +494,7 @@ subroutine glide_write_diag (model, time) ! total calving rate (m^3/yr ice) ! 
Note: calving%calving_rate has units of m/yr ice - - tot_calving = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_calving = tot_calving + model%calving%calving_rate(i,j) * cell_area(i,j) ! m^3/yr ice - enddo - enddo - tot_calving = parallel_reduce_sum(tot_calving) + tot_calving = parallel_global_sum(model%calving%calving_rate*cell_area, parallel) ! total calving mass balance flux (kg/s, negative for ice loss by calving) tot_calving_flux = -tot_calving * rhoi / scyr ! convert m^3/yr to kg/s @@ -559,15 +510,10 @@ subroutine glide_write_diag (model, time) ! total grounding line mass balance flux (< 0 by definition) ! Note: At this point, gl_flux_east and gl_flux_north are already dimensionalized in kg/m/s, ! so tot_gl_flux will have units of kg/s - - tot_gl_flux = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_gl_flux = tot_gl_flux - abs(model%geometry%gl_flux_east(i,j)) * model%numerics%dns & - - abs(model%geometry%gl_flux_north(i,j)) * model%numerics%dew - enddo - enddo - tot_gl_flux = parallel_reduce_sum(tot_gl_flux) + tot_gl_flux = parallel_global_sum(abs(model%geometry%gl_flux_east) * model%numerics%dns & + + abs(model%geometry%gl_flux_north) * model%numerics%dew, & + parallel) + tot_gl_flux = -tot_gl_flux ! negative by definition ! total rate of change of ice mass (kg/s) ! Note: dthck_dt has units of m/s @@ -575,14 +521,8 @@ subroutine glide_write_diag (model, time) ! in successive time steps, instead of summing over dthck_dt. ! Note that dthck_dt does not account for global outflow fluxes (i.e., removal of ice ! near the global boundary in halo updates). - tot_dmass_dt = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_dmass_dt = tot_dmass_dt + model%geometry%dthck_dt(i,j) * cell_area(i,j) - enddo - enddo + tot_dmass_dt = parallel_global_sum(model%geometry%dthck_dt*cell_area, parallel) tot_dmass_dt = tot_dmass_dt * rhoi ! convert to kg/s - tot_dmass_dt = parallel_reduce_sum(tot_dmass_dt) ! 
mass conservation error ! Note: For most runs, this should be close to zero. diff --git a/libglide/glide_mask.F90 b/libglide/glide_mask.F90 index c08dfd17..1e4ef585 100644 --- a/libglide/glide_mask.F90 +++ b/libglide/glide_mask.F90 @@ -41,14 +41,11 @@ module glide_mask contains -!TODO - Remove iarea and ivol calculations? They are now computed in glide_write_diag.. - !TODO - Write a new subroutine (in addition to glide_set_mask) to compute mask for staggered grid? ! This subroutine is now called from glissade_velo_driver with stagthck and stagtopg ! as input arguments. - subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol, & - exec_serial, parallel) + subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask) use glide_types use glimmer_physcon, only : rhoi, rhoo @@ -60,14 +57,9 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol integer, intent(in) :: ewn, nsn ! Grid size real(dp), intent(in) :: eus ! Sea level integer, dimension(:,:), intent(inout) :: mask ! Output mask - real(dp), intent(inout), optional :: ivol, iarea ! Area and volume of ice - - logical, optional :: exec_serial !JEFF If executing in serial in MPI program. - type(parallel_type), optional :: parallel ! info for parallel communication ! local variables integer ew,ns - logical :: exec_serial_flag !Note - This array may not be needed, at least in parallel. @@ -76,21 +68,8 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol integer, dimension(0:ewn+1,0:nsn+1) :: maskWithBounds; - !TODO - What is the exec_serial option? Is it still needed? - - !JEFF Handle exec_serial optional parameter - if ( present(exec_serial) ) then - exec_serial_flag = exec_serial - else - ! Default to off - exec_serial_flag = .FALSE. - endif - mask = 0 - if (present(iarea)) iarea = 0.d0 - if (present(ivol)) ivol = 0.d0 - !Note - This mask is confusing. Wondering if we should replace it by a series of logical masks. ! 
Would need the following: @@ -138,10 +117,6 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol mask = ior(mask, GLIDE_MASK_LAND) ! GLIDE_MASK_LAND = 4 endwhere - if (present(iarea) .and. present(ivol)) then - call get_area_vol(thck, numerics%dew, numerics%dns, numerics%thklim, iarea, ivol, exec_serial_flag) - end if - !TODO - Replace the following with a halo call for 'mask', with appropriate global BC? maskWithBounds = 0 @@ -185,14 +160,9 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol end do end do - !JEFF Don't call halo update if running in serial mode - !WHL - I think the halo update will now work in serial mode. - if (.NOT. exec_serial_flag .and. present(parallel)) then - call parallel_halo(mask, parallel) - endif - end subroutine glide_set_mask + subroutine augment_kinbc_mask(mask, kinbcmask) ! Augments the Glide mask with the location of kinematic (dirichlet) boundary @@ -217,83 +187,8 @@ subroutine augment_kinbc_mask(mask, kinbcmask) endwhere end subroutine augment_kinbc_mask - subroutine get_area_vol(thck, dew, dns, thklim, iarea, ivol, exec_serial) - - implicit none - real(dp), dimension(:,:) :: thck - real(dp) :: dew, dns, thklim - real(dp) :: iarea, ivol, sum(2) - logical :: exec_serial - - integer :: i,j - - do i = 1+lhalo, size(thck,1)-uhalo - do j = 1+lhalo, size(thck,2)-uhalo - if (thck(i,j) > thklim ) then - iarea = iarea + 1 - ivol = ivol + thck(i,j) - end if - end do - end do - - iarea = iarea * dew * dns - ivol = ivol * dew * dns - - if (.NOT. exec_serial) then - sum(1) = iarea - sum(2) = ivol - sum = parallel_reduce_sum(sum) - iarea = sum(1) - ivol = sum(2) - endif - - end subroutine get_area_vol - - subroutine calc_iareaf_iareag(dew, dns, mask, iareaf, iareag, exec_serial) - - implicit none - real(dp), intent(in) :: dew, dns - real(dp), intent(out) :: iareaf, iareag - integer, dimension(:,:), intent(in) :: mask - logical, optional :: exec_serial ! 
If executing in serial in MPI program. - - integer :: i,j - logical :: exec_serial_flag - real(dp) :: sum(2) - - !TODO - exec_serial option may not be needed - if ( present(exec_serial) ) then - exec_serial_flag = exec_serial - else - ! Default to off - exec_serial_flag = .FALSE. - endif - - iareaf = 0.d0 - iareag = 0.d0 - - !loop over locally owned scalars - do j = 1+lhalo, size(mask,2)-uhalo - do i = 1+lhalo, size(mask,1)-uhalo - if (GLIDE_IS_FLOAT(mask(i,j))) then - iareaf = iareaf + dew * dns - else if(GLIDE_IS_GROUND_OR_GNDLINE(mask(i,j))) then - iareag = iareag + dew * dns - end if - end do - end do - - if (.NOT. exec_serial_flag) then - sum(1) = iareaf - sum(2) = iareag - sum = parallel_reduce_sum(sum) - iareaf = sum(1) - iareag = sum(2) - endif - - end subroutine calc_iareaf_iareag - subroutine glide_marine_margin_normal(thck, mask, marine_bc_normal, & + subroutine glide_marine_margin_normal(thck, mask, marine_bc_normal, & exec_serial, parallel) !TODO - Remove subroutine glide_marine_margin_normal? Old PBJ routine. diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90 index 1f5fd0a2..3b2aa1e4 100644 --- a/libglide/glide_setup.F90 +++ b/libglide/glide_setup.F90 @@ -907,6 +907,7 @@ subroutine handle_ho_options(section, model) call GetValue(section, 'linear_solve_ncheck', model%options%linear_solve_ncheck) call GetValue(section, 'linear_maxiters', model%options%linear_maxiters) call GetValue(section, 'linear_tolerance', model%options%linear_tolerance) + call GetValue(section, 'reproducible_sums', model%options%reproducible_sums) end subroutine handle_ho_options @@ -2183,6 +2184,10 @@ subroutine print_options(model) write(message,*) 'linear_tolerance : ',model%options%linear_tolerance call write_log(message) + if (model%options%reproducible_sums) then + call write_log('Global sums will be reproducible') + endif + end if ! DYCORE_GLISSADE if (model%options%whichdycore == DYCORE_GLISSADE .and. 
& diff --git a/libglide/glide_types.F90 b/libglide/glide_types.F90 index 0be4b0fe..2d001f06 100644 --- a/libglide/glide_types.F90 +++ b/libglide/glide_types.F90 @@ -1157,6 +1157,10 @@ module glide_types real(dp) :: linear_tolerance = 1.0d-08 !> error tolerance for linear solver + logical :: reproducible_sums = .false. + !> if true, then compute reproducible global sums + !> (independent of the number of tasks) + ! The remaining options are not currently supported !integer :: which_bproc = 0 diff --git a/libglimmer/cism_infnan_mod.F90 b/libglimmer/cism_infnan_mod.F90 new file mode 100644 index 00000000..124b051f --- /dev/null +++ b/libglimmer/cism_infnan_mod.F90 @@ -0,0 +1,423 @@ +! Flag representing compiler support of Fortran 2003's +! ieee_arithmetic intrinsic module. +#if defined CPRIBM || defined CPRPGI || defined CPRINTEL || defined CPRCRAY || defined CPRNAG +#define HAVE_IEEE_ARITHMETIC +#endif + +!WHL, Nov. 2025: Adapted from shr_infnan_mod.F90, part of CESM shared code +! Changed 'shr' to 'cism' to avoid name conflicts with shared code +! I kept only the r8 interfaces (not r4, i8 or i4). +! Also, I assumed that all input arrays are 1d. + +module cism_infnan_mod +!--------------------------------------------------------------------- +! Module to test for IEEE Inf and NaN values, which also provides a +! method of setting +/-Inf and signaling or quiet NaN. +! +! All functions are elemental, and thus work on arrays. +!--------------------------------------------------------------------- +! To test for these values, just call the corresponding function, e.g: +! +! var_is_nan = cism_infnan_isnan(x) +! +! You can also use it on arrays: +! +! array_contains_nan = any(cism_infnan_isnan(my_array)) +! +!--------------------------------------------------------------------- +! To generate these values, assign one of the provided derived-type +! variables to a real: +! +! use cism_infnan_mod, only: nan => cism_infnan_nan, & +! inf => cism_infnan_inf, & +! 
assignment(=) +! real(r4) :: my_nan +! real(r8) :: my_inf_array(2,2) +! my_nan = nan +! my_inf_array = inf +! +! Keep in mind that "cism_infnan_nan" and "cism_infnan_inf" cannot be +! passed to functions that expect real arguments. To pass a real +! NaN, you will have to use cism_infnan_nan to set a local real of +! the correct kind. +!--------------------------------------------------------------------- + + use glimmer_global, only: r4 => sp, r8 => dp + use glimmer_global, only: i4, i8 +!!use shr_kind_mod, only: & +!! r4 => SHR_KIND_R4, & +!! r8 => SHR_KIND_R8 + +#ifdef HAVE_IEEE_ARITHMETIC + +! If we have IEEE_ARITHMETIC, the NaN test is provided for us. +use, intrinsic :: ieee_arithmetic, only: & + cism_infnan_isnan => ieee_is_nan + +#else + +! Integers of correct size for bit patterns below. +!!use shr_kind_mod, only: i4 => shr_kind_i4, i8 => shr_kind_i8 + +#endif + +implicit none +private +save + +! Test functions for NaN/Inf values. +public :: cism_infnan_isnan +public :: cism_infnan_isinf +public :: cism_infnan_isposinf +public :: cism_infnan_isneginf + +! Locally defined isnan. +#ifndef HAVE_IEEE_ARITHMETIC +interface cism_infnan_isnan + ! TYPE double,real + module procedure cism_infnan_isnan_r8 +end interface +#endif + +interface cism_infnan_isinf + ! TYPE double,real + module procedure cism_infnan_isinf_r8 +end interface + +interface cism_infnan_isposinf + ! TYPE double,real + module procedure cism_infnan_isposinf_r8 +end interface + +interface cism_infnan_isneginf + ! TYPE double,real + module procedure cism_infnan_isneginf_r8 +end interface + +! Derived types for generation of NaN/Inf +! Even though there's no reason to "use" the types directly, some compilers +! might have trouble with an object being used without its type. +public :: cism_infnan_nan_type +public :: cism_infnan_inf_type +public :: assignment(=) +public :: cism_infnan_to_r4 +public :: cism_infnan_to_r8 + +! Type representing Not A Number. 
+type :: cism_infnan_nan_type + logical :: quiet = .false. +end type cism_infnan_nan_type + +! Type representing +/-Infinity. +type :: cism_infnan_inf_type + logical :: positive = .true. +end type cism_infnan_inf_type + +! Allow assigning reals to NaN or Inf. +interface assignment(=) + ! TYPE double,real + ! DIMS 0,1,2,3,4,5,6,7 + module procedure set_nan_1d_r8 + ! TYPE double,real + ! DIMS 0,1,2,3,4,5,6,7 + module procedure set_inf_1d_r8 +end interface + +! Conversion functions. +interface cism_infnan_to_r8 + module procedure nan_r8 + module procedure inf_r8 +end interface + +interface cism_infnan_to_r4 + module procedure nan_r4 + module procedure inf_r4 +end interface + +! Initialize objects of NaN/Inf type for other modules to use. + +! Default NaN is signaling, but also provide snan and qnan to choose +! explicitly. +type(cism_infnan_nan_type), public, parameter :: cism_infnan_nan = & + cism_infnan_nan_type(.false.) +type(cism_infnan_nan_type), public, parameter :: cism_infnan_snan = & + cism_infnan_nan_type(.false.) +type(cism_infnan_nan_type), public, parameter :: cism_infnan_qnan = & + cism_infnan_nan_type(.true.) + +! Default Inf is positive, but provide posinf to go with neginf. +type(cism_infnan_inf_type), public, parameter :: cism_infnan_inf = & + cism_infnan_inf_type(.true.) +type(cism_infnan_inf_type), public, parameter :: cism_infnan_posinf = & + cism_infnan_inf_type(.true.) +type(cism_infnan_inf_type), public, parameter :: cism_infnan_neginf = & + cism_infnan_inf_type(.false.) + +! Bit patterns for implementation without ieee_arithmetic. +! Note that in order to satisfy gfortran's range check, we have to use +! ibset to set the sign bit from a BOZ pattern. +#ifndef HAVE_IEEE_ARITHMETIC +! Single precision. 
+integer(i4), parameter :: ssnan_pat = int(Z'7FA00000',i4) +integer(i4), parameter :: sqnan_pat = int(Z'7FC00000',i4) +integer(i4), parameter :: sposinf_pat = int(Z'7F800000',i4) +integer(i4), parameter :: sneginf_pat = ibset(sposinf_pat,bit_size(1_i4)-1) +! Double precision. +integer(i8), parameter :: dsnan_pat = int(Z'7FF4000000000000',i8) +integer(i8), parameter :: dqnan_pat = int(Z'7FF8000000000000',i8) +integer(i8), parameter :: dposinf_pat = int(Z'7FF0000000000000',i8) +integer(i8), parameter :: dneginf_pat = ibset(dposinf_pat,bit_size(1_i8)-1) +#endif + +contains + +!--------------------------------------------------------------------- +! TEST FUNCTIONS +!--------------------------------------------------------------------- +! The "isinf" function simply calls "isposinf" and "isneginf". +!--------------------------------------------------------------------- + +! TYPE double,real +elemental function cism_infnan_isinf_r8(x) result(isinf) + real(r8), intent(in) :: x + logical :: isinf + + isinf = cism_infnan_isposinf(x) .or. cism_infnan_isneginf(x) + +end function cism_infnan_isinf_r8 + +#ifdef HAVE_IEEE_ARITHMETIC + +!--------------------------------------------------------------------- +! The "isposinf" and "isneginf" functions get the IEEE class of a +! real, and test to see if the class is equal to ieee_positive_inf +! or ieee_negative_inf. +!--------------------------------------------------------------------- + +! TYPE double,real +elemental function cism_infnan_isposinf_r8(x) result(isposinf) + use, intrinsic :: ieee_arithmetic, only: & + ieee_class, & + ieee_positive_inf, & + operator(==) + real(r8), intent(in) :: x + logical :: isposinf + + isposinf = (ieee_positive_inf == ieee_class(x)) + +end function cism_infnan_isposinf_r8 + +! 
TYPE double,real +elemental function cism_infnan_isneginf_r8(x) result(isneginf) + use, intrinsic :: ieee_arithmetic, only: & + ieee_class, & + ieee_negative_inf, & + operator(==) + real(r8), intent(in) :: x + logical :: isneginf + + isneginf = (ieee_negative_inf == ieee_class(x)) + +end function cism_infnan_isneginf_r8 + +#else +! Don't have ieee_arithmetic. + +!!#ifdef CPRGNU !WHL - Assume this is true +! NaN testing on gfortran. +! TYPE double,real +elemental function cism_infnan_isnan_r8(x) result(is_nan) + real(r8), intent(in) :: x + logical :: is_nan + + is_nan = isnan(x) + +end function cism_infnan_isnan_r8 +! End GNU section. +!!#endif + +!--------------------------------------------------------------------- +! The "isposinf" and "isneginf" functions just test against a known +! bit pattern if we don't have ieee_arithmetic. +!--------------------------------------------------------------------- + +! TYPE double,real +elemental function cism_infnan_isposinf_r8(x) result(isposinf) + real(r8), intent(in) :: x + logical :: isposinf +!!#if ({ITYPE} == TYPEREAL) +!! integer(i4), parameter :: posinf_pat = sposinf_pat +!!#else + integer(i8), parameter :: posinf_pat = dposinf_pat +!!#endif + + isposinf = (x == transfer(posinf_pat,x)) + +end function cism_infnan_isposinf_r8 + +! TYPE double,real +elemental function cism_infnan_isneginf_r8(x) result(isneginf) + real(r8), intent(in) :: x + logical :: isneginf +!!#if ({ITYPE} == TYPEREAL) +!! integer(i4), parameter :: neginf_pat = sneginf_pat +!!#else + integer(i8), parameter :: neginf_pat = dneginf_pat +!!#endif + + isneginf = (x == transfer(neginf_pat,x)) + +end function cism_infnan_isneginf_r8 + +! End ieee_arithmetic conditional. +#endif + +!--------------------------------------------------------------------- +! GENERATION FUNCTIONS +!--------------------------------------------------------------------- +! Two approaches for generation of NaN and Inf values: +! 1. 
With Fortran 2003, use the ieee_value intrinsic to get a value +! from the corresponding class. These are: +! - ieee_signaling_nan +! - ieee_quiet_nan +! - ieee_positive_inf +! - ieee_negative_inf +! 2. Without Fortran 2003, set the IEEE bit patterns directly. +! Use BOZ literals to get an integer with the correct bit +! pattern, then use "transfer" to transfer those bits into a +! real. +!--------------------------------------------------------------------- + +! TYPE double,real +! DIMS 0,1,2,3,4,5,6,7 +pure subroutine set_nan_1d_r8(output, nan) +#ifdef HAVE_IEEE_ARITHMETIC + use, intrinsic :: ieee_arithmetic, only: & + ieee_signaling_nan, & + ieee_quiet_nan, & + ieee_value +#else +!!#if ({ITYPE} == TYPEREAL) +!! integer(i4), parameter :: snan_pat = ssnan_pat +!! integer(i4), parameter :: qnan_pat = sqnan_pat +!!#else + integer(i8), parameter :: snan_pat = dsnan_pat + integer(i8), parameter :: qnan_pat = dqnan_pat +!!#endif +#endif +!! real(r8), intent(out) :: output{DIMSTR} + real(r8), intent(out) :: output + type(cism_infnan_nan_type), intent(in) :: nan + + ! Use scalar temporary for performance reasons, to reduce the cost of + ! the ieee_value call. + real(r8) :: tmp + +#ifdef HAVE_IEEE_ARITHMETIC + if (nan%quiet) then + tmp = ieee_value(tmp, ieee_quiet_nan) + else + tmp = ieee_value(tmp, ieee_signaling_nan) + end if +#else + if (nan%quiet) then + tmp = transfer(qnan_pat, tmp) + else + tmp = transfer(snan_pat, tmp) + end if +#endif + + output = tmp + +end subroutine set_nan_1d_r8 + +! TYPE double,real +! DIMS 0,1,2,3,4,5,6,7 +pure subroutine set_inf_1d_r8(output, inf) +#ifdef HAVE_IEEE_ARITHMETIC + use, intrinsic :: ieee_arithmetic, only: & + ieee_positive_inf, & + ieee_negative_inf, & + ieee_value +#else +!!#if ({ITYPE} == TYPEREAL) +!! integer(i4), parameter :: posinf_pat = sposinf_pat +!! 
integer(i4), parameter :: neginf_pat = sneginf_pat +!!#else + integer(i8), parameter :: posinf_pat = dposinf_pat + integer(i8), parameter :: neginf_pat = dneginf_pat +!!#endif +#endif +!! real(r8), intent(out) :: output{DIMSTR} + real(r8), intent(out) :: output + type(cism_infnan_inf_type), intent(in) :: inf + + ! Use scalar temporary for performance reasons, to reduce the cost of + ! the ieee_value call. + real(r8) :: tmp + +#ifdef HAVE_IEEE_ARITHMETIC + if (inf%positive) then + tmp = ieee_value(tmp,ieee_positive_inf) + else + tmp = ieee_value(tmp,ieee_negative_inf) + end if +#else + if (inf%positive) then + tmp = transfer(posinf_pat, tmp) + else + tmp = transfer(neginf_pat, tmp) + end if +#endif + + output = tmp + +end subroutine set_inf_1d_r8 + +!--------------------------------------------------------------------- +! CONVERSION INTERFACES. +!--------------------------------------------------------------------- +! Function methods to get reals from nan/inf types. +!--------------------------------------------------------------------- + +pure function nan_r8(nan) result(output) + class(cism_infnan_nan_type), intent(in) :: nan + real(r8) :: output + +!! output = nan + !WHL kluge + output = 0._r8 + +end function nan_r8 + +pure function nan_r4(nan) result(output) + class(cism_infnan_nan_type), intent(in) :: nan + real(r4) :: output + +!! output = nan + !WHL kluge + output = 0._r8 + +end function nan_r4 + +pure function inf_r8(inf) result(output) + class(cism_infnan_inf_type), intent(in) :: inf + real(r8) :: output + +!! output = inf + !WHL kluge + output = 0._r8 + +end function inf_r8 + +pure function inf_r4(inf) result(output) + class(cism_infnan_inf_type), intent(in) :: inf + real(r4) :: output + +!! 
output = inf + !WHL kluge + output = 0._r8 + +end function inf_r4 + +end module cism_infnan_mod diff --git a/libglimmer/cism_reprosum_mod.F90 b/libglimmer/cism_reprosum_mod.F90 new file mode 100644 index 00000000..55a034c1 --- /dev/null +++ b/libglimmer/cism_reprosum_mod.F90 @@ -0,0 +1,1789 @@ +!WHL, Nov. 2025: Adapted from shr_reprosum_mod.F90, part of CESM shared code +! Changed 'shr' to 'cism' to avoid name conflicts with shared code + +module cism_reprosum_mod +!----------------------------------------------------------------------- +! +! Purpose: +! Compute reproducible global sums of a set of arrays across an MPI +! subcommunicator +! +! Methods: +! Compute using either or both a scalable, reproducible algorithm and a +! scalable, nonreproducible algorithm: +! * Reproducible (scalable): +! Convert to fixed point (integer vector representation) to enable +! reproducibility when using MPI_Allreduce +! * Alternative usually reproducible (scalable): +! Use parallel double-double algorithm due to Helen He and +! Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm +! * Nonreproducible (scalable): +! Floating point and MPI_Allreduce based. +! If computing both reproducible and nonreproducible sums, compare +! these and report relative difference (if absolute difference +! less than sum) or absolute difference back to calling routine. +! +! Author: P. Worley (based on suggestions from J. White for fixed +! point algorithm and on He/Ding paper for ddpdd +! algorithm) +! +!----------------------------------------------------------------------- + +!----------------------------------------------------------------------- +!- use statements ------------------------------------------------------ +!----------------------------------------------------------------------- +!!#if ( defined noI8 ) +!! ! Workaround for when shr_kind_i8 is not supported. +!! use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i4 +!!#else +!! 
use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i8 +!!#endif + !WHL - Use CISM modules instead of shared modules if possible + use glimmer_global, only: r8 => dp + use glimmer_global, only: i8 + use glimmer_paramets, only: iulog +!! use cism_parallel, only: main_task +! use shr_log_mod, only: s_loglev => shr_log_Level +! use shr_log_mod, only: s_logunit => shr_log_Unit +! use shr_sys_mod, only: shr_sys_abort + + use cism_infnan_mod,only: cism_infnan_inf_type, assignment(=), & + cism_infnan_posinf, cism_infnan_neginf, & + cism_infnan_nan, & + cism_infnan_isnan, cism_infnan_isinf, & + cism_infnan_isposinf, cism_infnan_isneginf +#ifdef TIMING + use perf_mod +#endif +!----------------------------------------------------------------------- +!- module boilerplate -------------------------------------------------- +!----------------------------------------------------------------------- + implicit none + private + +!----------------------------------------------------------------------- +!- include statements -------------------------------------------------- +!----------------------------------------------------------------------- +#include + + save + +!----------------------------------------------------------------------- +! Public interfaces ---------------------------------------------------- +!----------------------------------------------------------------------- + public :: & +!! shr_reprosum_setopts, &! set runtime options +!! shr_reprosum_calc, &! calculate distributed sum +!! shr_reprosum_tolExceeded ! utility function to check relative +!! ! differences against the tolerance + cism_reprosum_setopts, &! set runtime options + cism_reprosum_calc, &! calculate distributed sum + cism_reprosum_tolExceeded ! utility function to check relative + ! differences against the tolerance + +!----------------------------------------------------------------------- +! 
Public data ---------------------------------------------------------- +!----------------------------------------------------------------------- + logical, public :: cism_reprosum_recompute = .false. + + real(r8), public :: cism_reprosum_reldiffmax = -1.0_r8 + + logical, parameter, public :: verbose_reprosum = .false. +!! logical, parameter, public :: verbose_reprosum = .true. + +!----------------------------------------------------------------------- +! Private interfaces ---------------------------------------------------- +!----------------------------------------------------------------------- + private :: & + ddpdd, &! double-double sum routine + split_indices ! split indices among OMP threads + +!----------------------------------------------------------------------- +! Private data ---------------------------------------------------------- +!----------------------------------------------------------------------- + + !---------------------------------------------------------------------------- + ! cism_reprosum_mod options + !---------------------------------------------------------------------------- + logical :: repro_sum_use_ddpdd = .false. + + logical :: repro_sum_allow_infnan = .false. + + !WHLmod +!! integer :: s_logunit = 6 + + CONTAINS + +! +!======================================================================== +! + subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & + repro_sum_allow_infnan_in, & + repro_sum_rel_diff_max_in, & + repro_sum_recompute_in, & + repro_sum_master, & + repro_sum_logunit ) + +!----------------------------------------------------------------------- +! Purpose: Set runtime options +! Author: P. Worley +!----------------------------------------------------------------------- +!------------------------------Arguments-------------------------------- + ! Use DDPDD algorithm instead of fixed precision algorithm + logical, intent(in), optional :: repro_sum_use_ddpdd_in + + ! 
Allow INF or NaN in summands + logical, intent(in), optional :: repro_sum_allow_infnan_in + + ! maximum permissible difference between reproducible and + ! nonreproducible sums + real(r8), intent(in), optional :: repro_sum_rel_diff_max_in + + ! recompute using different algorithm when difference between + ! reproducible and nonreproducible sums is too great + logical, intent(in), optional :: repro_sum_recompute_in + + ! flag indicating whether this process should output log messages + logical, intent(in), optional :: repro_sum_master + + ! unit number for log messages + integer, intent(in), optional :: repro_sum_logunit + +!---------------------------Local Workspace----------------------------- + integer logunit ! unit number for log messages + logical master ! local master? + logical,save :: firstcall = .true. ! first call + integer :: ierr ! MPI error return +!----------------------------------------------------------------------- + + if ( present(repro_sum_master) ) then + master = repro_sum_master + else + master = .false. !WHL master = main_task? + endif + + if ( present(repro_sum_logunit) ) then + logunit = repro_sum_logunit + else +!! logunit = s_logunit + logunit = iulog ! CISM default + endif + + if (.not. firstcall) then +!! call shr_sys_abort('shr_reprosum_setopts ERROR: multiple calls') + write(logunit,*) 'cism_reprosum_setopts: ERROR can only be called once' + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + endif + firstcall = .false. 
+ + if ( present(repro_sum_use_ddpdd_in) ) then + repro_sum_use_ddpdd = repro_sum_use_ddpdd_in + endif + if ( present(repro_sum_allow_infnan_in) ) then + repro_sum_allow_infnan = repro_sum_allow_infnan_in + endif + if ( present(repro_sum_rel_diff_max_in) ) then + cism_reprosum_reldiffmax = repro_sum_rel_diff_max_in + endif + if ( present(repro_sum_recompute_in) ) then + cism_reprosum_recompute = repro_sum_recompute_in + endif + if (master) then + if ( repro_sum_use_ddpdd ) then + write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + 'Using double-double-based (scalable) usually reproducible ', & + 'distributed sum algorithm' + else + write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + 'Using fixed-point-based (scalable) reproducible ', & + 'distributed sum algorithm' + endif + + if ( repro_sum_allow_infnan ) then + write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + 'Will calculate sum when INF or NaN are included in summands' + else + write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + 'Will abort if INF or NaN are included in summands' + endif + + if (cism_reprosum_reldiffmax >= 0._r8) then + write(logunit,*) ' ',& + 'with a maximum relative error tolerance of ', & + cism_reprosum_reldiffmax + if (cism_reprosum_recompute) then + write(logunit,*) ' ',& + 'If tolerance exceeded, sum is recomputed using ', & + 'a serial algorithm.' + else + write(logunit,*) ' ',& + 'If tolerance exceeded, fixed-precision is sum used ', & + 'but a warning is output.' + endif + else + write(logunit,*) ' ',& + 'and not comparing with floating point algorithms.' + endif + + endif + end subroutine cism_reprosum_setopts + +! +!======================================================================== +! 
+ + subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & + nflds, allow_infnan, ddpdd_sum, & + arr_gbl_max, arr_gbl_max_out, & + arr_max_levels, arr_max_levels_out, & + gbl_max_nsummands, gbl_max_nsummands_out,& + gbl_count, repro_sum_validate, & + repro_sum_stats, rel_diff, commid ) +!---------------------------------------------------------------------- +! +! Purpose: +! Compute the global sum of each field in "arr" using the indicated +! communicator with a reproducible yet scalable implementation based +! on a fixed point algorithm. An alternative is to use an "almost +! always reproducible" floating point algorithm, as described below. +! +! The accuracy of the fixed point algorithm is controlled by the +! number of "levels" of integer expansion. The algorithm will calculate +! the number of levels that is required for the sum to be essentially +! exact. (The sum as represented by the integer expansion will be exact, +! but roundoff may perturb the least significant digit of the returned +! real*8 representation of the sum.) The optional parameter arr_max_levels +! can be used to override the calculated value. The optional parameter +! arr_max_levels_out can be used to return the values used. +! +! The algorithm also requires an upper bound on +! the maximum summand (in absolute value) for each field, and will +! calculate this internally. However, if the optional parameters +! arr_max_levels and arr_gbl_max are both set, then the algorithm will +! use the values in arr_gbl_max for the upper bounds instead. If these +! are not upper bounds, or if the upper bounds are not tight enough +! to achieve the requisite accuracy, and if the optional parameter +! repro_sum_validate is NOT set to .false., the algorithm will repeat the +! computation with appropriate upper bounds. If only arr_gbl_max is present, +! then the maxima are computed internally (and the specified values are +! ignored). The optional parameter arr_gbl_max_out can be +! 
used to return the values used. +! +! Finally, the algorithm requires an upper bound on the number of +! local summands across all processes. This will be calculated internally, +! using an MPI collective, but the value in the optional argument +! gbl_max_nsummands will be used instead if (1) it is present, (2) +! it is > 0, and (3) the maximum value and required number of levels +! are also specified. (If the maximum value is calculated, the same +! MPI collective is used to determine the maximum number of local +! summands.) The accuracy of the user-specified value is not checked. +! However, if set to < 1, the value will instead be calculated. If the +! optional parameter gbl_max_nsummands_out is present, then the value +! used (gbl_max_nsummands if >= 1; calculated otherwise) will be +! returned. +! +! If requested (by setting cism_reprosum_reldiffmax >= 0.0 and passing in +! the optional rel_diff parameter), results are compared with a +! nonreproducible floating point algorithm. +! +! Note that the cost of the algorithm is not strongly correlated with +! the number of levels, which primarily shows up as a (modest) increase +! in cost of the MPI_Allreduce as a function of vector length. Rather the +! cost is more a function of (a) the number of integers required to +! represent an individual summand and (b) the number of MPI_Allreduce +! calls. The number of integers required to represent an individual +! summand is 1 or 2 when using 8-byte integers for 8-byte real summands +! when the number of local summands is not too large. As the number of +! local summands increases, the number of integers required increases. +! The number of MPI_Allreduce calls is either 2 (specifying nothing) or +! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max +! correctly). When specifying arr_max_levels and arr_gbl_max +! incorrectly, 3 or 4 MPI_Allreduce calls will be required. +! +! The alternative algorithm is a minor modification of a parallel +! 
implementation of David Bailey's routine DDPDD by Helen He +! and Chris Ding. Bailey uses the Knuth trick to implement quadruple +! precision summation of double precision values with 10 double +! precision operations. The advantage of this algorithm is that +! it requires a single MPI_Allreduce and is less expensive per summand +! than is the fixed precision algorithm. The disadvantage is that it +! is not guaranteed to be reproducible (though it is reproducible +! much more often than is the standard algorithm). This alternative +! is used when the optional parameter ddpdd_sum is set to .true. It is +! also used if the fixed precision algorithm radix assumption does not +! hold. +! +!---------------------------------------------------------------------- +! +! Arguments +! + integer, intent(in) :: nsummands ! number of local summands + integer, intent(in) :: dsummands ! declared first dimension + integer, intent(in) :: nflds ! number of fields + real(r8), intent(in) :: arr(dsummands,nflds) + ! input array + + real(r8), intent(out):: arr_gsum(nflds) + ! global sums + + logical, intent(in), optional :: ddpdd_sum + ! use ddpdd algorithm instead + ! of fixed precision algorithm + + logical, intent(in), optional :: allow_infnan + ! if .true., allow INF or NaN input values. + ! if .false. (the default), then abort. + + real(r8), intent(in), optional :: arr_gbl_max(nflds) + ! upper bound on max(abs(arr)) + + real(r8), intent(out), optional :: arr_gbl_max_out(nflds) + ! calculated upper bound on + ! max(abs(arr)) + + integer, intent(in), optional :: arr_max_levels(nflds) + ! maximum number of levels of + ! integer expansion to use + + integer, intent(out), optional :: arr_max_levels_out(nflds) + ! output of number of levels of + ! integer expansion to used + + integer, intent(in), optional :: gbl_max_nsummands + ! maximum of nsummand over all + ! processes + + integer, intent(out), optional :: gbl_max_nsummands_out + ! calculated maximum nsummands + ! 
over all processes + + integer, intent(in), optional :: gbl_count + ! was total number of summands; + ! now is ignored; use + ! gbl_max_nsummands instead + + logical, intent(in), optional :: repro_sum_validate + ! flag enabling/disabling testing that gmax and max_levels are + ! accurate/sufficient. Default is enabled. + + integer, intent(inout), optional :: repro_sum_stats(6) + ! increment running totals for + ! (1) one-reduction repro_sum + ! (2) two-reduction repro_sum + ! (3) both types in one call + ! (4) nonrepro_sum + ! (5) global max nsummands reduction + ! (6) global lor 3*nflds reduction + + real(r8), intent(out), optional :: rel_diff(2,nflds) + ! relative and absolute + ! differences between fixed + ! and floating point sums + + integer, intent(in), optional :: commid + ! MPI communicator + +! +! Local workspace +! + logical :: abort_inf_nan ! flag indicating whether to + ! abort if INF or NaN found in input + logical :: use_ddpdd_sum ! flag indicating whether to + ! use cism_reprosum_ddpdd or not + logical :: recompute ! flag indicating need to + ! determine gmax/gmin before + ! computing sum + logical :: validate ! flag indicating need to + ! verify gmax and max_levels + ! are accurate/sufficient + logical :: nan_check, inf_check ! flag on whether there are + ! NaNs and INFs in input array + logical :: inf_nan_lchecks(3,nflds)! flags on whether there are + ! NaNs, positive INFs, or negative INFs + ! for each input field locally + logical :: inf_nan_gchecks(3,nflds)! flags on whether there are + ! NaNs, positive INFs, or negative INFs + ! for each input field + logical :: arr_gsum_infnan(nflds) ! flag on whether field sum is a + ! NaN or INF + + integer :: gbl_lor_red ! global lor reduction? (0/1) + integer :: gbl_max_red ! global max reduction? (0/1) + integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1) + integer :: repro_sum_slow ! 2 reduction repro_sum? (0/1) + integer :: repro_sum_both ! both fast and slow? (0/1) + integer :: nonrepro_sum ! 
nonrepro_sum? (0/1) + + integer :: nan_count, inf_count ! local count of NaNs and INFs in + ! input array + integer :: omp_nthreads ! number of OpenMP threads + integer :: mpi_comm ! MPI subcommunicator + integer :: mypid ! MPI process ID (COMM_WORLD) + integer :: tasks ! number of MPI processes + integer :: ierr ! MPI error return + integer :: ifld, isum, ithread ! loop variables + integer :: max_nsummands ! max nsummands over all processes + ! or threads (used in both ways) + + integer, allocatable :: isum_beg(:), isum_end(:) + ! range of summand indices for each + ! OpenMP thread + integer, allocatable :: arr_tlmin_exp(:,:) + ! per thread local exponent minima + integer, allocatable :: arr_tlmax_exp(:,:) + ! per thread local exponent maxima + integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax + ! summand exponent and working min/max + integer :: arr_lmin_exp(nflds) ! local exponent minima + integer :: arr_lmax_exp(nflds) ! local exponent maxima + integer :: arr_lextremes(0:nflds,2)! local exponent extrema + integer :: arr_gextremes(0:nflds,2)! global exponent extrema + + integer :: arr_gmax_exp(nflds) ! global exponents maxima + integer :: arr_gmin_exp(nflds) ! global exponents minima + integer :: arr_max_shift ! maximum safe exponent for + ! value < 1 (so that sum does + ! not overflow) + integer :: max_levels(nflds) ! maximum number of levels of + ! integer expansion to use + integer :: max_level ! maximum value in max_levels + + real(r8) :: xmax_nsummands ! dble of max_nsummands + real(r8) :: arr_lsum(nflds) ! local sums + real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using + ! fast, nonreproducible, + ! floating point alg. + real(r8) :: abs_diff ! absolute difference between + ! fixed and floating point + ! sums + +#ifdef _OPENMP + integer omp_get_max_threads + external omp_get_max_threads +#endif + + !WHL - debug + logical :: test_sum = .false. 
+ real(r8), dimension(dsummands,nflds) :: arr_test + arr_test(:,:) = 0.0d0 + call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) 'In cism_reprosum_mod' + endif +! +!----------------------------------------------------------------------- +! +! initialize local statistics variables + gbl_lor_red = 0 + gbl_max_red = 0 + repro_sum_fast = 0 + repro_sum_slow = 0 + repro_sum_both = 0 + nonrepro_sum = 0 + +! set MPI communicator + if ( present(commid) ) then + mpi_comm = commid + else + mpi_comm = MPI_COMM_WORLD + endif +#ifdef TIMING + call t_barrierf('sync_repro_sum',mpi_comm) +#endif +! check whether should abort if input contains NaNs or INFs + abort_inf_nan = .not. repro_sum_allow_infnan + if ( present(allow_infnan) ) then + abort_inf_nan = .not. allow_infnan + endif +#ifdef TIMING + call t_startf('cism_reprosum_INF_NaN_Chk') +#endif +! initialize flags to indicate that no NaNs or INFs are present in the input data + inf_nan_gchecks = .false. + arr_gsum_infnan = .false. + + !TODO - Remove the inf_nan option; assume abort_inf_nan = T, as in CICE + if (abort_inf_nan) then + + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) ' abort_inf_nan check' + endif + +! check whether input contains NaNs or INFs, and abort if so + nan_check = any(cism_infnan_isnan(arr)) + inf_check = any(cism_infnan_isinf(arr)) + + if (nan_check .or. inf_check) then + + nan_count = count(cism_infnan_isnan(arr)) + inf_count = count(cism_infnan_isinf(arr)) + + if ((nan_count > 0) .or. (inf_count > 0)) then + call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) +!! write(s_logunit,37) real(nan_count,r8), real(inf_count,r8), mypid + write(iulog,37) real(nan_count,r8), real(inf_count,r8), mypid +37 format("CISM_REPROSUM_CALC: Input contains ",e12.5, & + " NaNs and ", e12.5, " INFs on process ", i7) +!! 
call shr_sys_abort("shr_reprosum_calc ERROR: NaNs or INFs in input") + write(iulog,*) "cism_reprosum_calc ERROR: NaNs or INFs in input" + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + endif + + endif + + else + +! determine whether any fields contain NaNs or INFs, and avoid processing them +! via integer expansions + inf_nan_lchecks = .false. + + do ifld=1,nflds + inf_nan_lchecks(1,ifld) = any(cism_infnan_isnan(arr(:,ifld))) + inf_nan_lchecks(2,ifld) = any(cism_infnan_isposinf(arr(:,ifld))) + inf_nan_lchecks(3,ifld) = any(cism_infnan_isneginf(arr(:,ifld))) + end do +#ifdef TIMING + call t_startf("repro_sum_allr_lor") +#endif + call mpi_allreduce (inf_nan_lchecks, inf_nan_gchecks, 3*nflds, & + MPI_LOGICAL, MPI_LOR, mpi_comm, ierr) + gbl_lor_red = 1 +#ifdef TIMING + call t_stopf("repro_sum_allr_lor") +#endif + do ifld=1,nflds + arr_gsum_infnan(ifld) = any(inf_nan_gchecks(:,ifld)) + enddo + + endif +#ifdef TIMING + call t_stopf('cism_reprosum_INF_NaN_Chk') +#endif +! check whether should use cism_reprosum_ddpdd algorithm + use_ddpdd_sum = repro_sum_use_ddpdd + if ( present(ddpdd_sum) ) then + use_ddpdd_sum = ddpdd_sum + endif + +! check whether intrinsic-based algorithm will work on this system +! (requires floating point and integer bases to be the same) +! If not, always use ddpdd. + use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8)) + + if ( use_ddpdd_sum ) then +#ifdef TIMING + call t_startf('cism_reprosum_ddpdd') +#endif + call cism_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & + nflds, mpi_comm) + repro_sum_fast = 1 +#ifdef TIMING + call t_stopf('cism_reprosum_ddpdd') +#endif + else +#ifdef TIMING + call t_startf('cism_reprosum_int') +#endif +! get number of MPI tasks + call mpi_comm_size(mpi_comm, tasks, ierr) + + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) 'Starting reprosum, tasks =', tasks + endif +! get number of OpenMP threads +#ifdef _OPENMP + omp_nthreads = omp_get_max_threads() +#else + omp_nthreads = 1 +#endif + +! 
see if have sufficient information to not require max/min allreduce + recompute = .true. + validate = .false. + if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then + recompute = .false. + +! setting lower bound on max_level*nflds to be 64 to improve OpenMP +! performance for loopb in cism_reprosum_int + max_level = (64/nflds) + 1 + do ifld=1,nflds + if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. & + (arr_max_levels(ifld) > 0)) then + + arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld)) + if (max_level < arr_max_levels(ifld)) & + max_level = arr_max_levels(ifld) + + else + recompute = .true. + endif + enddo + + if (.not. recompute) then + +! determine maximum number of summands in local phases of the +! algorithm +#ifdef TIMING + call t_startf("repro_sum_allr_max") +#endif + if ( present(gbl_max_nsummands) ) then + if (gbl_max_nsummands < 1) then + call mpi_allreduce (nsummands, max_nsummands, 1, & + MPI_INTEGER, MPI_MAX, mpi_comm, ierr) + gbl_max_red = 1 + else + max_nsummands = gbl_max_nsummands + endif + else + call mpi_allreduce (nsummands, max_nsummands, 1, & + MPI_INTEGER, MPI_MAX, mpi_comm, ierr) + gbl_max_red = 1 + endif +#ifdef TIMING + call t_stopf("repro_sum_allr_max") +#endif +! determine maximum shift. Shift needs to be small enough that summation +! does not exceed maximum number of digits in i8. + +! if requested, return max_nsummands before it is redefined + if ( present( gbl_max_nsummands_out) ) then + gbl_max_nsummands_out = max_nsummands + endif + +! summing within each thread first + max_nsummands = (max_nsummands/omp_nthreads) + 1 +! then over threads and tasks + max_nsummands = max(max_nsummands, tasks*omp_nthreads) +! A 'max' is used in the above calculation because the partial sum for +! each thread, calculated in cism_reprosum_int, is postprocessed so that +! each integer in the corresponding vector of integers is reduced in +! magnitude to be less than (radix(IX_8)**arr_max_shift). Therefore, +! 
the maximum shift can be calculated separately for per thread sums +! and sums over threads and tasks, and the smaller value used. This is +! equivalent to using max_nsummands as defined above. + + xmax_nsummands = dble(max_nsummands) + arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1) + if (arr_max_shift < 2) then +!! call shr_sys_abort('repro_sum failed: number of summands too '// & +!! 'large for fixed precision algorithm' ) + write(iulog,*) & + 'repro_sum failed: number of summands too large for fixed precision algorithm' + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + endif + +! calculate sum + if (present(repro_sum_validate)) then + validate = repro_sum_validate + else + validate = .true. + endif + call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & + nflds, arr_max_shift, arr_gmax_exp, & + arr_max_levels, max_level, arr_gsum_infnan, & + validate, recompute, omp_nthreads, mpi_comm) + +! record statistics, etc. + repro_sum_fast = 1 + if (recompute) then + repro_sum_both = 1 + else +! if requested, return specified levels and upper bounds on maxima + if ( present(arr_max_levels_out) ) then + do ifld=1,nflds + arr_max_levels_out(ifld) = arr_max_levels(ifld) + enddo + endif + if ( present(arr_gbl_max_out) ) then + do ifld=1,nflds + arr_gbl_max_out(ifld) = arr_gbl_max(ifld) + enddo + endif + endif + endif + endif + +! do not have sufficient information; calculate global max/min and +! use to compute required number of levels + if (recompute) then + +! record statistic + repro_sum_slow = 1 + +! determine maximum and minimum (non-zero) summand values and +! maximum number of local summands + +! allocate thread-specific work space + allocate(arr_tlmax_exp(nflds,omp_nthreads)) + allocate(arr_tlmin_exp(nflds,omp_nthreads)) + allocate(isum_beg(omp_nthreads)) + allocate(isum_end(omp_nthreads)) + +! split summand index range over OpenMP threads +!! 
call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) + !WHL - debug + call split_indices(dsummands, omp_nthreads, isum_beg, isum_end) + +!$omp parallel do & +!$omp default(shared) & +!$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax) + do ithread=1,omp_nthreads +#ifdef TIMING + call t_startf('repro_sum_loopa') +#endif + do ifld=1,nflds + arr_exp_tlmin = MAXEXPONENT(1._r8) + arr_exp_tlmax = MINEXPONENT(1._r8) + if (.not. arr_gsum_infnan(ifld)) then + do isum=isum_beg(ithread),isum_end(ithread) + if (arr(isum,ifld) .ne. 0.0_r8) then + arr_exp = exponent(arr(isum,ifld)) + arr_exp_tlmin = min(arr_exp,arr_exp_tlmin) + arr_exp_tlmax = max(arr_exp,arr_exp_tlmax) + endif + end do + endif + arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin + arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax + end do +#ifdef TIMING + call t_stopf('repro_sum_loopa') +#endif + end do + + do ifld=1,nflds + arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:)) + arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:)) + end do + deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end) + + arr_lextremes(0,:) = -nsummands + arr_lextremes(1:nflds,1) = -arr_lmax_exp(:) + arr_lextremes(1:nflds,2) = arr_lmin_exp(:) +#ifdef TIMING + call t_startf("repro_sum_allr_minmax") +#endif + call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), & + MPI_INTEGER, MPI_MIN, mpi_comm, ierr) +#ifdef TIMING + call t_stopf("repro_sum_allr_minmax") +#endif + max_nsummands = -arr_gextremes(0,1) + arr_gmax_exp(:) = -arr_gextremes(1:nflds,1) + arr_gmin_exp(:) = arr_gextremes(1:nflds,2) + +! if a field is identically zero or contains INFs or NaNs, arr_gmin_exp +! still equals MAXEXPONENT and arr_gmax_exp still equals MINEXPONENT. +! In this case, set arr_gmin_exp = arr_gmax_exp = MINEXPONENT + do ifld=1,nflds + arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld)) + enddo + +! 
if requested, return upper bounds on observed maxima + if ( present(arr_gbl_max_out) ) then + do ifld=1,nflds + arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld)) + enddo + endif + +! if requested, return max_nsummands before it is redefined + if ( present( gbl_max_nsummands_out) ) then + gbl_max_nsummands_out = max_nsummands + endif + +! determine maximum shift (same as in previous branch, but with calculated +! max_nsummands). Shift needs to be small enough that summation does not +! exceed maximum number of digits in i8. + +! summing within each thread first + max_nsummands = (max_nsummands/omp_nthreads) + 1 +! then over threads and tasks + max_nsummands = max(max_nsummands, tasks*omp_nthreads) +! A 'max' is used in the above calculation because the partial sum for +! each thread, calculated in cism_reprosum_int, is postprocessed so that +! each integer in the corresponding vector of integers is reduced in +! magnitude to be less than (radix(IX_8)**arr_max_shift). Therefore, +! the maximum shift can be calculated separately for per thread sums +! and sums over threads and tasks, and the smaller value used. This is +! equivalent to using max_nsummands as defined above. + + xmax_nsummands = dble(max_nsummands) + arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1) + if (arr_max_shift < 2) then +!! call shr_sys_abort('repro_sum failed: number of summands too '// & +!! 'large for fixed precision algorithm' ) + write(iulog,*) & + 'repro_sum failed: number of summands too large for fixed precision algorithm' + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + endif + +! determine maximum number of levels required for each field +! ((digits(0.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift) +! + 1 because first truncation probably does not involve a maximal shift +! + 1 to guarantee that the integer division rounds up (not down) +! (setting lower bound on max_level*nflds to be 64 to improve OpenMP +! 
performance for loopb in cism_reprosum_int) + max_level = (64/nflds) + 1 + do ifld=1,nflds + max_levels(ifld) = 2 + & + ((digits(0.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) & + / arr_max_shift) + if ( present(arr_max_levels) .and. (.not. validate) ) then +! if validate true, then computation with arr_max_levels failed +! previously + if ( arr_max_levels(ifld) > 0 ) then + max_levels(ifld) = & + min(arr_max_levels(ifld),max_levels(ifld)) + endif + endif + if (max_level < max_levels(ifld)) & + max_level = max_levels(ifld) + enddo + +! if requested, return calculated levels + if ( present(arr_max_levels_out) ) then + do ifld=1,nflds + arr_max_levels_out(ifld) = max_levels(ifld) + enddo + endif + + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) ' max_nsummands =', max_nsummands + write(iulog,*) ' max_levels(1), max_level =', max_levels(1), max_level + write(iulog,*) ' call reprosum_int, pid =', mypid + endif + +! calculate sum + validate = .false. + call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & + nflds, arr_max_shift, arr_gmax_exp, & + max_levels, max_level, arr_gsum_infnan, & + validate, recompute, omp_nthreads, mpi_comm) + + endif +#ifdef TIMING + call t_stopf('cism_reprosum_int') +#endif + endif + +! compare fixed and floating point results + if ( present(rel_diff) ) then + if (cism_reprosum_reldiffmax >= 0.0_r8) then +#ifdef TIMING + call t_barrierf('sync_nonrepro_sum',mpi_comm) + call t_startf('nonrepro_sum') +#endif +! record statistic + nonrepro_sum = 1 +! compute nonreproducible sum + arr_lsum(:) = 0._r8 +!$omp parallel do & +!$omp default(shared) & +!$omp private(ifld, isum) + do ifld=1,nflds + if (.not. 
arr_gsum_infnan(ifld)) then + do isum=1,nsummands + arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld) + end do + endif + end do +#ifdef TIMING + call t_startf("nonrepro_sum_allr_r8") +#endif + call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, & + MPI_REAL8, MPI_SUM, mpi_comm, ierr) +#ifdef TIMING + call t_stopf("nonrepro_sum_allr_r8") + + call t_stopf('nonrepro_sum') +#endif +! determine differences +!$omp parallel do & +!$omp default(shared) & +!$omp private(ifld, abs_diff) + do ifld=1,nflds + abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld)) + if (abs(arr_gsum(ifld)) > abs_diff) then + rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld)) + else + rel_diff(1,ifld) = abs_diff + endif + rel_diff(2,ifld) = abs_diff + enddo + else + rel_diff(:,:) = 0.0_r8 + endif + endif + +! Set field sums to NaN and INF, as needed + do ifld=1,nflds + if (arr_gsum_infnan(ifld)) then + if (inf_nan_gchecks(1,ifld)) then + ! NaN => NaN + arr_gsum(ifld) = cism_infnan_nan + else if (inf_nan_gchecks(2,ifld) .and. inf_nan_gchecks(3,ifld)) then + ! posINF and negINF => NaN + arr_gsum(ifld) = cism_infnan_nan + else if (inf_nan_gchecks(2,ifld)) then + ! posINF only => posINF + arr_gsum(ifld) = cism_infnan_posinf + else if (inf_nan_gchecks(3,ifld)) then + ! negINF only => negINF + arr_gsum(ifld) = cism_infnan_neginf + endif + endif + end do + +! return statistics + if ( present(repro_sum_stats) ) then + repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast + repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow + repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both + repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum + repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red + repro_sum_stats(6) = repro_sum_stats(6) + gbl_lor_red + endif + + !WHL - debug + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) 'Exiting cism_reprosum_calc, arr_gsum =', arr_gsum(:) + endif + + end subroutine cism_reprosum_calc + +! 
+!======================================================================== +! + + subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & + arr_max_shift, arr_gmax_exp, max_levels, & + max_level, skip_field, validate, recompute, & + omp_nthreads, mpi_comm ) +!---------------------------------------------------------------------- +! +! Purpose: +! Compute the global sum of each field in "arr" using the indicated +! communicator with a reproducible yet scalable implementation based +! on a fixed point algorithm. The accuracy of the fixed point algorithm +! is controlled by the number of "levels" of integer expansion, the +! maximum value of which is specified by max_level. +! +!---------------------------------------------------------------------- +! +! Arguments +! + integer, intent(in) :: nsummands ! number of local summands + integer, intent(in) :: dsummands ! declared first dimension + integer, intent(in) :: nflds ! number of fields + integer, intent(in) :: arr_max_shift ! maximum safe exponent for + ! value < 1 (so that sum + ! does not overflow) + integer, intent(in) :: arr_gmax_exp(nflds) + ! exponents of global maxima + integer, intent(in) :: max_levels(nflds) + ! maximum number of levels + ! of integer expansion + integer, intent(in) :: max_level ! maximum value in + ! max_levels + integer, intent(in) :: omp_nthreads ! number of OpenMP threads + integer, intent(in) :: mpi_comm ! MPI subcommunicator + + real(r8), intent(in) :: arr(dsummands,nflds) + ! input array + + logical, intent(in) :: skip_field(nflds) + ! flag indicating whether the sum for this field should be + ! computed or not (used to skip over fields containing + ! NaN or INF summands) + + logical, intent(in) :: validate + ! flag indicating that accuracy of solution generated from + ! arr_gmax_exp and max_levels should be tested + + logical, intent(out):: recompute + ! flag indicating that either the upper bounds are inaccurate, + ! 
or max_levels and arr_gmax_exp do not generate accurate + ! enough sums + + real(r8), intent(out):: arr_gsum(nflds) ! global sums +! +! Local workspace +! + integer, parameter :: max_jlevel = & + 1 + (digits(0_i8)/digits(0.0_r8)) + + integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads) + ! integer vector representing local + ! sum (per thread, per field) + integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds) + ! integer vector representing local + ! sum + integer(i8) :: i8_arr_level ! integer part of summand for current + ! expansion level + integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds) + ! integer vector representing global + ! sum + integer(i8) :: IX_8 ! integer representation of current + ! jlevels of X_8 ('part' of + ! i8_arr_gsum_level) + integer(i8) :: i8_sign ! sign global sum + integer(i8) :: i8_radix ! radix for i8 variables + + integer :: max_error(nflds,omp_nthreads) + ! accurate upper bound on data? + integer :: not_exact(nflds,omp_nthreads) + ! max_levels sufficient to + ! capture all digits? + integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads) + ! range of summand indices for each + ! OpenMP thread + integer :: ifld, isum, ithread + ! loop variables + integer :: arr_exp ! exponent of summand + integer :: arr_shift ! exponent used to generate integer + ! for current expansion level + integer :: ilevel ! current integer expansion level + integer :: offset(nflds) ! beginning location in + ! i8_arr_{g,l}sum_level for integer + ! expansion of current ifld + integer :: voffset ! modification to offset used to + ! include validation metrics + integer :: ioffset ! offset(ifld) + integer :: jlevel ! number of floating point 'pieces' + ! extracted from a given i8 integer + integer :: ierr ! MPI error return + integer :: LX(max_jlevel) ! exponent of X_8 (see below) + integer :: veclth ! total length of i8_arr_lsum_level + integer :: sum_digits ! lower bound on number of significant + ! 
in integer expansion of sum + integer :: curr_exp ! exponent of partial sum during + ! reconstruction from integer vector + integer :: corr_exp ! exponent of current summand in + ! reconstruction from integer vector + + real(r8) :: arr_frac ! fraction of summand + real(r8) :: arr_remainder ! part of summand remaining after + ! current level of integer expansion + real(r8) :: X_8(max_jlevel) ! r8 vector representation of current + ! i8_arr_gsum_level + real(r8) :: RX_8 ! r8 representation of difference + ! between current i8_arr_gsum_level + ! and current jlevels of X_8 + ! (== IX_8). Also used in final + ! scaling step + + logical :: first ! flag used to indicate that just + ! beginning reconstruction of sum + ! from integer vector + + !WHL - debug + integer :: mypid, k + call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) + + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) 'In cism_reprosum_int, pid, arr =', mypid, arr(:,:) + write(iulog,*) 'dsummands, nsummands =', dsummands, nsummands + write(iulog,*) 'size(arr) =', size(arr,1), size(arr,2) + endif + +! +!----------------------------------------------------------------------- +! Save radix of i8 variables in an i8 variable + i8_radix = radix(IX_8) + +! If validating upper bounds, reserve space for validation metrics +! In both cases, reserve an extra level for overflow from the top level + if (validate) then + voffset = 3 + else + voffset = 1 + endif + + ! compute offsets for each field + offset(1) = voffset + do ifld=2,nflds + offset(ifld) = offset(ifld-1) & + + (max_levels(ifld-1) + voffset) + enddo + veclth = offset(nflds) + max_levels(nflds) + +! split summand index range over OpenMP threads +! WHL - Should this be dsummands instead of nsummands? +!! call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) + call split_indices(dsummands, omp_nthreads, isum_beg, isum_end) + + if (verbose_reprosum .and. 
mypid == 0) then + write(iulog,*) 'pid, i8_radix, voffset, veclth, nflds =', mypid, i8_radix, voffset, veclth, nflds + write(iulog,*) ' max_levels =', max_levels(:) + write(iulog,*) ' dsummands, omp_nthreads =', dsummands, omp_nthreads + write(iulog,*) ' isum_beg/end =', isum_beg, isum_end +! write(iulog,*) ' size(i8_arr_lsum_level) = ', size(i8_arr_lsum_level) +! write(iulog,*) ' size(i8_arr_gsum_level) = ', size(i8_arr_gsum_level) + endif + + +! convert local summands to vector of integers and sum +! (Using scale instead of set_exponent because arr_remainder may not be +! "normal" after level 1 calculation) + i8_arr_lsum_level(:) = 0_i8 + + !WHLmod - initialized the gsum also. + ! This avoids having some garbage (lots of nonzero values) in the vector returned from mpi_reduce_sum. + ! (Though I don't think the nonzero values do any harm + i8_arr_gsum_level(:) = 0_i8 + !end WHLmod + +!$omp parallel do & +!$omp default(shared) & +!$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, & +!$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8) + do ithread=1,omp_nthreads +#ifdef TIMING + call t_startf('repro_sum_loopb') +#endif + do ifld=1,nflds + ioffset = offset(ifld) + + max_error(ifld,ithread) = 0 + not_exact(ifld,ithread) = 0 + i8_arr_tlsum_level(:,ifld,ithread) = 0_i8 + + if (skip_field(ifld)) cycle + + do isum=isum_beg(ithread),isum_end(ithread) + + if (verbose_reprosum .and. mypid == 1) then + write (iulog,*) ' pid, isum, ifld, arr(isum,ifld) =', mypid, isum, ifld, arr(isum,ifld) + endif + + arr_remainder = 0.0_r8 + + !WHL - debug + ! If isum_end > size(arr,1) = dsummands, then the next 'if' will try to access nonexistent memory. + ! Adding some logic to exit the do loop first. + ! Not necessary if split_indices is called with dsummands + !TODO: Check the split_indices logic for isum_end +!!!! if (isum > size(arr,1)) exit + + if (arr(isum,ifld) .ne. 
0.0_r8) then + arr_exp = exponent(arr(isum,ifld)) + arr_frac = fraction(arr(isum,ifld)) + + if (verbose_reprosum) then +!!! write(iulog,*) ' pid, isum, arr_exp, arr_frac:', mypid, isum, arr_exp, arr_frac + endif + +! test that global maximum upper bound is an upper bound + if (arr_exp > arr_gmax_exp(ifld)) then + max_error(ifld,ithread) = 1 + exit + endif + +! calculate first shift + arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) + +! determine first (probably) nonzero level (assuming initial fraction is +! 'normal' - algorithm still works if this is not true) +! NOTE: this is critical; scale will set to zero if min exponent is too small. + if (arr_shift < 1) then + ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift + arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) + + do while (arr_shift < 1) + arr_shift = arr_shift + arr_max_shift + ilevel = ilevel + 1 + enddo + else + ilevel = 1 + endif + + if (verbose_reprosum .and. mypid == 1) then + write (iulog,*) ' pid, ilevel, max_levels =', mypid, ilevel, max_levels(ifld) + endif + + if (ilevel .le. max_levels(ifld)) then +! apply first shift/truncate, add it to the relevant running +! sum, and calculate the remainder. + arr_remainder = scale(arr_frac,arr_shift) + i8_arr_level = int(arr_remainder,i8) + i8_arr_tlsum_level(ilevel,ifld,ithread) = & + i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level + arr_remainder = arr_remainder - i8_arr_level + + if (verbose_reprosum .and. mypid <= 1) then + write(iulog,*) ' pid, arr_old, i8_arr_level, arr_new:', & + mypid, scale(arr_frac,arr_shift), i8_arr_level, arr_remainder + endif + +! while the remainder is non-zero, continue to shift, truncate, +! sum, and calculate new remainder + do while ((arr_remainder .ne. 0.0_r8) & + .and. (ilevel < max_levels(ifld))) + ilevel = ilevel + 1 + if (verbose_reprosum .and. 
mypid <= 1) then + write(iulog,*) ' pid, ilevel, arr_old =', mypid, ilevel, arr_remainder + endif + arr_remainder = scale(arr_remainder,arr_max_shift) + i8_arr_level = int(arr_remainder,i8) + i8_arr_tlsum_level(ilevel,ifld,ithread) = & + i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level + + if (verbose_reprosum .and. mypid <= 1) then + write(iulog,*) ' pid, arr_scaled, i8_arr_level:', & + mypid, arr_remainder, i8_arr_level + endif + + arr_remainder = arr_remainder - i8_arr_level + + enddo ! arr_remainder /= 0.0_r8 + + endif + endif ! arr(isum,ifld) /= 0.0_r8 + + if (arr_remainder .ne. 0.0_r8) then + not_exact(ifld,ithread) = 1 + endif + + enddo ! isum + +! postprocess integer vector to eliminate potential for overlap in the following +! sums over threads and processes: if value larger than or equal to +! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in +! vector, resulting in nonoverlapping ranges for each component. Note that +! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums +! over threads and processes do not overflow for ilevel==1. + do ilevel=max_levels(ifld),1,-1 + RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread) + IX_8 = int(scale(RX_8,-arr_max_shift),i8) + if (IX_8 .ne. 0_i8) then + i8_arr_tlsum_level(ilevel-1,ifld,ithread) = & + i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8 + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_arr_tlsum_level(ilevel,ifld,ithread) = & + i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8 + endif + enddo + enddo +#ifdef TIMING + call t_stopf('repro_sum_loopb') +#endif + enddo ! ithread + +! sum contributions from different threads + do ifld=1,nflds + + if (verbose_reprosum .and. mypid == 0) then +!! write(iulog,*) 'ifld =', ifld + endif + + ioffset = offset(ifld) + do ithread = 1,omp_nthreads + do ilevel = 0,max_levels(ifld) + i8_arr_lsum_level(ioffset+ilevel) = & + i8_arr_lsum_level(ioffset+ilevel) & + + i8_arr_tlsum_level(ilevel,ifld,ithread) + + if (verbose_reprosum) then +!! 
write(iulog,*) ' pid, ilevel, ioffset, i8_arr_lsum_level =', & +!! mypid, ilevel, i8_arr_lsum_level(ioffset+ilevel) + endif + + enddo + enddo + enddo + +! record if upper bound was inaccurate or if level expansion stopped +! before full accuracy was achieved + if (validate) then + do ifld=1,nflds + ioffset = offset(ifld) + i8_arr_lsum_level(ioffset-voffset+1) = maxval(max_error(ifld,:)) + i8_arr_lsum_level(ioffset-voffset+2) = maxval(not_exact(ifld,:)) + enddo + endif + +! sum integer vector element-wise +#if ( defined noI8 ) + ! Workaround for when shr_kind_i8 is not supported. +#ifdef TIMING + call t_startf("repro_sum_allr_i4") +#endif + call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & + veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr) + +#ifdef TIMING + call t_stopf("repro_sum_allr_i4") +#endif +#else +#ifdef TIMING + call t_startf("repro_sum_allr_i8") +#endif + + call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & + veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr) + + if (verbose_reprosum .and. mypid < 5) then + do k = 1, size(i8_arr_lsum_level) + if (i8_arr_lsum_level(k) /= 0) then + write(iulog,*) 'pid, k, i8_arr_lsum_level =', mypid, k, i8_arr_lsum_level(k) + endif + enddo + endif + + if (verbose_reprosum .and. mypid == 0) then + do k = 1, size(i8_arr_gsum_level) + if (i8_arr_gsum_level(k) /= 0) then + write(iulog,*) 'pid, k, i8_arr_gsum_level =', mypid, k, i8_arr_gsum_level(k) + endif + enddo + endif + +#ifdef TIMING + call t_stopf("repro_sum_allr_i8") +#endif +#endif + +! Construct global sum from integer vector representation: +! 1) arr_max_shift is the shift applied to fraction(arr_gmax) . +! When shifting back, need to "add back in" true arr_gmax exponent. This was +! removed implicitly by working only with the fraction . +! 2) want to add levels into sum in reverse order (smallest to largest). However, +! even this can generate floating point rounding errors if signs of integers +! alternate. 
To avoid this, do some arithmetic with integer vectors so that all +! components have the same sign. This should keep relative difference between +! using different integer sizes (e.g. i8 and i4) to machine epsilon +! 3) assignment to X_8 will usually lose accuracy since maximum integer +! size is greater than the max number of 'digits' in r8 value (if xmax_nsummands +! correction not very large). Calculate remainder and add in first (since +! smaller). One correction is sufficient for r8 (53 digits) and i8 (63 digits). +! For r4 (24 digits) may need to correct twice. Code is written in a general +! fashion, to work no matter how many corrections are necessary (assuming +! max_jlevel parameter calculation is correct). + + recompute = .false. + do ifld=1,nflds + arr_gsum(ifld) = 0.0_r8 + ioffset = offset(ifld) + +! if validate is .true., test whether the summand upper bound +! was exceeded on any of the processes + if (validate) then + if (i8_arr_gsum_level(ioffset-voffset+1) .ne. 0_i8) then + recompute = .true. + endif + endif + + if (.not. recompute) then + +! preprocess integer vector: +! a) if value larger than or equal to (radix(IX_8)**arr_max_shift), add this 'overlap' +! to next larger integer in vector, resulting in nonoverlapping ranges for each +! component. Note that have "ilevel-1=0" level here as described above. + do ilevel=max_levels(ifld),1,-1 + RX_8 = i8_arr_gsum_level(ioffset+ilevel) + IX_8 = int(scale(RX_8,-arr_max_shift),i8) + if (IX_8 .ne. 0_i8) then + i8_arr_gsum_level(ioffset+ilevel-1) = i8_arr_gsum_level(ioffset+ilevel-1) & + + IX_8 + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_arr_gsum_level(ioffset+ilevel) = i8_arr_gsum_level(ioffset+ilevel) & + - IX_8 + endif + enddo +! b) subtract +/- 1 from larger and add +/- 1 to smaller when necessary +! so that all vector components have the same sign (eliminating loss +! of accuracy arising from difference of large values when +! 
reconstructing r8 sum from integer vector) + ilevel = 0 + do while ((i8_arr_gsum_level(ioffset+ilevel) .eq. 0_i8) & + .and. (ilevel < max_levels(ifld))) + ilevel = ilevel + 1 + enddo +! + if (ilevel < max_levels(ifld)) then + if (i8_arr_gsum_level(ioffset+ilevel) > 0_i8) then + i8_sign = 1_i8 + else + i8_sign = -1_i8 + endif + do jlevel=ilevel,max_levels(ifld)-1 + if (sign(1_i8,i8_arr_gsum_level(ioffset+jlevel)) & + .ne. sign(1_i8,i8_arr_gsum_level(ioffset+jlevel+1))) then + i8_arr_gsum_level(ioffset+jlevel) = i8_arr_gsum_level(ioffset+jlevel) & + - i8_sign + i8_arr_gsum_level(ioffset+jlevel+1) = i8_arr_gsum_level(ioffset+jlevel+1) & + + i8_sign*(i8_radix**arr_max_shift) + endif + enddo + endif + +! start with maximum shift, and work up to larger values + arr_shift = arr_gmax_exp(ifld) & + - max_levels(ifld)*arr_max_shift + curr_exp = 0 + first = .true. + do ilevel=max_levels(ifld),0,-1 + + if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then + jlevel = 1 + +! r8 representation of higher order bits in integer + X_8(jlevel) = i8_arr_gsum_level(ioffset+ilevel) + LX(jlevel) = exponent(X_8(jlevel)) + +! calculate remainder + IX_8 = int(X_8(jlevel),i8) + RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8) + +! repeat using remainder + do while (RX_8 .ne. 0.0_r8) + jlevel = jlevel + 1 + X_8(jlevel) = RX_8 + LX(jlevel) = exponent(RX_8) + IX_8 = IX_8 + int(RX_8,i8) + RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8) + enddo + +! add in contributions, smaller to larger, rescaling for each +! addition to guarantee that exponent of working summand is always +! larger than minexponent + do while (jlevel > 0) + if (first) then + curr_exp = LX(jlevel) + arr_shift + arr_gsum(ifld) = fraction(X_8(jlevel)) + first = .false. 
+ else + corr_exp = curr_exp - (LX(jlevel) + arr_shift) + arr_gsum(ifld) = fraction(X_8(jlevel)) & + + scale(arr_gsum(ifld),corr_exp) + curr_exp = LX(jlevel) + arr_shift + endif + jlevel = jlevel - 1 + enddo + + endif + + arr_shift = arr_shift + arr_max_shift + enddo + +! apply final exponent correction, scaling first if exponent is too small +! to apply directly + corr_exp = curr_exp + exponent(arr_gsum(ifld)) + if (corr_exp .ge. MINEXPONENT(1._r8)) then + arr_gsum(ifld) = set_exponent(arr_gsum(ifld),corr_exp) + else + RX_8 = set_exponent(arr_gsum(ifld), & + corr_exp-MINEXPONENT(1._r8)) + arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8)) + endif + + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) ' ifld, arr_gsum =', ifld, arr_gsum(ifld) + endif + +! if validate is .true. and some precision lost, test whether 'too much' +! was lost, due to too loose an upper bound, too stringent a limit on number +! of levels of expansion, cancellation, .... Calculated by comparing lower +! bound on number of sigificant digits with number of digits in 1.0_r8 . + if (validate) then + if (i8_arr_gsum_level(ioffset-voffset+2) .ne. 0_i8) then + +! find first nonzero level and use exponent for this level, then assume all +! subsequent levels contribute arr_max_shift digits. + sum_digits = 0 + do ilevel=0,max_levels(ifld) + if (sum_digits .eq. 0) then + if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then + X_8(1) = i8_arr_gsum_level(ioffset+ilevel) + LX(1) = exponent(X_8(1)) + sum_digits = LX(1) + endif + else + sum_digits = sum_digits + arr_max_shift + endif + enddo + + if (sum_digits < digits(1.0_r8)) then + recompute = .true. + endif + endif + endif + + endif + + enddo + + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) ' Done in cism_reprosum_int, pid =', mypid + k = 1 + write(iulog,*) ' arr_gsum, exp, frac =', arr_gsum(k), & + exponent(arr_gsum(k)), fraction(arr_gsum(k)) + endif + + end subroutine cism_reprosum_int + +! 
+!======================================================================== +! + + logical function cism_reprosum_tolExceeded (name, nflds, master, & + logunit, rel_diff ) +!---------------------------------------------------------------------- +! +! Purpose: +! Test whether distributed sum exceeds tolerance and print out a +! warning message. +! +!---------------------------------------------------------------------- +! +! Arguments +! + character(len=*), intent(in) :: name ! distributed sum identifier + integer, intent(in) :: nflds ! number of fields + logical, intent(in) :: master ! process that will write + ! warning messages? + integer, optional, intent(in) :: logunit! unit warning messages + ! written to + real(r8), intent(in) :: rel_diff(2,nflds) + ! relative and absolute + ! differences between fixed + ! and floating point sums + +! +! Local workspace +! + integer :: llogunit ! local log unit + integer :: ifld ! field index + integer :: exceeds_limit ! number of fields whose + ! sum exceeds tolerance + real(r8) :: max_rel_diff ! maximum relative difference + integer :: max_rel_diff_idx ! field index for max. rel. diff. + real(r8) :: max_abs_diff ! maximum absolute difference + integer :: max_abs_diff_idx ! field index for max. abs. diff. +! +!----------------------------------------------------------------------- +! + cism_reprosum_tolExceeded = .false. + if (cism_reprosum_reldiffmax < 0.0_r8) return + + if ( present(logunit) ) then + llogunit = logunit + else +!! llogunit = s_logunit + llogunit = iulog + endif + + ! check that "fast" reproducible sum is accurate enough. 
+ exceeds_limit = 0 + max_rel_diff = 0.0_r8 + max_abs_diff = 0.0_r8 + max_rel_diff_idx = 0 + do ifld=1,nflds + if (rel_diff(1,ifld) > cism_reprosum_reldiffmax) then + exceeds_limit = exceeds_limit + 1 + if (rel_diff(1,ifld) > max_rel_diff) then + max_rel_diff = rel_diff(1,ifld) + max_rel_diff_idx = ifld + endif + if (rel_diff(2,ifld) > max_abs_diff) then + max_abs_diff = rel_diff(2,ifld) + max_abs_diff_idx = ifld + endif + endif + enddo + + if (exceeds_limit > 0) then + if (master) then + write(llogunit,*) trim(name), & + ': difference in fixed and floating point sums ', & + ' exceeds tolerance in ', exceeds_limit, & + ' fields.' + write(llogunit,*) ' Maximum relative diff: (rel)', & + rel_diff(1,max_rel_diff_idx), ' (abs) ', & + rel_diff(2,max_rel_diff_idx) + write(llogunit,*) ' Maximum absolute diff: (rel)', & + rel_diff(1,max_abs_diff_idx), ' (abs) ', & + rel_diff(2,max_abs_diff_idx) + endif + cism_reprosum_tolExceeded = .true. + endif + + + end function cism_reprosum_tolExceeded + +! +!======================================================================== +! + + subroutine cism_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & + nflds, mpi_comm ) +!---------------------------------------------------------------------- +! +! Purpose: +! Compute the global sum of each field in "arr" using the indicated +! communicator with a reproducible yet scalable implementation based +! on He and Ding's implementation of the double-double algorithm. +! +!---------------------------------------------------------------------- +! +! Arguments +! + integer, intent(in) :: nsummands ! number of local summands + integer, intent(in) :: dsummands ! declared first dimension + integer, intent(in) :: nflds ! number of fields + real(r8), intent(in) :: arr(dsummands,nflds) + ! input array + integer, intent(in) :: mpi_comm ! MPI subcommunicator + + real(r8), intent(out):: arr_gsum(nflds) + ! global sums + +! +! Local workspace +! + integer :: old_cw ! for x86 processors, save + ! 
current arithmetic mode + integer :: ifld, isum ! loop variables + integer :: ierr ! MPI error return + + real(r8) :: e, t1, t2 ! temporaries + complex(r8) :: arr_lsum_dd(nflds) ! local sums (in double-double + ! format) + complex(r8) :: arr_gsum_dd(nflds) ! global sums (in double-double + ! format) + + integer, save :: mpi_sumdd + logical, save :: first_time = .true. + +! +!----------------------------------------------------------------------- +! + !WHL - This is a C routine in the CESM shared code: shr_reprosumx86.c + ! Commented out the call. + ! Not sure whether this breaks the algorithm. +!! call cism_reprosumx86_fix_start (old_cw) + + if (first_time) then + call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr) + first_time = .false. + endif + + do ifld=1,nflds + arr_lsum_dd(ifld) = (0.0_r8,0.0_r8) + + do isum=1,nsummands + + ! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s + ! trick. + t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld)) + e = t1 - arr(isum,ifld) + t2 = ((real(arr_lsum_dd(ifld)) - e) & + + (arr(isum,ifld) - (t1 - e))) & + + aimag(arr_lsum_dd(ifld)) + + ! The result is t1 + t2, after normalization. + arr_lsum_dd(ifld) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) + enddo + + enddo +#ifdef TIMING + call t_startf("repro_sum_allr_c16") +#endif + call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, & + MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr) +#ifdef TIMING + call t_stopf("repro_sum_allr_c16") +#endif + do ifld=1,nflds + arr_gsum(ifld) = real(arr_gsum_dd(ifld)) + enddo + + !WHL - commented out; see comment above +!! call cism_reprosumx86_fix_end (old_cw) + + end subroutine cism_reprosum_ddpdd +! +!----------------------------------------------------------------------- +! + subroutine DDPDD (dda, ddb, len, itype) +!---------------------------------------------------------------------- +! +! Purpose: +! Modification of original codes written by David H. Bailey +! This subroutine computes ddb(i) = dda(i)+ddb(i) +! 
+!---------------------------------------------------------------------- +! +! Arguments +! + integer, intent(in) :: len ! array length + complex(r8), intent(in) :: dda(len) ! input + complex(r8), intent(inout) :: ddb(len) ! result + integer, intent(in) :: itype ! unused +! +! Local workspace +! + real(r8) e, t1, t2 + integer i +! +!----------------------------------------------------------------------- +! + do i = 1, len +! Compute dda + ddb using Knuth's trick. + t1 = real(dda(i)) + real(ddb(i)) + e = t1 - real(dda(i)) + t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) & + + aimag(dda(i)) + aimag(ddb(i)) + +! The result is t1 + t2, after normalization. + ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) + enddo + + + end subroutine DDPDD +! +!----------------------------------------------------------------------- +! + subroutine split_indices(total,num_pieces,ibeg,iend) +!---------------------------------------------------------------------- +! +! Purpose: +! Split range into 'num_pieces' +! +!---------------------------------------------------------------------- +! +! Arguments +! + integer, intent(in) :: total + integer, intent(in) :: num_pieces + integer, intent(out) :: ibeg(num_pieces), iend(num_pieces) +! +! Local workspace +! + integer :: itmp1, itmp2, ioffset, i +! +!----------------------------------------------------------------------- +! + itmp1 = total/num_pieces + itmp2 = mod(total,num_pieces) + ioffset = 0 + do i=1,itmp2 + ibeg(i) = ioffset + 1 + iend(i) = ioffset + (itmp1+1) + ioffset = iend(i) + enddo + do i=itmp2+1,num_pieces + ibeg(i) = ioffset + 1 + if (ibeg(i) > total) then + iend(i) = ibeg(i) - 1 + else + iend(i) = ioffset + itmp1 + ioffset = iend(i) + endif + enddo + + end subroutine split_indices +! +!======================================================================== +! 
+end module cism_reprosum_mod diff --git a/libglimmer/glimmer_global.F90 b/libglimmer/glimmer_global.F90 index e9eb745d..0aefadee 100644 --- a/libglimmer/glimmer_global.F90 +++ b/libglimmer/glimmer_global.F90 @@ -30,12 +30,12 @@ module glimmer_global - !> Module holding global variables for Glimmer. Holds real-type - !> kind values, and other global code parameters. + !> Module holding global variables for Glimmer. + !> Holds real and integer kind values and other global code parameters. implicit none - integer,parameter :: sp = kind(1.0) + integer,parameter :: sp = kind(1.0) !> Single precision --- Fortran single-precision real-type kind !> value. Used internally. @@ -44,7 +44,7 @@ module glimmer_global !> the -r8 flag), then this parameter may need to be set in agreement with !> that. - integer,parameter :: dp = kind(1.0d0) + integer,parameter :: dp = kind(1.0d0) !> Double precision --- Fortran double-precision real-type kind !> value. Used internally. @@ -53,6 +53,12 @@ module glimmer_global !> the -r8 flag), then this parameter may need to be set in agreement !> with that + ! Integer kinds + ! Note: Integers are i4 by default. + ! i8 integers can be used to generate reproducible sums + integer, parameter :: i4 = kind(1) + integer, parameter :: i8 = kind(1_8) + !WHL - Removed rk from the code, so commenting out these declarations !!#ifdef GLIMMER_SP !! integer,parameter :: rk=sp !< Precision of glimmer module --- the general Fortran real-type kind value for the Glimmer module and its interfaces. diff --git a/libglimmer/ncdf_template.F90.in b/libglimmer/ncdf_template.F90.in index 177c5e29..378b18b4 100644 --- a/libglimmer/ncdf_template.F90.in +++ b/libglimmer/ncdf_template.F90.in @@ -437,14 +437,14 @@ contains ic=>model%funits%frc_first do while(associated(ic)) -! if (main_task .and. verbose_read_forcing) write(6,*) 'possible forcing times', ic%times +! if (main_task .and. 
verbose_read_forcing) write(iulog,*) 'possible forcing times', ic%times if (ic%read_once) then ! read once at initialization; do not re-read at runtime ic%nc%just_processed = .true. ! prevent the file from being read if (main_task .and. verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_read_forcing; will not re-read the read_once file ', trim(ic%nc%filename) + write(iulog,*) ' ' + write(iulog,*) 'In NAME_read_forcing; will not re-read the read_once file ', trim(ic%nc%filename) endif else ! not a read_once file @@ -464,11 +464,11 @@ contains endif if (main_task .and. verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_read_forcing, model time + eps =', model%numerics%time + eps - write(6,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset - write(6,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle - write(6,*) 'current forcing time =', current_forcing_time + write(iulog,*) ' ' + write(iulog,*) 'In NAME_read_forcing, model time + eps =', model%numerics%time + eps + write(iulog,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset + write(iulog,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle + write(iulog,*) 'current forcing time =', current_forcing_time endif ! Find the time index associated with the previous model time step @@ -476,7 +476,7 @@ contains do t = ic%nt, 1, -1 ! look through the time array backwards if (ic%times(t) <= current_forcing_time - model%numerics%tinc) then t_prev = t - if (main_task .and. verbose_read_forcing) write(6,*) 'Previous time index =', t_prev + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Previous time index =', t_prev exit end if enddo @@ -486,7 +486,7 @@ contains if ( ic%times(t) <= current_forcing_time) then ! use the largest time that is smaller or equal to the current time (stepwise forcing) if (main_task .and. 
verbose_read_forcing) & - write(6,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) + write(iulog,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) ! If this time index (t) is larger than the previous index (t_prev), then read a new time slice. ! Otherwise, we already have the current slice, and there is nothing new to read. @@ -494,7 +494,7 @@ contains ! Set the desired time to be read ic%current_time = t ic%nc%just_processed = .false. ! set this to false so file will be read. - if (main_task .and. verbose_read_forcing) write(6,*) 'Read new forcing slice: t, times(t) =', t, ic%times(t) + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Read new forcing slice: t, times(t) =', t, ic%times(t) endif ! t > t_prev exit ! once we find the time, exit the loop @@ -548,10 +548,10 @@ contains if (ic%read_once) then if (main_task .and. verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_read_forcing_once' - write(6,*) 'Filename =', trim(ic%nc%filename) - write(6,*) 'Number of slices =', ic%nt + write(iulog,*) ' ' + write(iulog,*) 'In NAME_read_forcing_once' + write(iulog,*) 'Filename =', trim(ic%nc%filename) + write(iulog,*) 'Number of slices =', ic%nt endif write(message,*) 'Reading', ic%nt, 'slices of file ', trim(ic%nc%filename), ' just once at initialization' @@ -568,7 +568,7 @@ contains do t = 1, ic%nt if (main_task .and. verbose_read_forcing) then - write(6,*) 'Read new forcing slice: t index, times(t) =', t, ic%times(t) + write(iulog,*) 'Read new forcing slice: t index, times(t) =', t, ic%times(t) endif ! Set the desired time to be read @@ -588,7 +588,7 @@ contains endif ! read_once if (main_task .and. verbose_read_forcing) then - write(6,*) 'Final ic%nc%vars = ', trim(ic%nc%vars) + write(iulog,*) 'Final ic%nc%vars = ', trim(ic%nc%vars) endif ic=>ic%next @@ -650,13 +650,13 @@ contains endif if (main_task .and. 
verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_retrieve_forcing, model time + eps =', model%numerics%time + eps - write(6,*) 'Filename = ', trim(ic%nc%filename) - write(6,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset - write(6,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle - write(6,*) 'current forcing time =', current_forcing_time - write(6,*) 'variable list:', trim(ic%nc%vars) + write(iulog,*) ' ' + write(iulog,*) 'In NAME_retrieve_forcing, model time + eps =', model%numerics%time + eps + write(iulog,*) 'Filename = ', trim(ic%nc%filename) + write(iulog,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset + write(iulog,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle + write(iulog,*) 'current forcing time =', current_forcing_time + write(iulog,*) 'variable list:', trim(ic%nc%vars) endif ! Optionally, associate the current forcing time with a different date in the forcing file. @@ -669,8 +669,8 @@ contains open(unit=11, file=trim(ic%shuffle_file), status='old') this_year = int(current_forcing_time - model%numerics%tstart) if (main_task .and. verbose_read_forcing) then - write(6,*) 'shuffle_file = ', trim(ic%shuffle_file) - write(6,*) 'tstart, this_year =', model%numerics%tstart, this_year + write(iulog,*) 'shuffle_file = ', trim(ic%shuffle_file) + write(iulog,*) 'tstart, this_year =', model%numerics%tstart, this_year endif forcing_year = 0 do while (forcing_year == 0) @@ -684,11 +684,11 @@ contains decimal_year = current_forcing_time - floor(current_forcing_time) current_forcing_time = real(forcing_year,dp) + decimal_year if (main_task .and. verbose_read_forcing) then - write(6,*) 'forcing_year, decimal =', forcing_year, decimal_year - write(6,*) 'shuffled forcing_time =', current_forcing_time + write(iulog,*) 'forcing_year, decimal =', forcing_year, decimal_year + write(iulog,*) 'shuffled forcing_time =', current_forcing_time endif else - if (main_task .and. 
verbose_read_forcing) write(6,*) 'no shuffle_file' + if (main_task .and. verbose_read_forcing) write(iulog,*) 'no shuffle_file' endif ! shuffle_file exists ! Find the time index associated with the previous model time step @@ -696,7 +696,7 @@ contains do t = ic%nt, 1, -1 ! look through the time array backwards if (ic%times(t) <= current_forcing_time - model%numerics%tinc) then t_prev = t - if (main_task .and. verbose_read_forcing) write(6,*) 'Previous time index =', t_prev + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Previous time index =', t_prev exit end if enddo @@ -706,14 +706,14 @@ contains if ( ic%times(t) <= current_forcing_time) then ! use the largest time that is smaller or equal to the current time (stepwise forcing) if (main_task .and. verbose_read_forcing) & - write(6,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) + write(iulog,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) ! If this time index (t) is larger than the previous index (t_prev), then retrieve a new time slice. ! Otherwise, we already have the current slice, and there is nothing new to read. if (t > t_prev) then ! Set the desired time to be read ic%current_time = t retrieve_new_slice = .true. - if (main_task .and. verbose_read_forcing) write(6,*) 'Retrieve new forcing slice' + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Retrieve new forcing slice' write(message,*) & 'Retrieve slice', t, 'at forcing time', ic%times(t), 'from file ', trim(ic%nc%filename) call write_log(message) diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index 39a06994..972b8b32 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -30,6 +30,16 @@ module cism_parallel use glimmer_global, only : dp, sp use glimmer_paramets, only: iulog +!TODO - Not sure setopts is needed +!TODO - Remove coupled ifdefs and always use the CISM version? 
+!TODO - Use cism_reprosum_mod from individual functions +#ifdef CCSMCOUPLED + use shr_reprosum_mod, only: shr_reprosum_setopts, shr_reprosum_calc +#else + use cism_reprosum_mod, only: cism_reprosum_setopts, cism_reprosum_calc +#endif + use cism_reprosum_mod, only: verbose_reprosum + implicit none ! integers associated with the main global communicator @@ -144,6 +154,10 @@ module cism_parallel integer :: main_rank_col ! integer ID for the master task on the column logical :: main_task_col ! true if this_rank_col = main_rank_col + ! option to compute reproducible sums + logical :: reprosum ! if true, compute reproducible global sums + ! (interface parallel_reduce_sum) + end type parallel_type ! Information on the local & global bounds of an array @@ -291,17 +305,21 @@ module cism_parallel interface parallel_global_sum module procedure parallel_global_sum_integer_2d module procedure parallel_global_sum_integer_3d - module procedure parallel_global_sum_real4_2d module procedure parallel_global_sum_real8_2d module procedure parallel_global_sum_real8_3d end interface - interface parallel_global_sum_staggered - module procedure parallel_global_sum_staggered_3d_real8 - module procedure parallel_global_sum_staggered_3d_real8_nvar - module procedure parallel_global_sum_staggered_2d_real8 - module procedure parallel_global_sum_staggered_2d_real8_nvar - end interface parallel_global_sum_staggered + interface parallel_global_sum_patch + module procedure parallel_global_sum_patch_integer_2d + module procedure parallel_global_sum_patch_real8_2d + end interface parallel_global_sum_patch + + interface parallel_global_sum_stagger + module procedure parallel_global_sum_stagger_real8_2d + module procedure parallel_global_sum_stagger_real8_3d + module procedure parallel_global_sum_stagger_real8_2d_nflds + module procedure parallel_global_sum_stagger_real8_3d_nflds + end interface parallel_global_sum_stagger interface parallel_halo module procedure parallel_halo_integer_2d @@ -2427,7 
+2445,9 @@ end function distributed_get_var_real8_3d subroutine distributed_grid(ewn, nsn, & parallel, & - nhalo_in, global_bc_in) + nhalo_in, & + global_bc_in, & + reprosum_in) ! Divide the global domain into blocks, with one task per block. ! Set various grid and domain variables for the local task. @@ -2437,6 +2457,7 @@ subroutine distributed_grid(ewn, nsn, & type(parallel_type), intent(inout) :: parallel ! info for parallel communication, computed here integer, intent(in), optional :: nhalo_in ! number of rows of halo cells character(*), intent(in), optional :: global_bc_in ! string indicating the global BC option + logical, intent(in), optional :: reprosum_in ! if true, compute reproducible global sums integer :: best,i,j,metric real(dp) :: rewtasks,rnstasks @@ -2479,7 +2500,8 @@ subroutine distributed_grid(ewn, nsn, & staggered_ilo => parallel%staggered_ilo, & staggered_ihi => parallel%staggered_ihi, & staggered_jlo => parallel%staggered_jlo, & - staggered_jhi => parallel%staggered_jhi & + staggered_jhi => parallel%staggered_jhi, & + reprosum => parallel%reprosum & ) ! set the boundary conditions (periodic by default) @@ -2665,6 +2687,25 @@ subroutine distributed_grid(ewn, nsn, & call parallel_stop(__FILE__, __LINE__) endif + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + + ! If computing reproducible sums, then set some options + !TODO - Are these saved from one call to the next? + ! Note: For standalone CISM, reprosum = F by default; can set = T in the config file + ! For CESM coupled runs, reprosum = T by default + if (reprosum) then + +#ifdef CCSM_COUPLED +!! call shr_reprosum_setops() +#else +!! call cism_reprosum_setops() +#endif + endif ! reprosum + ! call parallel_barrier ! 
write(iulog,*) 'task, west, east, south, north:', this_rank, west, east, south, north @@ -2693,6 +2734,7 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & nx_block, ny_block, & ice_domain_mask, & parallel, & + reprosum_in, & inquire_only) ! Divide the global domain into blocks, setting various grid and domain variables @@ -2731,6 +2773,7 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & integer, intent(in), dimension(:,:) :: & ice_domain_mask ! = 1 where ice is potentially present and active, else = 0 type(parallel_type), intent(inout) :: parallel ! info for parallel communication, computed here + logical, intent(in), optional :: reprosum_in ! if true, compute reproducible global sums logical, intent(in), optional :: inquire_only ! if true, then report the number of active blocks and abort integer :: i, j, nb, nt @@ -2809,7 +2852,8 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & staggered_ilo => parallel%staggered_ilo, & staggered_ihi => parallel%staggered_ihi, & staggered_jlo => parallel%staggered_jlo, & - staggered_jhi => parallel%staggered_jhi & + staggered_jhi => parallel%staggered_jhi, & + reprosum => parallel%reprosum & ) if (present(inquire_only)) then @@ -3283,7 +3327,28 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & ! southwest_corner, southeast_corner, northwest_corner, northeast_corner endif - ! Uncomment to print grid geometry + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + + ! If computing reproducible sums, then set some options + !TODO - Are these saved from one call to the next? + ! Note: For standalone CISM, reprosum = F by default; can set = T in the config file + ! For CESM coupled runs, reprosum = T by default + + if (reprosum) then + +#ifdef CCSM_COUPLED +!! call shr_reprosum_setops() +#else +!! call cism_reprosum_setops() +#endif + + endif ! reprosum + +! Uncomment to print grid geometry ! write(iulog,*) " " ! 
write(iulog,*) "Process ", this_rank, " Total = ", tasks, " ewtasks = ", ewtasks, " nstasks = ", nstasks ! write(iulog,*) "Process ", this_rank, " ewrank = ", ewrank, " nsrank = ", nsrank @@ -6082,18 +6147,18 @@ end function parallel_global_sum_integer_3d !======================================================================= - function parallel_global_sum_real4_2d(a, parallel, mask_2d) + function parallel_global_sum_real8_2d(a, parallel, mask_2d) - ! Calculates the global sum of a 2D single-precision field + ! Calculates the global sum of a 2D double-precision field - real(sp),dimension(:,:),intent(in) :: a + real(dp), dimension(:,:), intent(in) :: a type(parallel_type) :: parallel integer, dimension(:,:), intent(in), optional :: mask_2d integer :: i, j integer, dimension(parallel%local_ewn,parallel%local_nsn) :: mask - real(sp) :: local_sum - real(sp) :: parallel_global_sum_real4_2d + real(dp) :: local_sum + real(dp) :: parallel_global_sum_real8_2d associate( & local_ewn => parallel%local_ewn, & @@ -6105,130 +6170,183 @@ function parallel_global_sum_real4_2d(a, parallel, mask_2d) mask = 1 endif - local_sum = 0.0 - do j = nhalo+1, local_nsn-nhalo - do i = nhalo+1, local_ewn-nhalo - if (mask(i,j) == 1) then - local_sum = local_sum + a(i,j) - endif + if (parallel%reprosum) then ! compute using cism_reprosum_calc + + !TODO - Add the code here + call parallel_stop(__FILE__,__LINE__) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum = 0.0d0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + local_sum = local_sum + a(i,j) + endif + enddo enddo - enddo - parallel_global_sum_real4_2d = parallel_reduce_sum(local_sum) + + ! Compute the global sum + parallel_global_sum_real8_2d = parallel_reduce_sum(local_sum) + + endif ! 
reprosum end associate - end function parallel_global_sum_real4_2d + end function parallel_global_sum_real8_2d !======================================================================= - function parallel_global_sum_real8_2d(a, parallel, mask_2d) + function parallel_global_sum_real8_3d(a, parallel, mask_2d) - ! Calculates the global sum of a 2D double-precision field + ! Calculates the global sum of a 3D double-precision field + ! Note: The vertical dimension should be the first dimension of the input field. - real(dp), dimension(:,:), intent(in) :: a + real(dp), dimension(:,:,:),intent(in) :: a type(parallel_type) :: parallel integer, dimension(:,:), intent(in), optional :: mask_2d - integer :: i, j + integer :: i, j, k + integer :: kmax integer, dimension(parallel%local_ewn,parallel%local_nsn) :: mask real(dp) :: local_sum - real(dp) :: parallel_global_sum_real8_2d + real(dp) :: parallel_global_sum_real8_3d associate( & local_ewn => parallel%local_ewn, & local_nsn => parallel%local_nsn) + kmax = size(a,1) + + ! Note: The mask is 2D, since typically all layers in a column are either masked in or masked out if (present(mask_2d)) then mask = mask_2d else mask = 1 endif - local_sum = 0.0d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc + + !TODO - Add the code here + call parallel_stop(__FILE__,__LINE__) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + do k = 1, kmax + local_sum = local_sum + a(k,i,j) + enddo + endif + enddo + enddo + parallel_global_sum_real8_3d = parallel_reduce_sum(local_sum) + + endif ! reprosum + + end associate + + end function parallel_global_sum_real8_3d + +!======================================================================= + ! subroutines belonging to the parallel_global_sum_patch interface + + function parallel_global_sum_patch_integer_2d(a, npatch, patch_id, parallel) + + ! 
Calculates the global sum of a 2D double-precision field over each + ! user-defined patch of the domain. + ! The number of patches = npatch. + ! Each cell has an integer ID assigning it to at most one patch. + ! If a cell has patch_id = 0, it belongs to no patches. + + integer, dimension(:,:), intent(in) :: a + integer, intent(in) :: npatch + integer, dimension(:,:), intent(in) :: patch_id + type(parallel_type) :: parallel + + integer :: i, j, np + integer, dimension(npatch) :: local_patch_sum + integer, dimension(npatch) :: parallel_global_sum_patch_integer_2d + + associate( & + local_ewn => parallel%local_ewn, & + local_nsn => parallel%local_nsn) + + local_patch_sum = 0 + do j = nhalo+1, local_nsn-nhalo do i = nhalo+1, local_ewn-nhalo - if (mask(i,j) == 1) then - local_sum = local_sum + a(i,j) + np = patch_id(i,j) + if (np > 0) then + local_patch_sum(np) = local_patch_sum(np) + a(i,j) endif enddo enddo - parallel_global_sum_real8_2d = parallel_reduce_sum(local_sum) + + parallel_global_sum_patch_integer_2d = parallel_reduce_sum(local_patch_sum) end associate - end function parallel_global_sum_real8_2d + end function parallel_global_sum_patch_integer_2d !======================================================================= - function parallel_global_sum_real8_3d(a, parallel, mask_3d) + function parallel_global_sum_patch_real8_2d(a, npatch, patch_id, parallel) - ! Calculates the global sum of a 3D double-precision field - ! Note: The vertical dimension should be the first dimension of the input field. + ! Calculates the global sum of a 2D double-precision field over each + ! user-defined patch of the domain. + ! The number of patches = npatch. + ! Each cell has an integer ID assigning it to at most one patch. + ! If a cell has patch_id = 0, it belongs to no patches. 
+ !TODO - Add a reprosum option with npatch = nflds - real(dp), dimension(:,:,:),intent(in) :: a + real(dp), dimension(:,:), intent(in) :: a + integer, intent(in) :: npatch + integer, dimension(:,:), intent(in) :: patch_id type(parallel_type) :: parallel - integer, dimension(:,:,:), intent(in), optional :: mask_3d - integer :: i, j, k - integer :: kmax - integer, dimension(size(a,1),parallel%local_ewn,parallel%local_nsn) :: mask - real(dp) :: local_sum - real(dp) :: parallel_global_sum_real8_3d + integer :: i, j, np + real(dp), dimension(npatch) :: local_patch_sum + real(dp), dimension(npatch) :: parallel_global_sum_patch_real8_2d associate( & local_ewn => parallel%local_ewn, & local_nsn => parallel%local_nsn) - kmax = size(a,1) - - if (present(mask_3d)) then - mask = mask_3d - else - mask = 1 - endif + local_patch_sum = 0.0d0 - local_sum = 0 do j = nhalo+1, local_nsn-nhalo do i = nhalo+1, local_ewn-nhalo - do k = 1, kmax - if (mask(k,i,j) == 1) then - local_sum = local_sum + a(k,i,j) - endif - enddo + np = patch_id(i,j) + if (np > 0) then + local_patch_sum(np) = local_patch_sum(np) + a(i,j) + endif enddo enddo - parallel_global_sum_real8_3d = parallel_reduce_sum(local_sum) + + parallel_global_sum_patch_real8_2d = parallel_reduce_sum(local_patch_sum) end associate - end function parallel_global_sum_real8_3d + end function parallel_global_sum_patch_real8_2d !======================================================================= + ! subroutines belonging to the parallel_global_sum_stagger interface - ! subroutines belonging to the parallel_global_sum_staggered interface - !TODO - Turn these into functions, analogous to the parallel_global_sum functions above. - - subroutine parallel_global_sum_staggered_3d_real8(& - nx, ny, & - nz, parallel, & - global_sum, & - work1, work2) - - ! Sum one or two local arrays on the staggered grid, then take the global sum. - - integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - nz ! 
number of vertical layers at which velocity is computed + function parallel_global_sum_stagger_real8_2d(arr1, parallel, arr2) - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + ! Calculate the global sum of a 2D double-precision field on the staggered grid + ! Similar to unstagged version, except it uses staggered_ilo/ihi/jlo/jhi - real(dp), intent(out) :: global_sum ! global sum - real(dp), intent(in), dimension(nz,nx-1,ny-1) :: work1 ! local array - real(dp), intent(in), dimension(nz,nx-1,ny-1), optional :: work2 ! local array + real(dp), dimension(:,:), intent(in) :: arr1 + type(parallel_type) :: parallel + real(dp), dimension(:,:), intent(in), optional :: arr2 - integer :: i, j, k + integer :: i, j real(dp) :: local_sum + real(dp) :: parallel_global_sum_stagger_real8_2d integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -6239,58 +6357,53 @@ subroutine parallel_global_sum_staggered_3d_real8(& staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - local_sum = 0.d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc - ! sum over locally owned velocity points + !TODO - Add the code here + call parallel_stop(__FILE__,__LINE__) - if (present(work2)) then - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - do k = 1, nz - local_sum = local_sum + work1(k,i,j) + work2(k,i,j) - enddo - enddo - enddo - else - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - do k = 1, nz - local_sum = local_sum + work1(k,i,j) + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum = 0.0d0 + + if (present(arr2)) then ! compute global sum of arr1 + arr2 + + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum = local_sum + arr1(i,j) + arr2(i,j) enddo enddo - enddo - endif - ! take the global sum + else ! 
compute global sum of arr1 - global_sum = parallel_reduce_sum(local_sum) + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum = local_sum + arr1(i,j) + enddo + enddo - end subroutine parallel_global_sum_staggered_3d_real8 + endif -!======================================================================= + parallel_global_sum_stagger_real8_2d = parallel_reduce_sum(local_sum) - subroutine parallel_global_sum_staggered_3d_real8_nvar(& - nx, ny, & - nz, parallel, & - global_sum, & - work1, work2) + endif ! reprosum - ! Sum one or two local arrays on the staggered grid, then take the global sum. + end function parallel_global_sum_stagger_real8_2d - integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - nz ! number of vertical layers at which velocity is computed +!======================================================================= - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + function parallel_global_sum_stagger_real8_3d(arr1, parallel, arr2) - real(dp), intent(out), dimension(:) :: global_sum ! global sum + ! Calculate the global sum of a 3D double-precision field on the staggered grid + ! Assumes k is the first index, followed by i and j - real(dp), intent(in), dimension(nz,nx-1,ny-1,size(global_sum)) :: work1 ! local array - real(dp), intent(in), dimension(nz,nx-1,ny-1,size(global_sum)), optional :: work2 ! local array + real(dp), dimension(:,:,:), intent(in) :: arr1 + type(parallel_type) :: parallel + real(dp), dimension(:,:,:), intent(in), optional :: arr2 - integer :: i, j, k, n, nvar - real(dp), dimension(size(global_sum)) :: local_sum + integer :: i, j, k, nz + real(dp) :: local_sum + real(dp) :: parallel_global_sum_stagger_real8_3d integer :: & staggered_ilo, staggered_ihi, & ! 
bounds of locally owned vertices on staggered grid @@ -6301,63 +6414,66 @@ subroutine parallel_global_sum_staggered_3d_real8_nvar(& staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - nvar = size(global_sum) + nz = size(arr1,1) - local_sum(:) = 0.d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc - do n = 1, nvar + !TODO - Add the code here + call parallel_stop(__FILE__,__LINE__) - ! sum over locally owned velocity points + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum = 0.0d0 + + if (present(arr2)) then ! compute global sum of arr1 + arr2 - if (present(work2)) then do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi do k = 1, nz - local_sum(n) = local_sum(n) + work1(k,i,j,n) + work2(k,i,j,n) + local_sum = local_sum + arr1(k,i,j) + arr2(k,i,j) enddo enddo enddo - else + + else ! compute global sum of arr1 + do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi do k = 1, nz - local_sum(n) = local_sum(n) + work1(k,i,j,n) + local_sum = local_sum + arr1(k,i,j) enddo enddo enddo - endif - enddo ! nvar + endif - ! take the global sum + parallel_global_sum_stagger_real8_3d = parallel_reduce_sum(local_sum) - global_sum(:) = parallel_reduce_sum(local_sum(:)) + endif ! reprosum - end subroutine parallel_global_sum_staggered_3d_real8_nvar + end function parallel_global_sum_stagger_real8_3d !======================================================================= - subroutine parallel_global_sum_staggered_2d_real8(& - nx, ny, & - parallel, & - global_sum, & - work1, work2) + function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) ! Sum one or two local arrays on the staggered grid, then take the global sum. + ! The final index is equal to the number of independent fields to be summed. + + real(dp), dimension(:,:,:), intent(in) :: arr1 - integer, intent(in) :: & - nx, ny ! 
horizontal grid dimensions (for scalars) + integer, intent(in) :: nflds type(parallel_type), intent(in) :: & parallel ! info for parallel communication - real(dp), intent(out) :: global_sum ! global sum + real(dp), dimension(:,:,:), intent(in), optional :: arr2 - real(dp), intent(in), dimension(nx-1,ny-1) :: work1 ! local array - real(dp), intent(in), dimension(nx-1,ny-1), optional :: work2 ! local array + real(dp), dimension(size(arr1,3)) :: parallel_global_sum_stagger_real8_2d_nflds - integer :: i, j - real(dp) :: local_sum + integer :: i, j, n + + real(dp), dimension(size(arr1,3)) :: local_sum integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -6368,55 +6484,64 @@ subroutine parallel_global_sum_staggered_2d_real8(& staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - local_sum = 0.d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc - ! sum over locally owned velocity points + !TODO - Add the code here + call parallel_stop(__FILE__,__LINE__) - if (present(work2)) then - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - local_sum = local_sum + work1(i,j) + work2(i,j) - enddo - enddo - else - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - local_sum = local_sum + work1(i,j) - enddo - enddo - endif + else ! compute using parallel_reduce_sum (not reproducible) - ! take the global sum + local_sum(:) = 0.d0 + + do n = 1, nflds + + ! sum over locally owned velocity points + + if (present(arr2)) then + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum(n) = local_sum(n) + arr1(i,j,n) + arr2(i,j,n) + enddo + enddo + else + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum(n) = local_sum(n) + arr1(i,j,n) + enddo + enddo + endif - global_sum = parallel_reduce_sum(local_sum) + enddo ! nflds - end subroutine parallel_global_sum_staggered_2d_real8 + endif ! 
reprosum + + ! take the global sum + parallel_global_sum_stagger_real8_2d_nflds = parallel_reduce_sum(local_sum(:)) + + end function parallel_global_sum_stagger_real8_2d_nflds !======================================================================= - subroutine parallel_global_sum_staggered_2d_real8_nvar(& - nx, ny, & - parallel, & - global_sum, & - work1, work2) + function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) ! Sum one or two local arrays on the staggered grid, then take the global sum. + ! Assumes k is the first index, followed by i and j. + ! The final index is equal to the number of independent fields to be summed. + + real(dp), dimension(:,:,:,:), intent(in) :: arr1 - integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) + integer, intent(in) :: nflds ! size of final index; number of global sums to be computed type(parallel_type), intent(in) :: & parallel ! info for parallel communication - real(dp), intent(out), dimension(:) :: & - global_sum ! global sum + real(dp), dimension(:,:,:,:), intent(in), optional :: arr2 - real(dp), intent(in), dimension(nx-1,ny-1,size(global_sum)) :: work1 ! local array - real(dp), intent(in), dimension(nx-1,ny-1,size(global_sum)), optional :: work2 ! local array + real(dp), dimension(size(arr1,4)) :: parallel_global_sum_stagger_real8_3d_nflds - integer :: i, j, n, nvar + integer :: i, j, k, n, nz - real(dp), dimension(size(global_sum)) :: local_sum + real(dp), dimension(size(arr1,4)) :: local_sum integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -6427,38 +6552,49 @@ subroutine parallel_global_sum_staggered_2d_real8_nvar(& staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - nvar = size(global_sum) + nz = size(arr1,1) - local_sum(:) = 0.d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc - do n = 1, nvar + !TODO - Add the code here + call parallel_stop(__FILE__,__LINE__) - ! 
sum over locally owned velocity points + else ! compute using parallel_reduce_sum (not reproducible) - if (present(work2)) then - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - local_sum(n) = local_sum(n) + work1(i,j,n) + work2(i,j,n) + local_sum(:) = 0.d0 + + do n = 1, nflds + + ! sum over locally owned velocity points + + if (present(arr2)) then + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + local_sum(n) = local_sum(n) + arr1(k,i,j,n) + arr2(k,i,j,n) + enddo + enddo enddo - enddo - else - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - local_sum(n) = local_sum(n) + work1(i,j,n) + else + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + local_sum(n) = local_sum(n) + arr1(k,i,j,n) + enddo + enddo enddo - enddo - endif + endif - enddo ! nvar + enddo ! nflds - ! take the global sum + endif ! reprosum - global_sum(:) = parallel_reduce_sum(local_sum(:)) + ! take the global sum + parallel_global_sum_stagger_real8_3d_nflds = parallel_reduce_sum(local_sum(:)) - end subroutine parallel_global_sum_staggered_2d_real8_nvar + end function parallel_global_sum_stagger_real8_3d_nflds !======================================================================= - ! functions belonging to the parallel_is_zero interface function parallel_is_zero_integer_2d(a) @@ -8701,13 +8837,20 @@ end function parallel_reduce_sum_real4 function parallel_reduce_sum_real8(x) use mpi_mod + implicit none + real(dp) :: x integer :: ierror real(dp) :: recvbuf,sendbuf, parallel_reduce_sum_real8 + ! Input and output arguments for subroutine cism_reprosum_calc + real(dp), dimension(:,:), allocatable :: arr ! array to be summed over processors + real(dp), dimension(:), allocatable :: arr_gsum ! global sum of arr + ! 
begin + sendbuf = x call mpi_allreduce(sendbuf,recvbuf,1,mpi_real8,mpi_sum,comm,ierror) parallel_reduce_sum_real8 = recvbuf @@ -8742,14 +8885,151 @@ function parallel_reduce_sum_real8_nvar(x) integer :: ierror, nvar real(dp), dimension(size(x)) :: recvbuf,sendbuf, parallel_reduce_sum_real8_nvar + ! Input and output arguments for subroutine cism_reprosum_calc + real(dp), dimension(:,:), allocatable :: arr ! array to be summed over processors + real(dp), dimension(:), allocatable :: arr_gsum ! global sum of arr + ! begin nvar = size(x) + sendbuf = x call mpi_allreduce(sendbuf,recvbuf,nvar,mpi_real8,mpi_sum,comm,ierror) parallel_reduce_sum_real8_nvar = recvbuf end function parallel_reduce_sum_real8_nvar +!======================================================================= + + subroutine parallel_reduce_reprosum(arr, arr_gsum) + + ! Compute a reproducible global sum for a floating-point variable or array. + ! Can be called from parallel_global_sum, parallel_global_sum_patch, or + ! parallel_global_sum_stagger. + ! Still under construction + + implicit none + + real(dp), dimension(:,:), intent(in) :: arr + real(dp), dimension(:), intent(out) :: arr_gsum + + ! Notes on subroutine cism_reprosum_calc: + ! The first five arguments are required: arr(dsummands,nflds), arr_gsum(dsummands), + ! dsummands, dflds and nsummands. Typically, nsummands = dsummands = number of local values. + ! We use the default fixed-precision algorithm (instead of ddpdd, which apparently is less robust). + ! We do not allow Inf or NaN values in the input array. + ! Typically, the algorithm calls mpi_allreduce twice. By passing in both arr_gbl_max and arr_max_levels, + ! it may be possible to call mpi_allreduce just once, improving performance. + ! If we don't pass these arguments, then arr_gbl_max and arr_max_levels are computed internally.. + ! By passing arr_glb_max_out and arr_max_levels_out, we can see the calculated values. + ! 
By passing rel_diff, we can verify that the computed reproducible sum is close + ! to the (nonreproducible) floating-point value. + ! +!! ! commid ! MPI communicator + ! See comments in cism_reprosum_calc for more info + + ! Required arguments + + integer :: dsummands, nflds ! dimensions of arr + integer :: nsummands ! number of processors + + ! Optional arguments + + real(dp), dimension(:), allocatable :: & + arr_gbl_max, & ! upper bound on max(abs(arr)) + arr_gbl_max_out ! calculated upper bound on max(abs(arr)) + + real(dp), dimension(:,:), allocatable :: & + rel_diff ! relative and absolute differences between fixed and floating point sums + + integer, dimension(:), allocatable :: & + arr_max_levels, & ! maximum number of levels of integer expansion to use + arr_max_levels_out ! output of number of levels of integer expansion to use + + integer :: & + gbl_max_nsummands, & ! maximum of nsummand over all processes + gbl_max_nsummands_out ! calculated maximum nsummands over all processes + + integer, dimension(6) :: & + repro_sum_stats(6) ! increment running totals for + ! (1) one-reduction repro_sum + ! (2) two-reduction repro_sum + ! (3) both types in one call + ! (4) nonrepro_sum + ! (5) global max nsummands reduction + ! (6) global lor 3*nflds reduction + + logical :: & + ddpdd_sum, & ! use ddpdd algorithm instead of fixed-precision algorithm + allow_infnan, & ! if .true., allow INF or NaN input values; + ! if .false. (the default), then abort if INF or NaNs are present + repro_sum_validate ! flag enabling/disabling testing that gmax and max_levels + ! are accurate/sufficient. Default is enabled. + + + ! Set parameters and allocate arrays + dsummands = size(arr,1) + nflds = size(arr,2) + nsummands = dsummands + + !WHL - debug + if (verbose_reprosum .and. main_task) then + write(iulog,*) 'In parallel_reduce_reprosum' + write(iulog,*) 'dsummands, nflds:', dsummands, nflds + endif + if (verbose_reprosum) then +!! 
write(iulog,*) 'rank, arr:', this_rank, arr(:,:) + endif + + allocate (arr_gbl_max(nflds)) + allocate (arr_gbl_max_out(nflds)) + allocate (arr_max_levels(nflds)) + allocate (arr_max_levels_out(nflds)) + allocate (rel_diff(2,nflds)) + + allow_infnan = .false. + ddpdd_sum = .false. + repro_sum_validate = .true. + +#ifdef CCSMCOUPLED + +!! call shr_reprosum_calc + +#else + + ! The following subroutine is adapted from shr_reprosum_calc in CESM shared code. + + call cism_reprosum_calc(& + arr, arr_gsum, & + nsummands, dsummands, nflds, & + allow_infnan = allow_infnan, & + ddpdd_sum = ddpdd_sum, & +! arr_gbl_max = arr_gbl_max, & + arr_gbl_max_out = arr_gbl_max_out, & +! arr_max_levels = arr_max_levels, & + arr_max_levels_out = arr_max_levels_out, & +! gbl_max_nsummands = gbl_max_nsummands, & + gbl_max_nsummands_out = gbl_max_nsummands_out, & + repro_sum_validate = repro_sum_validate, & + repro_sum_stats = repro_sum_stats, & + rel_diff = rel_diff) + +#endif + + if (verbose_reprosum .and. main_task) then + write(iulog,*) 'arr_gbl_max_out =', arr_gbl_max_out + write(iulog,*) 'arr_max_levels_out =', arr_max_levels_out + write(iulog,*) 'gbl_max_nsummands_out =', gbl_max_nsummands_out + write(iulog,*) 'rel diff =', rel_diff(1,:) + write(iulog,*) 'abs diff =', rel_diff(2,:) + write(iulog,*) 'stats =', repro_sum_stats(:) + endif + + deallocate(arr_gbl_max, arr_gbl_max_out) + deallocate(arr_max_levels, arr_max_levels_out) + deallocate(rel_diff) + + end subroutine parallel_reduce_reprosum + !======================================================================= ! functions belonging to the parallel_reduce_max interface diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index 83eaec79..6a11c5ac 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -218,11 +218,12 @@ subroutine glissade_initialise(model, evolve_ice) ! The subroutine will report how many tasks are needed to compute on all active blocks, and then abort. ! 
The user can then resubmit (on an optimal number of processors) with model%options%compute_blocks = ACTIVE_BLOCKS. - call distributed_grid_active_blocks(model%general%ewn, model%general%nsn, & - model%general%nx_block, model%general%ny_block, & - model%general%ice_domain_mask, & - model%parallel, & - inquire_only = .true.) + call distributed_grid_active_blocks(& + model%general%ewn, model%general%nsn, & + model%general%nx_block, model%general%ny_block, & + model%general%ice_domain_mask, & + model%parallel, & + inquire_only = .true.) else ! compute_blocks = ACTIVE_BLOCKS_ONLY @@ -235,10 +236,12 @@ subroutine glissade_initialise(model, evolve_ice) model%general%global_bc = GLOBAL_BC_NO_ICE endif - call distributed_grid_active_blocks(model%general%ewn, model%general%nsn, & - model%general%nx_block, model%general%ny_block, & - model%general%ice_domain_mask, & - model%parallel) + call distributed_grid_active_blocks(& + model%general%ewn, model%general%nsn, & + model%general%nx_block, model%general%ny_block, & + model%general%ice_domain_mask, & + model%parallel, & + reprosum_in = model%options%reproducible_sums) endif ! 
compute_blocks @@ -247,13 +250,19 @@ subroutine glissade_initialise(model, evolve_ice) elseif (model%general%global_bc == GLOBAL_BC_OUTFLOW) then - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'outflow') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'outflow') elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'no_ice') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'no_ice') elseif (model%general%global_bc == GLOBAL_BC_NO_PENETRATION) then @@ -261,15 +270,19 @@ subroutine glissade_initialise(model, evolve_ice) ! The difference is that we also use no-penetration masks for (uvel,vvel) at the global boundary ! (computed by calling staggered_no_penetration_mask below). - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'no_penetration') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'no_penetration') else ! global_bc = GLOBAL_BC_PERIODIC -! 
call distributed_grid(model%general%ewn, model%general%nsn, global_bc_in = 'periodic') - - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'periodic') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'periodic') endif @@ -815,8 +828,7 @@ subroutine glissade_initialise(model, evolve_ice) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) ! compute halo for relaxed topography ! Note: See comments above with regard to the halo update for topg. @@ -879,8 +891,7 @@ subroutine glissade_initialise(model, evolve_ice) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) endif ! initial calving @@ -1104,6 +1115,7 @@ subroutine glissade_initialise(model, evolve_ice) call glissade_basin_average(& model%general%ewn, model%general%nsn, & + parallel, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & floating_mask * 1.0d0, & ! real mask @@ -1155,7 +1167,7 @@ subroutine glissade_tstep(model, time) use glimmer_paramets, only: eps11 use glimmer_physcon, only: scyr - use glide_mask, only: glide_set_mask, calc_iareaf_iareag + use glide_mask, only: glide_set_mask use glissade_mass_balance, only: glissade_prepare_climate_forcing implicit none @@ -1360,15 +1372,7 @@ subroutine glissade_tstep(model, time) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) - - ! 
--- Calculate global area of ice that is floating and grounded. - !TODO May want to calculate iareaf and iareag in glide_write_diag and remove those calculations here. - - call calc_iareaf_iareag(model%numerics%dew, model%numerics%dns, & - model%geometry%thkmask, & - model%geometry%iareaf, model%geometry%iareag) + model%climate%eus, model%geometry%thkmask) ! ------------------------------------------------------------------------ ! Do the vertical thermal solve if it is time to do so. diff --git a/libglissade/glissade_basal_water.F90 b/libglissade/glissade_basal_water.F90 index 3b642157..18b03292 100644 --- a/libglissade/glissade_basal_water.F90 +++ b/libglissade/glissade_basal_water.F90 @@ -1031,8 +1031,7 @@ subroutine fill_depressions(& ! Continue until no further lowering of phi is possible. At that point, phi = phi_out. ! Note: Setting eps = 0 would result in flat surfaces that would need to be fixed later. - use cism_parallel, only: parallel_reduce_sum - use cism_parallel, only: parallel_globalindex + use cism_parallel, only: parallel_reduce_sum, parallel_globalindex implicit none diff --git a/libglissade/glissade_bmlt_float.F90 b/libglissade/glissade_bmlt_float.F90 index 7a0da2d5..227ed6e4 100644 --- a/libglissade/glissade_bmlt_float.F90 +++ b/libglissade/glissade_bmlt_float.F90 @@ -1014,6 +1014,7 @@ subroutine glissade_bmlt_float_thermal_forcing(& call glissade_basin_average(& nx, ny, & + parallel, & ocean_data%nbasin, & ocean_data%basin_number, & thermal_forcing_mask * f_float, & @@ -1024,6 +1025,7 @@ subroutine glissade_bmlt_float_thermal_forcing(& call glissade_basin_average(& nx, ny, & + parallel, & ocean_data%nbasin, & ocean_data%basin_number, & thermal_forcing_mask * f_float, & @@ -1126,6 +1128,7 @@ subroutine glissade_bmlt_float_thermal_forcing(& call glissade_basin_average(& nx, ny, & + parallel, & ocean_data%nbasin, & ocean_data%basin_number, & thermal_forcing_mask * f_float, & diff --git a/libglissade/glissade_calving.F90 
b/libglissade/glissade_calving.F90 index f2ad1cb1..04903763 100644 --- a/libglissade/glissade_calving.F90 +++ b/libglissade/glissade_calving.F90 @@ -37,10 +37,9 @@ module glissade_calving use glimmer_utils, only: point_diag use cism_parallel, only: this_rank, main_task, nhalo, & - parallel_halo, parallel_globalindex, & + parallel_halo, parallel_globalindex, parallel_global_sum, & parallel_reduce_sum, parallel_reduce_max, parallel_reduce_log_or - implicit none private @@ -50,7 +49,6 @@ module glissade_calving public :: verbose_calving logical, parameter :: verbose_calving = .false. -!! logical, parameter :: verbose_calving = .true. contains @@ -582,15 +580,7 @@ subroutine glissade_calve_ice(nx, ny, & if (verbose_calving) then call point_diag(cf_length, 'cf_length (m)', itest, jtest, rtest, 7, 7) ! Diagnose the total CF length - total_cf_length = 0.0d0 - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - if (calving_front_mask(i,j) == 1) then - total_cf_length = total_cf_length + cf_length(i,j) - endif - enddo - enddo - total_cf_length = parallel_reduce_sum(total_cf_length) + total_cf_length = parallel_global_sum(cf_length, parallel, calving_front_mask) if (this_rank == rtest) then write(iulog,*) 'Total CF length (km)', total_cf_length/1000.d0 endif @@ -871,13 +861,7 @@ subroutine glissade_calve_ice(nx, ny, & endif ! 
Compute the total ice area and the area of each quadrant - total_ice_area = 0.0d0 - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - total_ice_area = total_ice_area + dx*dy*calving%effective_areafrac(i,j) - enddo - enddo - total_ice_area = parallel_reduce_sum(total_ice_area) + total_ice_area = parallel_global_sum(dx*dy*calving%effective_areafrac, parallel) if (verbose_calving) then if (this_rank == rtest) then diff --git a/libglissade/glissade_glacier.F90 b/libglissade/glissade_glacier.F90 index cce8bdd9..b896beb6 100644 --- a/libglissade/glissade_glacier.F90 +++ b/libglissade/glissade_glacier.F90 @@ -75,7 +75,7 @@ subroutine glissade_glacier_init(model, glacier) ! The CISM input file contains the RGI IDs. use cism_parallel, only: distributed_gather_var, distributed_scatter_var, & - parallel_reduce_sum, parallel_reduce_max, parallel_reduce_min, parallel_is_zero, & + parallel_global_sum, parallel_reduce_max, parallel_reduce_min, parallel_is_zero, & broadcast, parallel_halo, staggered_parallel_halo, parallel_globalindex type(glide_global_type),intent(inout) :: model @@ -113,6 +113,9 @@ subroutine glissade_glacier_init(model, glacier) current_id, & ! current glacier_id from list gid_minval, gid_maxval ! min and max values of glacier_id + integer, dimension(model%general%ewn,model%general%nsn) :: & + glacier_mask ! = 1 for cells with glaciers (glacier_id > 0), else = 0 + type(parallel_type) :: parallel ! info for parallel communication !WHL - debug, for quicksort test @@ -194,8 +197,7 @@ subroutine glissade_glacier_init(model, glacier) ! and these arrays should already have the correct dimensions. 
if (associated(glacier%glacierid)) deallocate(glacier%glacierid) - if (associated(glacier%cism_to_rgi_glacier_id)) & - deallocate(glacier%cism_to_rgi_glacier_id) + if (associated(glacier%cism_to_rgi_glacier_id)) deallocate(glacier%cism_to_rgi_glacier_id) if (associated(glacier%area)) deallocate(glacier%area) if (associated(glacier%volume)) deallocate(glacier%volume) if (associated(glacier%area_init)) deallocate(glacier%area_init) @@ -220,21 +222,9 @@ subroutine glissade_glacier_init(model, glacier) ! Count the number of cells with glaciers ! Loop over locally owned cells - count = 0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - if (glacier%rgi_glacier_id(i,j) > 0) then - count = count + 1 - elseif (glacier%rgi_glacier_id(i,j) < 0) then ! should not happen - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(message,*) 'RGI glacier_id < 0: i, j, value =', & - iglobal, jglobal, glacier%rgi_glacier_id(i,j) - call write_log(message, GM_FATAL) - endif - enddo - enddo - - ncells_glacier = parallel_reduce_sum(count) + glacier_mask = 0 + where (glacier%rgi_glacier_id > 0) glacier_mask = 1 + ncells_glacier = parallel_global_sum(glacier_mask, parallel) ! Gather the RGI glacier IDs to the main task if (main_task) allocate(rgi_glacier_id_global(global_ewn, global_nsn)) @@ -434,6 +424,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & model%geometry%cell_area, & ! m^2 @@ -543,6 +534,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_2d_to_1d(& ewn, nsn, & + parallel, & nglacier, glacier%cism_glacier_id_init, & model%climate%smb_obs, glacier%smb_obs) @@ -614,6 +606,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & model%geometry%cell_area, & ! 
m^2 @@ -626,6 +619,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & model%geometry%cell_area, & ! m^2 @@ -730,7 +724,7 @@ subroutine glissade_glacier_update(model, glacier) use glissade_grid_operators, only: glissade_stagger use glissade_utils, only: glissade_usrf_to_thck - use cism_parallel, only: parallel_reduce_sum, parallel_global_sum, & + use cism_parallel, only: parallel_global_sum, & parallel_halo, staggered_parallel_halo ! Do glacier inversion (if applicable), update glacier masks, and compute glacier diagnostics. @@ -767,12 +761,13 @@ subroutine glissade_glacier_update(model, glacier) integer :: i, j, ng integer, dimension(model%general%ewn, model%general%nsn) :: & - ice_mask ! = 1 where ice is present (thck > thklim), else = 0 + ice_mask, & ! = 1 where ice is present (thck > thklim), else = 0 + glacier_mask ! temporary mask real(dp), dimension(model%general%ewn, model%general%nsn) :: & thck, & ! ice thickness (m) dthck_dt, & ! rate of change of thickness (m/yr) - cell_area, & ! grid cell area (m^2) + cell_area_uniform, & ! grid cell area defined as dew*dns(m^2) thck_old, & ! saved value of ice thickness (m) artm, & ! artm, baseline or current date snow, & ! snowfall, baseline or current date @@ -872,14 +867,12 @@ subroutine glissade_glacier_update(model, glacier) nglacier = glacier%nglacier ngdiag = glacier%ngdiag + cell_area_uniform = dew*dns ! some unit conversions - !TODO - Use model%geometry%thck without a copy. ! Skip these conversion and use SI units (s instead of yr) in the code. - dt = model%numerics%dt /scyr ! s to yr - thck = model%geometry%thck + dt = model%numerics%dt /scyr ! s to yr dthck_dt = model%geometry%dthck_dt * scyr ! m/s to m/yr - cell_area = model%geometry%cell_area ! model units to m^2 ! Accumulate the 2D fields used for mu_star and alpha_snow inversion: snow and Tpos. ! 
Also accumulate dthck_dt, which is used for powerlaw_c inversion. @@ -1189,23 +1182,23 @@ subroutine glissade_glacier_update(model, glacier) if (glacier%redistribute_advanced_ice) then - thck_old = thck + thck_old = model%geometry%thck call glacier_redistribute_advanced_ice(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & real(glacier_update_interval,dp), & ! yr - dew*dns, & ! m^2 + cell_area_uniform, & ! m^2 glacier%thinning_rate_advanced_ice, & ! m/yr glacier%cism_glacier_id_init, & glacier%smb_glacier_id, & model%climate%smb, & ! m/yr - thck, & ! m - parallel) + model%geometry%thck) ! m glacier%dthck_dt_annmean = glacier%dthck_dt_annmean + & - (thck - thck_old) / real(glacier_update_interval,dp) + (model%geometry%thck - thck_old) / real(glacier_update_interval,dp) endif ! redistribute advanced ice @@ -1230,6 +1223,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & glacier%smb_glacier_id_init, & smb_weight_init, & @@ -1249,6 +1243,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & glacier%smb_glacier_id, & smb_weight_current, & @@ -1276,6 +1271,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_invert_mu_star_alpha_snow(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & glacier%smb_glacier_id_init, & @@ -1300,6 +1296,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_invert_mu_star(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & glacier%smb_glacier_id_init, & @@ -1318,13 +1315,13 @@ subroutine glissade_glacier_update(model, glacier) endif ! set_mu_star ! advance/retreat diagnostics - ! 
Note: This subroutine assumes cell_area = dew*dns for all cells call glacier_area_advance_retreat(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & glacier%cism_glacier_id, & - dew*dns, & + cell_area_uniform, & area_initial, & area_current, & area_advance, & @@ -1383,7 +1380,8 @@ subroutine glissade_glacier_update(model, glacier) ! Interpolate thck to the staggered grid call glissade_stagger(& ewn, nsn, & - thck, stag_thck) + model%geometry%thck, & + stag_thck) ! Interpolate dthck_dt to the staggered grid call glissade_stagger(& @@ -1443,17 +1441,17 @@ subroutine glissade_glacier_update(model, glacier) call glacier_advance_retreat(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, & glacier%minthck, & ! m - thck, & ! m + model%geometry%thck, & ! m glacier%snow_annmean, & ! mm/yr w.e. glacier%Tpos_annmean, & ! deg C glacier%mu_star, & ! mm/yr/deg glacier%alpha_snow, & ! unitless glacier%cism_glacier_id_init, & - glacier%cism_glacier_id, & - parallel) + glacier%cism_glacier_id) ! Compute smb_glacier_id, which determines where the SMB is computed. It is the union of ! (1) cism_glacier_id > 0 @@ -1559,7 +1557,7 @@ subroutine glissade_glacier_update(model, glacier) endif ! set_mu_star if (verbose_glacier) then - call point_diag(thck, 'After advance_retreat, thck', itest, jtest, rtest, 7, 7) + call point_diag(model%geometry%thck, 'After advance_retreat, thck', itest, jtest, rtest, 7, 7) call point_diag(glacier%cism_glacier_id_init, 'cism_glacier_id_init', itest, jtest, rtest, 7, 7) call point_diag(glacier%smb_glacier_id_init, 'smb_glacier_id_init', itest, jtest, rtest, 7, 7) call point_diag(glacier%cism_glacier_id, 'New cism_glacier_id', itest, jtest, rtest, 7, 7) @@ -1591,16 +1589,20 @@ subroutine glissade_glacier_update(model, glacier) ! 
(1) Include only cells that are part of the initial glacier extent call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & + cell_area_uniform, & model%climate%smb, & aar_init) ! (2) Include all cells in the glacier call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & + cell_area_uniform, & model%climate%smb, & aar) @@ -1630,16 +1632,20 @@ subroutine glissade_glacier_update(model, glacier) ! (1) Include only cells that are part of the initial glacier extent call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & + cell_area_uniform, & glacier%smb_recent, & aar_init_recent) ! (2) Include all cells in the glacier call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & + cell_area_uniform, & glacier%smb_recent, & aar_recent) @@ -1661,10 +1667,11 @@ subroutine glissade_glacier_update(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & - cell_area, & ! m^2 - thck, & ! m + model%geometry%cell_area, & ! m^2 + model%geometry%thck, & ! m glacier%diagnostic_minthck, & ! m glacier%area, & ! m^2 glacier%volume) ! m^3 @@ -1674,10 +1681,11 @@ subroutine glissade_glacier_update(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & - cell_area, & ! m^2 - thck, & ! m + model%geometry%cell_area, & ! m^2 + model%geometry%thck, & ! m glacier%diagnostic_minthck, & ! m glacier%area_init_extent, & ! m^2 glacier%volume_init_extent) ! m^3 @@ -1701,6 +1709,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & model%geometry%cell_area, & ! m^2 @@ -1718,28 +1727,21 @@ subroutine glissade_glacier_update(model, glacier) if (verbose_glacier) then - ! 
debug - count cells in masks - count_cgii = 0 - count_cgi = 0 - count_sgii = 0 - count_sgi = 0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = glacier%cism_glacier_id_init(i,j) - if (ng == ngdiag) count_cgii = count_cgii + 1 - ng = glacier%cism_glacier_id(i,j) - if (ng == ngdiag) count_cgi = count_cgi + 1 - ng = glacier%smb_glacier_id_init(i,j) - if (ng == ngdiag) count_sgii = count_sgii + 1 - ng = glacier%smb_glacier_id(i,j) - if (ng == ngdiag) count_sgi = count_sgi + 1 - enddo - enddo + glacier_mask = 0 + where (glacier%cism_glacier_id_init == ngdiag) glacier_mask = 1 + count_cgii = parallel_global_sum(glacier_mask, parallel) + + glacier_mask = 0 + where (glacier%cism_glacier_id == ngdiag) glacier_mask = 1 + count_cgi = parallel_global_sum(glacier_mask, parallel) - count_cgii = parallel_reduce_sum(count_cgii) - count_cgi = parallel_reduce_sum(count_cgi) - count_sgii = parallel_reduce_sum(count_sgii) - count_sgi = parallel_reduce_sum(count_sgi) + glacier_mask = 0 + where (glacier%smb_glacier_id_init == ngdiag) glacier_mask = 1 + count_sgii = parallel_global_sum(glacier_mask, parallel) + + glacier_mask = 0 + where (glacier%smb_glacier_id == ngdiag) glacier_mask = 1 + count_sgi = parallel_global_sum(glacier_mask, parallel) if (this_rank == rtest) then write(iulog,*) ' ' @@ -1752,15 +1754,13 @@ subroutine glissade_glacier_update(model, glacier) endif ! glacier_update_inverval - ! Copy fields back to model derived type - model%geometry%thck = thck - end subroutine glissade_glacier_update !**************************************************** subroutine glacier_invert_mu_star(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & smb_glacier_id_init, & @@ -1785,6 +1785,9 @@ subroutine glacier_invert_mu_star(& nglacier, & ! total number of glaciers in the domain ngdiag ! CISM ID of diagnostic glacier + type(parallel_type), intent(in) :: & + parallel ! 
info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & smb_glacier_id_init ! smb_glacier_id based on the initial glacier extent @@ -1862,6 +1865,7 @@ subroutine glacier_invert_mu_star(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -1869,6 +1873,7 @@ subroutine glacier_invert_mu_star(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -1993,6 +1998,7 @@ end subroutine glacier_invert_mu_star subroutine glacier_invert_mu_star_alpha_snow(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & smb_glacier_id_init, & @@ -2023,6 +2029,9 @@ subroutine glacier_invert_mu_star_alpha_snow(& nglacier, & ! total number of glaciers in the domain ngdiag ! CISM ID of diagnostic glacier + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & smb_glacier_id_init ! smb_glacier_id based on the initial glacier extent @@ -2114,6 +2123,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2121,6 +2131,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2128,6 +2139,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2135,6 +2147,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2533,6 +2546,7 @@ end subroutine glacier_calc_snow subroutine glacier_redistribute_advanced_ice(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & glacier_update_interval, & ! 
yr @@ -2541,15 +2555,14 @@ subroutine glacier_redistribute_advanced_ice(& cism_glacier_id_init, & smb_glacier_id, & smb, & ! m/yr - thck, & ! m - parallel) + thck) ! m ! Limit glacier advance in the accumulation zone. ! This applies to grid cells that are initially ice-free, into which ice is advected. ! The fix here is to thin the ice in these cells at a prescribed rate and ! redistribute the mass conservatively across the glacier. - use cism_parallel, only: parallel_reduce_sum, parallel_halo + use cism_parallel, only: parallel_halo, parallel_global_sum_patch ! input/output arguments @@ -2559,11 +2572,16 @@ subroutine glacier_redistribute_advanced_ice(& nglacier, & ! number of glaciers ngdiag ! CISM ID of diagnostic glacier + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + real(dp), intent(in) :: & glacier_update_interval, & ! time interval (yr) of the glacier update, typically 1 yr - cell_area, & ! grid cell area (m^2), assumed to be the same for each cell thinning_rate_advanced_ice ! thinning rate (m/yr) where glaciers advance in the accumulation zone + real(dp), dimension(ewn,nsn), intent(in) :: & + cell_area ! grid cell area (m^2) + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id_init, & ! integer glacier ID at the start of the run smb_glacier_id ! integer ID for current glacier cells and adjacent glacier-free cells @@ -2574,14 +2592,10 @@ subroutine glacier_redistribute_advanced_ice(& real(dp), dimension(ewn,nsn), intent(inout) :: & thck ! ice thickness (m) - type(parallel_type), intent(in) :: parallel ! info for parallel communication - ! local variables integer :: i, j, ng - real(dp) :: dthck ! thickness change (m) - real(dp), dimension(nglacier) :: & glacier_area_init, & ! glacier area based on cism_glacier_id_init glacier_vol_removed, & ! total volume (m^3) removed from each advanced cells in each glacier @@ -2589,53 +2603,37 @@ subroutine glacier_redistribute_advanced_ice(& glacier_vol_1, & ! 
volume (m^3) of each glacier before thinning and restribution glacier_vol_2 ! volume (m^3) of each glacier after thinning and restribution + real(dp), dimension(ewn,nsn) :: & + dthck ! thickness removed (m) + + integer, dimension(ewn,nsn) :: & + glacier_id ! temporary glacier ID + + glacier_id = max(cism_glacier_id_init, smb_glacier_id) + ! Compute the total volume of each glacier before limiting advance. ! Note: This includes adjacent glacier-free cells that might have a small nonzero thickness ! (i.e., cism_glacier_id = 0 but smb_glacier_id > 0). - !TODO: Write a sum-over-glaciers subroutine - glacier_vol_1(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = smb_glacier_id(i,j) - if (ng > 0) then - glacier_vol_1(ng) = glacier_vol_1(ng) + cell_area*thck(i,j) - endif - enddo - enddo - glacier_vol_1 = parallel_reduce_sum(glacier_vol_1) + glacier_vol_1 = parallel_global_sum_patch(cell_area*thck, nglacier, glacier_id, parallel) ! compute the area of each glacier over its initial extent - glacier_area_init(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id_init(i,j) - if (ng > 0) then - glacier_area_init(ng) = glacier_area_init(ng) + cell_area - endif - enddo - enddo - glacier_area_init = parallel_reduce_sum(glacier_area_init) + + glacier_area_init = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id_init, parallel) ! Compute thinning in advanced grid cells ! This includes potential advanced cells adjacent to current glacier cells. ! Note: Currently, SMB is set to 0 in advanced cells where SMB would be > 0 otherwise. ! The logic below (smb >= 0) ensures that ice in these cells is thinned. - glacier_vol_removed(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - if (cism_glacier_id_init(i,j) == 0 .and. smb_glacier_id(i,j) > 0) then ! advanced cell - if (smb(i,j) >= 0.d0) then ! 
accumulation zone - ng = smb_glacier_id(i,j) - dthck = min(thinning_rate_advanced_ice*glacier_update_interval, thck(i,j)) - thck(i,j) = thck(i,j) - dthck - glacier_vol_removed(ng) = glacier_vol_removed(ng) + cell_area*dthck - endif - endif - enddo - enddo - glacier_vol_removed = parallel_reduce_sum(glacier_vol_removed) + dthck = 0.0d0 + where (cism_glacier_id_init == 0 .and. smb_glacier_id > 0) ! advanced cell + where (smb >= 0.0d0) ! accumulation zone + dthck = min(thinning_rate_advanced_ice*glacier_update_interval, thck) + thck = thck - dthck + endwhere + endwhere + glacier_vol_removed = parallel_global_sum_patch(cell_area*dthck, nglacier, smb_glacier_id, parallel) ! Assuming conservation of volume, compute the thickness to be added to each glacier. ! Only cells within the initial glacier extent can thicken. @@ -2659,16 +2657,8 @@ subroutine glacier_redistribute_advanced_ice(& call parallel_halo(thck, parallel) ! Compute the volume of each glacier after limiting advance - glacier_vol_2(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = max(cism_glacier_id_init(i,j), smb_glacier_id(i,j)) - if (ng > 0) then - glacier_vol_2(ng) = glacier_vol_2(ng) + cell_area*thck(i,j) - endif - enddo - enddo - glacier_vol_2 = parallel_reduce_sum(glacier_vol_2) + + glacier_vol_2 = parallel_global_sum_patch(cell_area*thck, nglacier, glacier_id, parallel) ! conservation check do ng = 1, nglacier @@ -2685,6 +2675,7 @@ end subroutine glacier_redistribute_advanced_ice subroutine glacier_advance_retreat(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, & glacier_minthck, & @@ -2694,8 +2685,7 @@ subroutine glacier_advance_retreat(& mu_star, & alpha_snow, & cism_glacier_id_init, & - cism_glacier_id, & - parallel) + cism_glacier_id) ! Allow glaciers to advance and retreat. ! @@ -2727,6 +2717,9 @@ subroutine glacier_advance_retreat(& itest, jtest, rtest, & ! coordinates of diagnostic cell nglacier ! 
number of glaciers + type(parallel_type), intent(in) :: & + parallel ! info for diagnostic only + real(dp), intent(in) :: & glacier_minthck ! min ice thickness (m) counted as part of a glacier @@ -2747,8 +2740,6 @@ subroutine glacier_advance_retreat(& integer, dimension(ewn,nsn), intent(inout) :: & cism_glacier_id ! current cism glacier_id, > 0 for glaciated cells - type(parallel_type), intent(in) :: parallel ! diagnostic only - ! local variables integer, dimension(ewn,nsn) :: & @@ -3038,13 +3029,14 @@ end subroutine update_smb_glacier_id subroutine glacier_2d_to_1d(& ewn, nsn, & + parallel, & nglacier, cism_glacier_id, & field_2d, glacier_field) ! Given a 2D field, compute the average of the field over each glacier !TODO - Pass in cellarea to compute an area average. - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3052,6 +3044,9 @@ subroutine glacier_2d_to_1d(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id ! integer glacier ID in the range (1, nglacier) @@ -3067,22 +3062,12 @@ subroutine glacier_2d_to_1d(& integer, dimension(nglacier) :: ncells_glacier - ncells_glacier(:) = 0 - glacier_field(:) = 0.0d0 + integer, dimension(ewn,nsn) :: ones ! matrix = 1 everywhere - ! 
Loop over locally owned cells - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - ncells_glacier(ng) = ncells_glacier(ng) + 1 - glacier_field(ng) = glacier_field(ng) + field_2d(i,j) - endif - enddo - enddo + ones(:,:) = 1 - ncells_glacier = parallel_reduce_sum(ncells_glacier) - glacier_field = parallel_reduce_sum(glacier_field) + ncells_glacier = parallel_global_sum_patch(ones, nglacier, cism_glacier_id, parallel) + glacier_field = parallel_global_sum_patch(field_2d, nglacier, cism_glacier_id, parallel) where (ncells_glacier > 0) glacier_field = glacier_field/ncells_glacier @@ -3094,6 +3079,7 @@ end subroutine glacier_2d_to_1d subroutine glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & glacier_id, weight, & field_2d, glacier_field) @@ -3101,7 +3087,7 @@ subroutine glacier_2d_to_1d_weighted(& ! Given a 2D field, compute the average of the field over each glacier ! Certain grid cells (e.g., at the glacier periphery) can be given weights between 0 and 1. - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3109,6 +3095,9 @@ subroutine glacier_2d_to_1d_weighted(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & glacier_id ! integer glacier ID @@ -3123,26 +3112,11 @@ subroutine glacier_2d_to_1d_weighted(& ! local variables - integer :: i, j, ng - real(dp), dimension(nglacier) :: sum_weights - sum_weights(:) = 0.0d0 - glacier_field(:) = 0.0d0 - - ! 
Loop over locally owned cells - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = glacier_id(i,j) - if (ng > 0) then - sum_weights(ng) = sum_weights(ng) + weight(i,j) - glacier_field(ng) = glacier_field(ng) + weight(i,j) * field_2d(i,j) - endif - enddo - enddo + sum_weights = parallel_global_sum_patch(weight, nglacier, glacier_id, parallel) + glacier_field = parallel_global_sum_patch(weight*field_2d, nglacier, glacier_id, parallel) - sum_weights = parallel_reduce_sum(sum_weights) - glacier_field = parallel_reduce_sum(glacier_field) where (sum_weights > 0.0d0) glacier_field = glacier_field/sum_weights endwhere @@ -3196,12 +3170,13 @@ end subroutine glacier_1d_to_2d subroutine glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, cism_glacier_id, & cell_area, thck, & diagnostic_minthck, & area, volume) - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3209,6 +3184,9 @@ subroutine glacier_area_volume(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id ! integer glacier ID in the range (1, nglacier) @@ -3226,36 +3204,22 @@ subroutine glacier_area_volume(& ! local variables - real(dp), dimension(nglacier) :: & - local_area, local_volume ! area and volume on each processor, before global sum - - integer :: i, j, ng - - ! Initialize the output arrays - area(:) = 0.0d0 - volume(:) = 0.0d0 - - ! Initialize local arrays - local_area(:) = 0.0d0 - local_volume(:) = 0.0d0 + real(dp), dimension(ewn,nsn) :: & + diag_area, diag_volume ! area and volume where thck >= diagnostic_minthck ! Compute the area and volume of each glacier. - ! We need parallel sums, since a glacier can lie on two or more processors. + ! Need parallel sums, since a glacier can lie on two or more processors. 
- do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - if (thck(i,j) >= diagnostic_minthck) then - local_area(ng) = local_area(ng) + cell_area(i,j) - local_volume(ng) = local_volume(ng) + cell_area(i,j) * thck(i,j) - endif - endif - enddo - enddo + where(thck >= diagnostic_minthck) + diag_area = cell_area + diag_volume = cell_area*thck + elsewhere + diag_area = 0.0d0 + diag_volume = 0.0d0 + endwhere - area = parallel_reduce_sum(local_area) - volume = parallel_reduce_sum(local_volume) + area = parallel_global_sum_patch(diag_area, nglacier, cism_glacier_id, parallel) + volume = parallel_global_sum_patch(diag_volume, nglacier, cism_glacier_id, parallel) end subroutine glacier_area_volume @@ -3263,6 +3227,7 @@ end subroutine glacier_area_volume subroutine glacier_area_advance_retreat(& ewn, nsn, & + parallel, & nglacier, & cism_glacier_id_init, & cism_glacier_id, & @@ -3272,7 +3237,7 @@ subroutine glacier_area_advance_retreat(& area_advance, & area_retreat) - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! For each glacier, compare the current glacier area (as given by cism_glacier_id) ! to the initial area (given by cism_glacier_id_init). @@ -3287,12 +3252,15 @@ subroutine glacier_area_advance_retreat(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id_init, & ! integer glacier ID in the range (1, nglacier), initial value cism_glacier_id ! integer glacier ID in the range (1, nglacier), current value - real(dp), intent(in) :: & - cell_area ! grid cell area = dew*dns (m^2); same for all cells + real(dp), dimension(ewn,nsn), intent(in) :: & + cell_area ! grid cell area (m^2) real(dp), dimension(nglacier), intent(out) :: & area_initial, & ! 
initial glacier area @@ -3302,69 +3270,38 @@ subroutine glacier_area_advance_retreat(& ! local variables - real(dp), dimension(nglacier) :: & - local_area ! area on each processor, before global sum - - integer :: i, j, ng, ngi + integer, dimension(ewn,nsn) :: glacier_id ! temporary glacier ID - ! Initialize the output arrays - area_initial(:) = 0.0d0 - area_current(:) = 0.0d0 - area_advance(:) = 0.0d0 - area_retreat(:) = 0.0d0 + integer :: ng ! Compute the area of each glacier over the initial and current masks. ! We need parallel sums, since a glacier can lie on two or more processors. - ! init area - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ngi = cism_glacier_id_init(i,j) - if (ngi > 0) then - local_area(ngi) = local_area(ngi) + cell_area - endif - enddo - enddo - area_initial = parallel_reduce_sum(local_area) + area_initial = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id_init, parallel) ! current area - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - local_area(ng) = local_area(ng) + cell_area - endif - enddo - enddo - area_current = parallel_reduce_sum(local_area) + + area_current = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id, parallel) ! area where the glacier has advanced - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ngi = cism_glacier_id_init(i,j) - ng = cism_glacier_id(i,j) - if (ngi == 0 .and. ng > 0) then - local_area(ng) = local_area(ng) + cell_area - endif - enddo - enddo - area_advance = parallel_reduce_sum(local_area) + + where (cism_glacier_id_init == 0 .and. cism_glacier_id > 0) + glacier_id = cism_glacier_id + elsewhere + glacier_id = 0 + endwhere + + area_advance = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) ! 
area where the glacier has retreated - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ngi = cism_glacier_id_init(i,j) - ng = cism_glacier_id(i,j) - if (ngi > 0 .and. ng == 0) then - local_area(ngi) = local_area(ngi) + cell_area - endif - enddo - enddo - area_retreat = parallel_reduce_sum(local_area) + + where (cism_glacier_id_init > 0 .and. cism_glacier_id == 0) + glacier_id = cism_glacier_id_init + elsewhere + glacier_id = 0 + endwhere + + area_retreat = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) ! bug check do ng = 1, nglacier @@ -3382,15 +3319,16 @@ end subroutine glacier_area_advance_retreat subroutine glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & cism_glacier_id, & + cell_area, & smb, & aar) ! Compute the accumulation area ratio (AAR) for each glacier. - ! Note: In this subroutine the grid cell area is assumed equal for all cells. - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3398,9 +3336,15 @@ subroutine glacier_accumulation_area_ratio(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id ! integer glacier ID in the range (1, nglacier) + real(dp), dimension(ewn,nsn), intent(in) :: & + cell_area ! grid cell area = dew*dns (m^2); same for all cells + real(dp), dimension(ewn,nsn), intent(in) :: & smb ! surface mass balance (mm/yr w.e.) @@ -3409,34 +3353,32 @@ subroutine glacier_accumulation_area_ratio(& ! local variables - integer :: i, j, ng +! integer :: i, j, ng real(dp), dimension(nglacier) :: & ablat_area, & ! area of accumulation zone (SMB < 0) accum_area ! area of accumulation zone (SMB > 0) - ! 
initialize
-      ablat_area(:) = 0.0d0
-      accum_area(:) = 0.0d0
+      integer, dimension(ewn,nsn) :: glacier_id   ! temporary glacier ID

       ! Compute the accumulation and ablation area for each glacier
       ! Note: Grid cells with SMB = 0 are not counted in either zone.

-      do j = nhalo+1, nsn-nhalo
-         do i = nhalo+1, ewn-nhalo
-            ng = cism_glacier_id(i,j)
-            if (ng > 0) then
-               if (smb(i,j) > 0.0d0) then
-                  accum_area(ng) = accum_area(ng) + 1.0d0
-               elseif (smb(i,j) < 0.0d0) then
-                  ablat_area(ng) = ablat_area(ng) + 1.0d0
-               endif
-            endif
-         enddo   ! i
-      enddo   ! j
+      where (cism_glacier_id > 0 .and. smb > 0.0d0)   ! accumulation zone, tagged by glacier
+         glacier_id = cism_glacier_id
+      elsewhere
+         glacier_id = 0
+      endwhere
+
+      accum_area = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel)
+
+      where (cism_glacier_id > 0 .and. smb < 0.0d0)   ! ablation zone, tagged by glacier
+         glacier_id = cism_glacier_id
+      elsewhere
+         glacier_id = 0
+      endwhere

-      accum_area = parallel_reduce_sum(accum_area)
-      ablat_area = parallel_reduce_sum(ablat_area)
+      ablat_area = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel)

       ! Compute the AAR for each glacier
diff --git a/libglissade/glissade_inversion.F90 b/libglissade/glissade_inversion.F90
index 71a40c7a..c8d38bff 100644
--- a/libglissade/glissade_inversion.F90
+++ b/libglissade/glissade_inversion.F90
@@ -699,6 +699,7 @@ subroutine glissade_inversion_solve(model)
            ewn,                    nsn,                   &
            model%numerics%dew,     &   ! m
            model%numerics%dns,     &   ! m
+           parallel,               &
            itest, jtest, rtest,    &
            model%ocean_data%nbasin,        &
            model%ocean_data%basin_number,  &
@@ -780,6 +781,7 @@ subroutine glissade_inversion_solve(model)
            ewn,                    nsn,                   &
            model%numerics%dew,     &   ! m
            model%numerics%dns,     &   ! m
+           parallel,               &
            itest, jtest, rtest,    &
            model%ocean_data%nbasin,        &
            model%ocean_data%basin_number,  &
@@ -853,6 +855,7 @@ subroutine glissade_inversion_solve(model)
            ewn,                    nsn,                   &
            model%numerics%dew,     &   ! m
            model%numerics%dns,     &   ! 
m + parallel, & itest, jtest, rtest, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & @@ -906,6 +909,7 @@ subroutine glissade_inversion_solve(model) ewn, nsn, & model%numerics%dew, & ! m model%numerics%dns, & ! m + parallel, & itest, jtest, rtest, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & @@ -944,6 +948,7 @@ subroutine glissade_inversion_solve(model) if (model%ocean_data%nbasin > 1) then call glissade_basin_average(& model%general%ewn, model%general%nsn, & + model%parallel, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & floating_mask * 1.0d0, & ! real mask @@ -1338,6 +1343,7 @@ subroutine invert_basal_friction_basin(& dt, & nx, ny, & dx, dy, & + parallel, & itest, jtest, rtest, & nbasin, & basin_number, & @@ -1372,6 +1378,9 @@ subroutine invert_basal_friction_basin(& real(dp), intent(in) :: & dx, dy ! grid cell size in each direction (m) + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point @@ -1418,6 +1427,7 @@ subroutine invert_basal_friction_basin(& call get_basin_targets(& nx, ny, & dx, dy, & + parallel, & nbasin, basin_number, & itest, jtest, rtest, & stag_thck, stag_dthck_dt, & @@ -1430,6 +1440,7 @@ subroutine invert_basal_friction_basin(& call glissade_basin_average(& nx, ny, & + parallel, & nbasin, basin_number, & stag_rmask, & friction_c, friction_c_basin) @@ -1495,6 +1506,7 @@ subroutine invert_deltaT_ocn_basin(& dt, & nx, ny, & dx, dy, & + parallel, & itest, jtest, rtest, & nbasin, & basin_number, & @@ -1534,6 +1546,9 @@ subroutine invert_deltaT_ocn_basin(& real(dp), intent(in) :: & dx, dy ! grid cell size in each direction (m) + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, intent(in) :: & itest, jtest, rtest ! 
coordinates of diagnostic point @@ -1592,6 +1607,7 @@ subroutine invert_deltaT_ocn_basin(& call get_basin_targets(& nx, ny, & dx, dy, & + parallel, & nbasin, basin_number, & itest, jtest, rtest, & thck, dthck_dt, & @@ -1610,6 +1626,7 @@ subroutine invert_deltaT_ocn_basin(& call glissade_basin_average(& nx, ny, & + parallel, & nbasin, basin_number, & mask, & deltaT_ocn, deltaT_basin) @@ -2142,6 +2159,7 @@ end subroutine invert_flow_enhancement_factor subroutine get_basin_targets(& nx, ny, & dx, dy, & + parallel, & nbasin, basin_number, & itest, jtest, rtest, & thck, dthck_dt, & @@ -2167,6 +2185,9 @@ subroutine get_basin_targets(& real(dp), intent(in) :: & dx, dy ! grid cell size in each direction (m) + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, intent(in) :: & nbasin ! number of basins @@ -2221,6 +2242,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & cell_area, & @@ -2232,6 +2254,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & thck_target*dx*dy, & @@ -2241,6 +2264,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & thck*dx*dy, & @@ -2250,6 +2274,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & dthck_dt*dx*dy, & diff --git a/libglissade/glissade_mass_balance.F90 b/libglissade/glissade_mass_balance.F90 index ea4c5fac..fb03dc79 100644 --- a/libglissade/glissade_mass_balance.F90 +++ b/libglissade/glissade_mass_balance.F90 @@ -43,7 +43,7 @@ module glissade_mass_balance use glimmer_utils, only: point_diag use glide_types use cism_parallel, only: this_rank, main_task, nhalo, lhalo, uhalo, & - parallel_halo, parallel_reduce_max, parallel_reduce_sum, parallel_globalindex + parallel_halo, parallel_reduce_max, 
parallel_global_sum, parallel_globalindex implicit none save @@ -81,17 +81,27 @@ subroutine glissade_mass_balance_init(model) ! Initialize artm for the case that we are reading in artm_ref or artm_3d. ! For some temp_init options, this is needed for correct interior temperatures. + ! Note: Do not call if glaciers are enabled. When running with glaciers, artm is + ! accumulated over year 1 from a forcing file which hasn't been read in yet. + ! TODO: Think about how to initialize glacier temperatures. Currently assume artm = 0, + ! which isn't realistic. if (model%options%artm_input_function /= ARTM_INPUT_FUNCTION_XY) then - call downscale_artm(model) + if (.not.model%options%enable_glaciers) then + call downscale_artm(model) + endif endif ! Initialize smb for the case that we are reading in smb_ref or smb_3d. ! This is not strictly needed, since the SMB will be recomputed before it is used, ! but can be a helpful diagnostic. - !TODO - Do this also for the PDD option? + ! Note: Do not call if glaciers are enabled. When running with glaciers, smb is + ! accumulated over year 1 from a forcing file which hasn't been read in yet. + ! TODO - call downscale_smb for the PDD option? if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ .or. & model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then - call downscale_smb(model) + if (.not.model%options%enable_glaciers) then + call downscale_smb(model) + endif endif ! 
Initialize acab, if SMB (with different units) was read in @@ -1093,6 +1103,7 @@ subroutine mass_balance_driver(& call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_init, & tracers(:,:,:,:), mtsum_init(:)) @@ -1171,9 +1182,9 @@ subroutine mass_balance_driver(& enddo enddo - sum_acab = parallel_reduce_sum(sum_acab) - sum_bmlt = parallel_reduce_sum(sum_bmlt) - sum_melt_potential = parallel_reduce_sum(sum_melt_potential) + sum_acab = parallel_global_sum(acab*effective_areafrac, parallel) + sum_bmlt = parallel_global_sum(bmlt*effective_areafrac, parallel) + sum_melt_potential = parallel_global_sum(melt_potential, parallel) msum_init = msum_init + (sum_acab - sum_bmlt)*dt @@ -1182,6 +1193,7 @@ subroutine mass_balance_driver(& call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_final, & tracers(:,:,:,:), mtsum_final(:)) diff --git a/libglissade/glissade_transport.F90 b/libglissade/glissade_transport.F90 index b42d6866..9ba618f4 100644 --- a/libglissade/glissade_transport.F90 +++ b/libglissade/glissade_transport.F90 @@ -45,7 +45,7 @@ module glissade_transport use glimmer_log use glissade_remap, only: glissade_horizontal_remap, make_remap_mask, puny use cism_parallel, only: this_rank, main_task, nhalo, lhalo, uhalo, staggered_lhalo, staggered_uhalo, & - parallel_type, parallel_reduce_max, parallel_reduce_sum, parallel_reduce_minloc, & + parallel_type, parallel_global_sum, parallel_reduce_max, parallel_reduce_minloc, & parallel_globalindex, broadcast implicit none @@ -445,6 +445,7 @@ subroutine glissade_transport_driver(dt, & call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_init, & tracers(:,:,:,:), mtsum_init(:)) endif @@ -603,6 +604,7 @@ subroutine glissade_transport_driver(dt, & call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_final, & tracers(:,:,:,:), 
mtsum_final(:)) @@ -661,6 +663,7 @@ subroutine glissade_transport_driver(dt, & call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_final, & tracers(:,:,:,:), mtsum_final(:)) @@ -950,6 +953,7 @@ end subroutine glissade_check_cfl subroutine glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracer, & + parallel, & thck_layer, msum, & tracer, mtsum) @@ -963,6 +967,9 @@ subroutine glissade_sum_mass_and_tracers(& nlyr, &! number of vertical layers ntracer ! number of tracers + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + real(dp), dimension (nx,ny,nlyr), intent(in) :: & thck_layer ! ice layer thickness @@ -977,30 +984,21 @@ subroutine glissade_sum_mass_and_tracers(& ! Local arguments - integer :: i, j, nt - - msum = 0.d0 - if (present(mtsum)) mtsum(:) = 0.d0 + integer :: nt, k - do j = 1+nhalo, ny-nhalo - do i = 1+nhalo, nx-nhalo - - ! accumulate ice mass and mass*tracers - ! (actually, accumulate thickness, assuming rhoi*dx*dy is the same for each cell) - - msum = msum + sum(thck_layer(i,j,:)) - - if (present(mtsum)) then - do nt = 1, ntracer - mtsum(nt) = mtsum(nt) + sum(tracer(i,j,nt,:)*thck_layer(i,j,:)) - enddo - endif - - enddo ! i - enddo ! 
j + msum = 0.0d0 + do k = 1, nlyr + msum = msum + parallel_global_sum(thck_layer(:,:,k), parallel) + enddo - msum = parallel_reduce_sum(msum) - if (present(mtsum)) mtsum = parallel_reduce_sum(mtsum) + if (present(mtsum)) then + mtsum(:) = 0.0d0 + do k = 1, nlyr + do nt = 1, ntracer + mtsum(nt) = mtsum(nt) + parallel_global_sum(tracer(:,:,nt,k)*thck_layer(:,:,k), parallel) + enddo + enddo + endif end subroutine glissade_sum_mass_and_tracers diff --git a/libglissade/glissade_utils.F90 b/libglissade/glissade_utils.F90 index e7e3431f..eb5b629f 100644 --- a/libglissade/glissade_utils.F90 +++ b/libglissade/glissade_utils.F90 @@ -516,8 +516,10 @@ end subroutine glissade_adjust_topography !**************************************************** + !TODO - Calls to this subroutine could be replaced by inline calls to parallel_global_sum_patch subroutine glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & rmask, & field_2d, & @@ -527,11 +529,14 @@ subroutine glissade_basin_sum(& ! The sum is taken over grid cells with mask = 1. ! All cells are weighted equally. - use cism_parallel, only: parallel_reduce_sum, nhalo + use cism_parallel, only: parallel_global_sum_patch integer, intent(in) :: & nx, ny !> number of grid cells in each dimension + type(parallel_type), intent(in) :: & + parallel !> info for parallel communication + integer, intent(in) :: & nbasin !> number of basins @@ -546,29 +551,10 @@ subroutine glissade_basin_sum(& real(dp), dimension(nbasin), intent(out) :: & field_basin_sum !> basin-sum output field - ! local variables - - integer :: i, j, nb - !TODO - Replace sumcell with sumarea, and pass in cell area. ! Current algorithm assumes all cells with mask = 1 have equal weight. - real(dp), dimension(nbasin) :: & - sumfield_local ! sum of field on local task - - sumfield_local(:) = 0.0d0 - - ! 
loop over locally owned cells - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - nb = basin_number(i,j) - if (nb >= 1) then - sumfield_local(nb) = sumfield_local(nb) + rmask(i,j)*field_2d(i,j) - endif - enddo - enddo - - field_basin_sum(:) = parallel_reduce_sum(sumfield_local(:)) + field_basin_sum = parallel_global_sum_patch(rmask*field_2d, nbasin, basin_number, parallel) end subroutine glissade_basin_sum @@ -576,6 +562,7 @@ end subroutine glissade_basin_sum subroutine glissade_basin_average(& nx, ny, & + parallel, & nbasin, basin_number, & rmask, & field_2d, & @@ -586,11 +573,14 @@ subroutine glissade_basin_average(& ! All cells are weighted equally. ! Note: This subroutine assumes an input field located at cell centers - use cism_parallel, only: parallel_reduce_sum, nhalo + use cism_parallel, only: parallel_global_sum_patch integer, intent(in) :: & nx, ny !> number of grid cells in each dimension + type(parallel_type), intent(in) :: & + parallel !> info for parallel communication + integer, intent(in) :: & nbasin !> number of basins @@ -607,33 +597,17 @@ subroutine glissade_basin_average(& ! local variables - integer :: i, j, nb + integer :: nb !TODO - Replace sumcell with sumarea, and pass in cell area. ! Current algorithm assumes all cells with mask = 1 have equal weight. real(dp), dimension(nbasin) :: & - summask_local, & ! sum of mask in each basin on local task summask_global, & ! sum of mask in each basin on full domain - sumfield_local, & ! sum of field on local task sumfield_global ! sum of field over full domain - summask_local(:) = 0.0d0 - sumfield_local(:) = 0.0d0 - - ! 
loop over locally owned cells only - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - nb = basin_number(i,j) - if (nb >= 1) then - summask_local(nb) = summask_local(nb) + rmask(i,j) - sumfield_local(nb) = sumfield_local(nb) + rmask(i,j)*field_2d(i,j) - endif - enddo - enddo - - summask_global(:) = parallel_reduce_sum(summask_local(:)) - sumfield_global(:) = parallel_reduce_sum(sumfield_local(:)) + summask_global = parallel_global_sum_patch(rmask, nbasin, basin_number, parallel) + sumfield_global = parallel_global_sum_patch(rmask*field_2d, nbasin, basin_number, parallel) do nb = 1, nbasin if (summask_global(nb) > tiny(0.0d0)) then diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 3b146e44..6ea89c0a 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -239,6 +239,8 @@ module glissade_velo_higher ! logical :: verbose_L1L2 = .true. logical :: verbose_diva = .false. ! logical :: verbose_diva = .true. + logical :: verbose_bp = .false. +! logical :: verbose_bp = .true. logical :: verbose_glp = .false. ! logical :: verbose_glp = .true. logical :: verbose_picard = .false. @@ -3591,7 +3593,7 @@ subroutine glissade_velo_higher_solve(model, & !WHL - debug !TODO - One diagnostic to write out column velocities for any approximation - if (whichapprox == HO_APPROX_BP .and. this_rank==rtest) then + if (verbose_bp .and. whichapprox == HO_APPROX_BP .and. this_rank==rtest) then write(iulog,*) ' ' i = itest j = jtest @@ -9424,7 +9426,7 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & if ( (k+kA >= 1 .and. k+kA <= nz) & .and. & (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & + .and. & (j+jA >= 1 .and. j+jA <= ny-1) ) then m = indxA_3d(iA,jA,kA) @@ -9475,6 +9477,11 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & L2_norm = parallel_reduce_sum(L2_norm) L2_norm = sqrt(L2_norm) +!! sum_resid_u_sq = parallel_global_sum(bu*bu, parallel, active_vertex) +!! 
sum_resid_v_sq = parallel_global_sum(bv*bv, parallel, active_vertex) +!! L2_norm = parallel_global_sum(resid_sq, parallel) +!! L2_norm = sqrt(L2_norm) + if (verbose_residual) then if (this_rank==rtest) then diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 23c9ceaf..1d4df22a 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -48,8 +48,7 @@ module glissade_velo_higher_pcg use glimmer_log use profile, only: t_startf, t_stopf use cism_parallel, only: this_rank, main_task, & - parallel_type, staggered_parallel_halo, parallel_reduce_sum, & - parallel_global_sum_staggered + parallel_type, staggered_parallel_halo, parallel_global_sum_stagger implicit none @@ -60,8 +59,6 @@ module glissade_velo_higher_pcg logical, parameter :: verbose_pcg = .false. logical, parameter :: verbose_tridiag = .false. -!! logical, parameter :: verbose_pcg = .true. -!! logical, parameter :: verbose_tridiag = .true. contains @@ -322,11 +319,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - L2_rhs, & - work0u, work0v) + L2_rhs = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_init") ! 
take square root @@ -387,11 +380,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - eta1, & - work0u, work0v) + eta1 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") !WHL - If the SIA solver has failed due to singular matrices, @@ -447,11 +436,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - eta2, & - work0u, work0v) + eta2 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") ! Compute alpha @@ -519,11 +504,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - L2_resid, & - work0u, work0v) + L2_resid = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_resid") ! take square root @@ -753,11 +734,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - L2_rhs, & - work0u, work0v) + L2_rhs = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_init") ! 
take square root @@ -810,11 +787,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - eta1, & - work0u, work0v) + eta1 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") !WHL - If the SIA solver has failed due to singular matrices, @@ -870,11 +843,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - eta2, & - work0u, work0v) + eta2 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") ! Compute alpha @@ -942,11 +911,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - L2_resid, & - work0u, work0v) + L2_resid = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_resid") ! take square root @@ -1428,11 +1393,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - bb, & - worku, workv) + bb = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_init") ! take square root @@ -1626,11 +1587,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Find global sums of (r,z) and (d,q) call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) call t_stopf("pcg_glbsum_iter") !---- Halo update for q @@ -1805,11 +1762,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! this is the one MPI global reduction per iteration. 
call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) call t_stopf("pcg_glbsum_iter") !---- Halo update for Az @@ -1911,11 +1864,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - rr, & - worku, workv) + rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") L2_resid = sqrt(rr) ! L2 norm of residual @@ -2255,6 +2204,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & denom_u, denom_v) elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + !TODO - Figure out why these calculations depend on the number of cores. Halo bug? ! Allocate tridiagonal matrices ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. @@ -2365,11 +2315,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - bb, & - worku, workv) + bb = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_init") ! take square root @@ -2450,7 +2396,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !WHL - debug if (verbose_pcg .and. this_rank == rtest) then - i = itest +! i = itest ! write(iulog,*) ' ' ! write(iulog,*) 'zv solve with diagonal precond, this_rank, i =', this_rank, i ! write(iulog,*) 'j, active, Adiagv, rv, zv, xv:' @@ -2496,14 +2442,14 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & zu, zv) ! solution !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) - enddo - endif + if (verbose_pcg .and. 
this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? @@ -2627,16 +2573,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & qu, qv) call t_stopf("pcg_matmult_iter") - !WHL - debug - usum = sum(qu(staggered_ilo:staggered_ihi,staggered_jlo:staggered_jhi)) - usum_global = parallel_reduce_sum(usum) - vsum = sum(qv(staggered_ilo:staggered_ihi,staggered_jlo:staggered_jhi)) - vsum_global = parallel_reduce_sum(vsum) - - if (verbose_pcg .and. this_rank == rtest) then -!! write(iulog,*) 'Prep: sum(qu), sum(qv) =', usum_global, vsum_global - endif - !---- Compute intermediate result for dot product (d,q) = (d,Ad) call t_startf("pcg_dotprod") @@ -2647,17 +2583,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Find global sums of (r,z) and (d,q) call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! nflds = 2 call t_stopf("pcg_glbsum_iter") - if (verbose_pcg .and. this_rank == rtest) then -!! write(iulog,*) 'Prep: gsum(1), gsum(2) =', gsum(1), gsum(2) - endif - !---- Halo update for q call t_startf("pcg_halo_iter") @@ -2686,16 +2614,16 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & rv(:,:) = rv(:,:) - alpha*qv(:,:) call t_stopf("pcg_vecupdate") - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' !! 
do i = itest-3, itest+3 - do i = staggered_ilo, staggered_ihi - write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) - enddo ! i - endif + do i = staggered_ilo, staggered_ihi + write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) + enddo ! i + endif !--------------------------------------------------------------- ! Iterate to solution @@ -2934,11 +2862,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! this is the one MPI global reduction per iteration. call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! nflds = 2 call t_stopf("pcg_glbsum_iter") !---- Halo update for Az @@ -3038,11 +2962,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - rr, & - worku, workv) + rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") L2_resid = sqrt(rr) ! L2 norm of residual From 37df31dac277793384aadda9ffd3e6c11ac5a69c Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Fri, 5 Dec 2025 12:51:55 -0700 Subject: [PATCH 03/21] Shortened some subroutine names in the cism_parallel module This commit changes subroutines distributed_gather_var_* to gather_var_*, and similarly changes distributed_scatter_var_* to scatter_var_*. This prevents some compilers from complaining about long subroutine names. The 'distributed' prefix in subroutines names goes back a decade or so to when CISM was first parallelized. Might at some point want to remove it from other subroutine names. 
--- libglide/isostasy_elastic.F90 | 12 +- libglimmer/parallel_mpi.F90 | 198 +++++++++++------------ libglint/glint_initialise.F90 | 4 +- libglint/glint_interp.F90 | 40 ++--- libglissade/glissade.F90 | 3 +- libglissade/glissade_basal_traction.F90 | 10 +- libglissade/glissade_glacier.F90 | 10 +- libglissade/glissade_velo_higher_pcg.F90 | 17 +- 8 files changed, 146 insertions(+), 148 deletions(-) diff --git a/libglide/isostasy_elastic.F90 b/libglide/isostasy_elastic.F90 index 60f2f55e..d309137d 100644 --- a/libglide/isostasy_elastic.F90 +++ b/libglide/isostasy_elastic.F90 @@ -136,7 +136,7 @@ subroutine calc_elastic(& !> the load for simulations on more than one task. use cism_parallel, only: this_rank, main_task, & - parallel_type, distributed_gather_var, distributed_scatter_var, parallel_halo + parallel_type, gather_var, scatter_var, parallel_halo implicit none @@ -182,8 +182,8 @@ subroutine calc_elastic(& ! Gather the local arrays onto the main task ! Note: global arrays are allocated in the subroutine - call distributed_gather_var(load_factors, load_factors_global, parallel) - call distributed_gather_var(load, load_global, parallel) + call gather_var(load_factors, load_factors_global, parallel) + call gather_var(load, load_global, parallel) if (main_task) then do j = 1, global_nsn @@ -207,9 +207,9 @@ subroutine calc_elastic(& ! Scatter the load values back to local arrays ! Note: The global array is deallocated in the subroutine - call distributed_scatter_var(load, load_global, parallel) + call scatter_var(load, load_global, parallel) - ! distributed_scatter_var does not update the halo, so do an update here + ! scatter_var does not update the halo, so do an update here call parallel_halo(load, parallel) ! 
Deallocate the other global array (which is intent(in) and does not need to be scattered) @@ -251,7 +251,7 @@ subroutine init_rbel(rbel, a) use isostasy_kelvin implicit none type(isos_elastic) :: rbel !> structure holding elastic litho data - real(dp), intent(in) :: a !> radius of disk + real(dp), intent(in) :: a !> radius of disk real(dp) :: dummy_a diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index 972b8b32..6afb1d9f 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -196,29 +196,29 @@ module cism_parallel module procedure broadcast_real8_1d end interface - interface distributed_gather_var - module procedure distributed_gather_var_integer_2d - module procedure distributed_gather_var_logical_2d - module procedure distributed_gather_var_real4_2d - module procedure distributed_gather_var_real4_3d - module procedure distributed_gather_var_real8_2d - module procedure distributed_gather_var_real8_3d + interface gather_var + module procedure gather_var_integer_2d + module procedure gather_var_logical_2d + module procedure gather_var_real4_2d + module procedure gather_var_real4_3d + module procedure gather_var_real8_2d + module procedure gather_var_real8_3d end interface - interface distributed_gather_var_row - module procedure distributed_gather_var_row_real8_2d + interface gather_var_row + module procedure gather_var_row_real8_2d end interface - interface distributed_gather_all_var_row - module procedure distributed_gather_all_var_row_real8_2d + interface gather_all_var_row + module procedure gather_all_var_row_real8_2d end interface - interface distributed_gather_var_col - module procedure distributed_gather_var_col_real8_2d + interface gather_var_col + module procedure gather_var_col_real8_2d end interface - interface distributed_gather_all_var_col - module procedure distributed_gather_all_var_col_real8_2d + interface gather_all_var_col + module procedure gather_all_var_col_real8_2d end interface interface 
distributed_get_var @@ -246,21 +246,21 @@ module cism_parallel module procedure distributed_put_var_real8_3d end interface - interface distributed_scatter_var - module procedure distributed_scatter_var_integer_2d - module procedure distributed_scatter_var_logical_2d - module procedure distributed_scatter_var_real4_2d - module procedure distributed_scatter_var_real4_3d - module procedure distributed_scatter_var_real8_2d - module procedure distributed_scatter_var_real8_3d + interface scatter_var + module procedure scatter_var_integer_2d + module procedure scatter_var_logical_2d + module procedure scatter_var_real4_2d + module procedure scatter_var_real4_3d + module procedure scatter_var_real8_2d + module procedure scatter_var_real8_3d end interface - interface distributed_scatter_var_row - module procedure distributed_scatter_var_row_real8_2d + interface scatter_var_row + module procedure scatter_var_row_real8_2d end interface - interface distributed_scatter_var_col - module procedure distributed_scatter_var_col_real8_2d + interface scatter_var_col + module procedure scatter_var_col_real8_2d end interface interface parallel_boundary_value @@ -631,10 +631,10 @@ end function distributed_execution !======================================================================= - ! subroutines belonging to the distributed_gather_var interface + ! subroutines belonging to the gather_var interface ! WHL, July 2019: - ! There is an issue with allocating the global_values array in the distributed_gather_var_*, + ! There is an issue with allocating the global_values array in the gather_var_*, ! distributed_get_var_*, distributed_print_*, and distributed_put_var_* functions and subroutines ! when computing only on active blocks (compute_blocks = 1). ! This array is allocated based on the max and min of ewlb, ewub, nslb, and nsub over the global domain. @@ -646,7 +646,7 @@ end function distributed_execution ! 
global_minval_nslb, and global_maxval_nsub, which are now computed at initialization ! based on the bounds in all blocks (including inactive blocks), not just active blocks. - subroutine distributed_gather_var_integer_2d(values, global_values, parallel) + subroutine gather_var_integer_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -679,7 +679,7 @@ subroutine distributed_gather_var_integer_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -751,10 +751,10 @@ subroutine distributed_gather_var_integer_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_integer_2d + end subroutine gather_var_integer_2d - subroutine distributed_gather_var_logical_2d(values, global_values, parallel) + subroutine gather_var_logical_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -787,7 +787,7 @@ subroutine distributed_gather_var_logical_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -858,10 +858,10 @@ subroutine distributed_gather_var_logical_2d(values, global_values, parallel) end associate ! 
automatic deallocation - end subroutine distributed_gather_var_logical_2d + end subroutine gather_var_logical_2d - subroutine distributed_gather_var_real4_2d(values, global_values, parallel) + subroutine gather_var_real4_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -894,7 +894,7 @@ subroutine distributed_gather_var_real4_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -965,10 +965,10 @@ subroutine distributed_gather_var_real4_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_real4_2d + end subroutine gather_var_real4_2d - subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, ud1) + subroutine gather_var_real4_3d(values, global_values, parallel, ld1, ud1) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -1002,7 +1002,7 @@ subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -1091,10 +1091,10 @@ subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, end associate ! 
automatic deallocation - end subroutine distributed_gather_var_real4_3d + end subroutine gather_var_real4_3d - subroutine distributed_gather_var_real8_2d(values, global_values, parallel) + subroutine gather_var_real8_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -1127,7 +1127,7 @@ subroutine distributed_gather_var_real8_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -1199,10 +1199,10 @@ subroutine distributed_gather_var_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_real8_2d + end subroutine gather_var_real8_2d - subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, ud1) + subroutine gather_var_real8_3d(values, global_values, parallel, ld1, ud1) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -1236,7 +1236,7 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -1325,16 +1325,16 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, end associate ! 
automatic deallocation - end subroutine distributed_gather_var_real8_3d + end subroutine gather_var_real8_3d !======================================================================= - ! subroutines belonging to the distributed_gather_var_row interface + ! subroutines belonging to the gather_var_row interface - subroutine distributed_gather_var_row_real8_2d(values, global_values, parallel) + subroutine gather_var_row_real8_2d(values, global_values, parallel) ! Gather data along a row of tasks onto the main task for that row. - ! Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_ewn. ! The second index represents the north-south dimension, and is assumed @@ -1369,7 +1369,7 @@ subroutine distributed_gather_var_row_real8_2d(values, global_values, parallel) if (size(values,2) /= own_nsn) then ! Note: Removing this restriction would require some recoding below. - write(*,*) "ERROR: distributed_gather_var_row requires N-S array size of own_nsn" + write(*,*) "ERROR: gather_var_row requires N-S array size of own_nsn" write(*,*) 'rank, own_nsn, size(values,2) =', this_rank, own_nsn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1464,16 +1464,16 @@ subroutine distributed_gather_var_row_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_row_real8_2d + end subroutine gather_var_row_real8_2d !======================================================================= - ! subroutines belonging to the distributed_gather_all_var_row interface + ! subroutines belonging to the gather_all_var_row interface - subroutine distributed_gather_all_var_row_real8_2d(values, global_values, parallel) + subroutine gather_all_var_row_real8_2d(values, global_values, parallel) ! Gather global data along a row of tasks onto each task for that row. - ! 
Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_ewn. ! The second index represents the north-south dimension, and is assumed @@ -1511,7 +1511,7 @@ subroutine distributed_gather_all_var_row_real8_2d(values, global_values, parall ! TODO: Do this recoding. This subroutine currently fails with outflow BC, because ! the southern and western rows of tasks have an extra locally owned vertex, ! giving size(values,2) = own_nsn + 1 - write(*,*) "ERROR: distributed_gather_var_row requires N-S array size of own_nsn" + write(*,*) "ERROR: gather_var_row requires N-S array size of own_nsn" write(*,*) 'rank, own_nsn, size(values,2) =', this_rank, own_nsn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1586,16 +1586,16 @@ subroutine distributed_gather_all_var_row_real8_2d(values, global_values, parall end associate ! automatic deallocation - end subroutine distributed_gather_all_var_row_real8_2d + end subroutine gather_all_var_row_real8_2d !======================================================================= - ! subroutines belonging to the distributed_gather_var_col interface + ! subroutines belonging to the gather_var_col interface - subroutine distributed_gather_var_col_real8_2d(values, global_values, parallel) + subroutine gather_var_col_real8_2d(values, global_values, parallel) ! Gather data along a column of tasks onto the main task for that column. - ! Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_nsn. ! The second index represents the east-west dimension, and is assumed @@ -1630,7 +1630,7 @@ subroutine distributed_gather_var_col_real8_2d(values, global_values, parallel) if (size(values,2) /= own_ewn) then ! 
Note: Removing this restriction would require some recoding below. - write(*,*) "ERROR: distributed_gather_var_row requires E-W array size of own_ewn" + write(*,*) "ERROR: gather_var_row requires E-W array size of own_ewn" write(*,*) 'rank, own_ewn, size(values,2) =', this_rank, own_ewn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1726,16 +1726,16 @@ subroutine distributed_gather_var_col_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_col_real8_2d + end subroutine gather_var_col_real8_2d !======================================================================= - ! subroutines belonging to the distributed_gather_all_var_col interface + ! subroutines belonging to the gather_all_var_col interface - subroutine distributed_gather_all_var_col_real8_2d(values, global_values, parallel) + subroutine gather_all_var_col_real8_2d(values, global_values, parallel) ! Gather global data along a column of tasks onto each task for that column. - ! Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_nsn. ! The second index represents the east-west dimension, and is assumed @@ -1770,7 +1770,7 @@ subroutine distributed_gather_all_var_col_real8_2d(values, global_values, parall if (size(values,2) /= own_ewn) then ! Note: Removing this restriction would require some recoding below. - write(*,*) "ERROR: distributed_gather_var_row requires E-W array size of own_ewn" + write(*,*) "ERROR: gather_var_row requires E-W array size of own_ewn" write(*,*) 'rank, own_ewn, size(values,2) =', this_rank, own_ewn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1845,7 +1845,7 @@ subroutine distributed_gather_all_var_col_real8_2d(values, global_values, parall end associate ! 
automatic deallocation - end subroutine distributed_gather_all_var_col_real8_2d + end subroutine gather_all_var_col_real8_2d !======================================================================= @@ -4334,9 +4334,9 @@ end function distributed_put_var_real8_3d !======================================================================= - ! subroutines belonging to the distributed_scatter_var interface + ! subroutines belonging to the scatter_var interface - subroutine distributed_scatter_var_integer_2d(values, global_values, parallel) + subroutine scatter_var_integer_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4366,7 +4366,7 @@ subroutine distributed_scatter_var_integer_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4421,10 +4421,10 @@ subroutine distributed_scatter_var_integer_2d(values, global_values, parallel) deallocate(global_values) ! TODO - Is this deallocation necessary, here and below? ! automatic deallocation - end subroutine distributed_scatter_var_integer_2d + end subroutine scatter_var_integer_2d - subroutine distributed_scatter_var_logical_2d(values, global_values, parallel) + subroutine scatter_var_logical_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4454,7 +4454,7 @@ subroutine distributed_scatter_var_logical_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! 
Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4509,10 +4509,10 @@ subroutine distributed_scatter_var_logical_2d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_logical_2d + end subroutine scatter_var_logical_2d - subroutine distributed_scatter_var_real4_2d(values, global_values, parallel) + subroutine scatter_var_real4_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4542,7 +4542,7 @@ subroutine distributed_scatter_var_real4_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4597,10 +4597,10 @@ subroutine distributed_scatter_var_real4_2d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real4_2d + end subroutine scatter_var_real4_2d - subroutine distributed_scatter_var_real4_3d(values, global_values, parallel) + subroutine scatter_var_real4_3d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4630,7 +4630,7 @@ subroutine distributed_scatter_var_real4_3d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! 
Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4687,10 +4687,10 @@ subroutine distributed_scatter_var_real4_3d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real4_3d + end subroutine scatter_var_real4_3d - subroutine distributed_scatter_var_real8_2d(values, global_values, parallel) + subroutine scatter_var_real8_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4720,7 +4720,7 @@ subroutine distributed_scatter_var_real8_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4775,10 +4775,10 @@ subroutine distributed_scatter_var_real8_2d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real8_2d + end subroutine scatter_var_real8_2d - subroutine distributed_scatter_var_real8_3d(values, global_values, parallel, deallocflag) + subroutine scatter_var_real8_3d(values, global_values, parallel, deallocflag) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4810,7 +4810,7 @@ subroutine distributed_scatter_var_real8_3d(values, global_values, parallel, dea if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! 
Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4874,16 +4874,16 @@ subroutine distributed_scatter_var_real8_3d(values, global_values, parallel, dea if (deallocmem) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real8_3d + end subroutine scatter_var_real8_3d !======================================================================= - ! subroutines belonging to the distributed_scatter_var_row interface + ! subroutines belonging to the scatter_var_row interface - subroutine distributed_scatter_var_row_real8_2d(values, global_values, parallel) + subroutine scatter_var_row_real8_2d(values, global_values, parallel) ! Scatter data to a row of tasks from the main task for that row. - ! Based on distributed_scatter_var_real8_2d. + ! Based on scatter_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_ewn. ! The second index represents the north-south dimension, and is assumed @@ -4917,7 +4917,7 @@ subroutine distributed_scatter_var_row_real8_2d(values, global_values, parallel) if (size(values,2) /= own_nsn) then ! Note: Removing this restriction would require some recoding below. - write(iulog,*) "ERROR: distributed_scatter_var_row requires N-S array size of own_nsn" + write(iulog,*) "ERROR: scatter_var_row requires N-S array size of own_nsn" call parallel_stop(__FILE__, __LINE__) end if @@ -4974,16 +4974,16 @@ subroutine distributed_scatter_var_row_real8_2d(values, global_values, parallel) end associate ! 
automatic deallocation - end subroutine distributed_scatter_var_row_real8_2d + end subroutine scatter_var_row_real8_2d !======================================================================= - ! subroutines belonging to the distributed_scatter_var_col interface + ! subroutines belonging to the scatter_var_col interface - subroutine distributed_scatter_var_col_real8_2d(values, global_values, parallel) + subroutine scatter_var_col_real8_2d(values, global_values, parallel) ! Scatter data to a column of tasks from the main task for that column - ! Based on distributed_scatter_var_real8_2d. + ! Based on scatter_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_nsn. ! The second index represents the east-west dimension, and is assumed @@ -5016,7 +5016,7 @@ subroutine distributed_scatter_var_col_real8_2d(values, global_values, parallel) if (size(values,2) /= own_ewn) then ! Note: Removing this restriction would require some recoding below. - write(iulog,*) "ERROR: distributed_scatter_var_col requires E-W array size of own_nsn" + write(iulog,*) "ERROR: scatter_var_col requires E-W array size of own_nsn" call parallel_stop(__FILE__, __LINE__) end if @@ -5073,7 +5073,7 @@ subroutine distributed_scatter_var_col_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_scatter_var_col_real8_2d + end subroutine scatter_var_col_real8_2d !======================================================================= @@ -9402,7 +9402,7 @@ subroutine parallel_test_comm_row_col(parallel) enddo endif ! this_rank - call distributed_gather_var_row(test_array, global_test_array, parallel) + call gather_var_row(test_array, global_test_array, parallel) !! if (parallel%main_task_row) then if (parallel%main_task_row .and. 
this_rank == 0) then @@ -9416,7 +9416,7 @@ subroutine parallel_test_comm_row_col(parallel) write(iulog,*) ' ' endif - call distributed_scatter_var_row(test_array, global_test_array, parallel) + call scatter_var_row(test_array, global_test_array, parallel) if (this_rank == 0) then write(iulog,*) ' ' @@ -9456,7 +9456,7 @@ subroutine parallel_test_comm_row_col(parallel) enddo endif ! this_rank - call distributed_gather_var_col(test_array, global_test_array, parallel) + call gather_var_col(test_array, global_test_array, parallel) !! if (parallel%main_task_col) then if (parallel%main_task_col .and. this_rank == 0) then @@ -9470,7 +9470,7 @@ subroutine parallel_test_comm_row_col(parallel) write(iulog,*) ' ' endif - call distributed_scatter_var_col(test_array, global_test_array, parallel) + call scatter_var_col(test_array, global_test_array, parallel) if (this_rank == 0) then write(iulog,*) ' ' diff --git a/libglint/glint_initialise.F90 b/libglint/glint_initialise.F90 index a94a105e..79cd9038 100644 --- a/libglint/glint_initialise.F90 +++ b/libglint/glint_initialise.F90 @@ -607,7 +607,7 @@ subroutine setup_lgrid_fulldomain(instance, grid, grid_orog) use glint_global_grid , only : global_grid use glimmer_coordinates, only : coordsystem_new use glide_types , only : get_dew, get_dns - use cism_parallel , only : parallel_type, distributed_gather_var + use cism_parallel , only : parallel_type, gather_var implicit none @@ -630,7 +630,7 @@ subroutine setup_lgrid_fulldomain(instance, grid, grid_orog) global_ewn = instance%model%parallel%global_ewn global_nsn = instance%model%parallel%global_nsn - call distributed_gather_var(instance%out_mask, out_mask_fulldomain, parallel) + call gather_var(instance%out_mask, out_mask_fulldomain, parallel) if (main_task) then diff --git a/libglint/glint_interp.F90 b/libglint/glint_interp.F90 index 83491b99..5ef84eb0 100644 --- a/libglint/glint_interp.F90 +++ b/libglint/glint_interp.F90 @@ -248,7 +248,7 @@ subroutine interp_to_local 
(lgrid_fulldomain, global, & use glimmer_utils use glimmer_coordinates use glimmer_log - use cism_parallel, only : main_task, parallel_type, distributed_scatter_var, parallel_halo + use cism_parallel, only : main_task, parallel_type, scatter_var, parallel_halo !TODO - Not sure we need localsp now that the code is fully double precision @@ -299,9 +299,9 @@ subroutine interp_to_local (lgrid_fulldomain, global, & ! Allocate variables to hold result of interpolation ! We allocate size 0 arrays on non-main task (rather than leaving variables - ! unallocated there), because distributed_scatter_var tries to do a deallocate on all tasks + ! unallocated there), because scatter_var tries to do a deallocate on all tasks ! Note that coordsystem_allocate can't be used here because it only works on pointer - ! variables, and the *_fulldomain variables are non-pointers (as is required for distributed_scatter_var) + ! variables, and the *_fulldomain variables are non-pointers (as is required for scatter_var) if (present(localsp)) then if (main_task) then @@ -450,25 +450,25 @@ subroutine interp_to_local (lgrid_fulldomain, global, & end if ! main_task ! Main task scatters interpolated data from the full domain to the task owning each point - ! Note that distributed_scatter_var doesn't set halo values, so we need to do a halo + ! Note that scatter_var doesn't set halo values, so we need to do a halo ! update if it's important to have correct values in the halo cells. ! Although it's not strictly necessary to have the halo values, we compute them just in ! case another part of the code (e.g., glissade_temp) assumes they are available. 
if (present(localsp)) then localsp(:,:) = 0.d0 - call distributed_scatter_var(localsp, localsp_fulldomain, parallel) + call scatter_var(localsp, localsp_fulldomain, parallel) call parallel_halo(localsp, parallel) endif if (present(localdp)) then localdp(:,:) = 0.d0 - call distributed_scatter_var(localdp, localdp_fulldomain, parallel) + call scatter_var(localdp, localdp_fulldomain, parallel) call parallel_halo(localdp, parallel) endif ! We do NOT deallocate the local*_fulldomain variables here, because the - ! distributed_scatter_var routines do this deallocation + ! scatter_var routines do this deallocation end subroutine interp_to_local @@ -489,7 +489,7 @@ subroutine copy_to_local (lgrid_fulldomain, global, & ! on the main task. use glimmer_coordinates - use cism_parallel, only : main_task, parallel_type, distributed_scatter_var, parallel_halo + use cism_parallel, only : main_task, parallel_type, scatter_var, parallel_halo ! Argument declarations @@ -524,16 +524,16 @@ subroutine copy_to_local (lgrid_fulldomain, global, & end if ! Main task scatters interpolated data from the full domain to the task owning each point - ! Note that distributed_scatter_var doesn't set halo values, so we need to do a halo + ! Note that scatter_var doesn't set halo values, so we need to do a halo ! update if it's important to have correct values in the halo cells. ! Although it's not strictly necessary to have the halo values, we compute them just in ! case another part of the code (e.g., glissade_temp) assumes they are available. local(:,:) = 0.d0 - call distributed_scatter_var(local, local_fulldomain, parallel) + call scatter_var(local, local_fulldomain, parallel) call parallel_halo(local, parallel) - ! We do NOT deallocate local_fulldomain here, because the distributed_scatter_var + ! We do NOT deallocate local_fulldomain here, because the scatter_var ! 
routine does this deallocation end subroutine copy_to_local @@ -695,7 +695,7 @@ subroutine local_to_global_avg(ups, parallel, local, global, mask) !> \texttt{interp\_to\_local} routine. !> \end{itemize} - use cism_parallel, only : main_task, parallel_type, distributed_gather_var + use cism_parallel, only : main_task, parallel_type, gather_var ! Arguments @@ -729,8 +729,8 @@ subroutine local_to_global_avg(ups, parallel, local, global, mask) ! Gather 'local' and 'tempmask' onto main task, which is the only one that does the regridding - call distributed_gather_var(local, local_fulldomain, parallel) - call distributed_gather_var(tempmask, tempmask_fulldomain, parallel) + call gather_var(local, local_fulldomain, parallel) + call gather_var(tempmask, tempmask_fulldomain, parallel) ! Main task does regridding @@ -785,7 +785,7 @@ subroutine local_to_global_sum(ups, parallel, local, global, mask) !> \item \texttt{gboxn} is the same size as \texttt{global} !> \end{itemize} - use cism_parallel, only : main_task, parallel_type, distributed_gather_var + use cism_parallel, only : main_task, parallel_type, gather_var ! Arguments @@ -816,8 +816,8 @@ subroutine local_to_global_sum(ups, parallel, local, global, mask) ! Gather 'local' and 'tempmask' onto main task, which is the only one that does the regridding - call distributed_gather_var(local, local_fulldomain, parallel) - call distributed_gather_var(tempmask, tempmask_fulldomain, parallel) + call gather_var(local, local_fulldomain, parallel) + call gather_var(tempmask, tempmask_fulldomain, parallel) ! Main task does regridding if (main_task) then @@ -854,7 +854,7 @@ subroutine local_to_global_min(ups, parallel, local, global, mask) !> \item \texttt{gboxn} is the same size as \texttt{global} !> \end{itemize} - use cism_parallel, only : main_task, parallel_type, distributed_gather_var + use cism_parallel, only : main_task, parallel_type, gather_var ! 
Arguments @@ -885,8 +885,8 @@ subroutine local_to_global_min(ups, parallel, local, global, mask) ! Gather 'local' and 'tempmask' onto main task, which is the only one that does the regridding - call distributed_gather_var(local, local_fulldomain, parallel) - call distributed_gather_var(tempmask, tempmask_fulldomain, parallel) + call gather_var(local, local_fulldomain, parallel) + call gather_var(tempmask, tempmask_fulldomain, parallel) ! Main task does regridding if (main_task) then diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index 6a11c5ac..72bab6f4 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -92,8 +92,7 @@ subroutine glissade_initialise(model, evolve_ice) ! initialise Glissade model instance - use cism_parallel, only: parallel_type, distributed_gather_var, & - distributed_scatter_var, parallel_finalise, & + use cism_parallel, only: parallel_type, parallel_finalise, & distributed_grid, distributed_grid_active_blocks, parallel_global_edge_mask, & parallel_halo, parallel_halo_extrapolate, parallel_reduce_max, & staggered_parallel_halo_extrapolate, staggered_no_penetration_mask, & diff --git a/libglissade/glissade_basal_traction.F90 b/libglissade/glissade_basal_traction.F90 index a5142dab..3ffcca40 100644 --- a/libglissade/glissade_basal_traction.F90 +++ b/libglissade/glissade_basal_traction.F90 @@ -52,7 +52,7 @@ module glissade_basal_traction use glimmer_utils, only: point_diag use glide_types use cism_parallel, only : this_rank, main_task, parallel_type, & - parallel_halo, staggered_parallel_halo, parallel_globalindex, distributed_scatter_var + parallel_halo, staggered_parallel_halo, parallel_globalindex, scatter_var implicit none @@ -347,7 +347,7 @@ subroutine glissade_calcbeta (& ! The following code sets beta on the full grid as prescribed by Pattyn et al. (2008). ! Allocate a global array on the main task only. - ! 
On other tasks, allocate a size 0 array, since distributed_scatter_var wants to deallocate on all tasks. + ! On other tasks, allocate a size 0 array, since scatter_var wants to deallocate on all tasks. if (main_task) then allocate(beta_global(parallel%global_ewn, parallel%global_nsn)) else @@ -377,9 +377,9 @@ subroutine glissade_calcbeta (& ! Note: beta_extend has dimensions (ewn,nsn), so it can receive scattered data from beta_global. allocate(beta_extend(ewn, nsn)) beta_extend(:,:) = 0.d0 - call distributed_scatter_var(beta_extend, beta_global, parallel) + call scatter_var(beta_extend, beta_global, parallel) - ! distributed_scatter_var does not update the halo, so do an update here + ! scatter_var does not update the halo, so do an update here call parallel_halo(beta_extend, parallel) ! Copy beta_extend to beta on the local processor. @@ -391,7 +391,7 @@ subroutine glissade_calcbeta (& enddo enddo - ! beta_extend is no longer needed (beta_global is deallocated in distributed_scatter_var) + ! beta_extend is no longer needed (beta_global is deallocated in scatter_var) deallocate(beta_extend) case(HO_BABC_BETA_EXTERNAL) ! use beta value from external file diff --git a/libglissade/glissade_glacier.F90 b/libglissade/glissade_glacier.F90 index b896beb6..67744610 100644 --- a/libglissade/glissade_glacier.F90 +++ b/libglissade/glissade_glacier.F90 @@ -74,7 +74,7 @@ subroutine glissade_glacier_init(model, glacier) ! Another array, cism_to_rgi_glacier_id, identifies the RGI ID associated with each CISM ID. ! The CISM input file contains the RGI IDs. - use cism_parallel, only: distributed_gather_var, distributed_scatter_var, & + use cism_parallel, only: gather_var, scatter_var, & parallel_global_sum, parallel_reduce_max, parallel_reduce_min, parallel_is_zero, & broadcast, parallel_halo, staggered_parallel_halo, parallel_globalindex @@ -228,10 +228,10 @@ subroutine glissade_glacier_init(model, glacier) ! 
Gather the RGI glacier IDs to the main task if (main_task) allocate(rgi_glacier_id_global(global_ewn, global_nsn)) - call distributed_gather_var(glacier%rgi_glacier_id, rgi_glacier_id_global, parallel) + call gather_var(glacier%rgi_glacier_id, rgi_glacier_id_global, parallel) ! Allocate a global array for the CISM glacier IDs on the main task. - ! Allocate a size 0 array on other tasks; distributed_scatter_var wants arrays allocated on all tasks. + ! Allocate a size 0 array on other tasks; scatter_var wants arrays allocated on all tasks. if (main_task) then allocate(cism_glacier_id_global(global_ewn,global_nsn)) else @@ -377,8 +377,8 @@ subroutine glissade_glacier_init(model, glacier) endif ! main_task ! Scatter cism_glacier_id_global to all processors - ! Note: This global array is deallocated in the distributed_scatter_var subroutine - call distributed_scatter_var(glacier%cism_glacier_id, cism_glacier_id_global, parallel) + ! Note: This global array is deallocated in the scatter_var subroutine + call scatter_var(glacier%cism_glacier_id, cism_glacier_id_global, parallel) call parallel_halo(glacier%cism_glacier_id, parallel) diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 1d4df22a..8d31e098 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -4557,9 +4557,8 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & first_time, gather_data) use glimmer_utils, only: tridiag - use cism_parallel, only: distributed_gather_var_row, distributed_gather_var_col, & - distributed_gather_all_var_row, distributed_gather_all_var_col, & - distributed_scatter_var_row, distributed_scatter_var_col + use cism_parallel, only: gather_var_row, gather_var_col, & + gather_all_var_row, gather_all_var_col, scatter_var_row, scatter_var_col integer, intent(in) :: & ilocal, jlocal ! 
size of input/output arrays; number of locally owned vertices in each direction @@ -4784,11 +4783,11 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & if (tridiag_solver_flag == 'row') then call t_startf("pcg_tridiag_gather_row") - call distributed_gather_all_var_row(outdata, gather_data2, parallel) + call gather_all_var_row(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_row") elseif (tridiag_solver_flag == 'col') then call t_startf("pcg_tridiag_gather_col") - call distributed_gather_all_var_col(outdata, gather_data2, parallel) + call gather_all_var_col(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_col") endif @@ -4799,11 +4798,11 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & if (tridiag_solver_flag == 'row') then call t_startf("pcg_tridiag_gather_row") - call distributed_gather_var_row(outdata, gather_data2, parallel) + call gather_var_row(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_row") elseif (tridiag_solver_flag == 'col') then call t_startf("pcg_tridiag_gather_col") - call distributed_gather_var_col(outdata, gather_data2, parallel) + call gather_var_col(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_col") endif @@ -4918,11 +4917,11 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & if (tridiag_solver_flag == 'row') then call t_startf("pcg_tridiag_scatter_row") - call distributed_scatter_var_row(local_coeffs, global_coeffs, parallel) + call scatter_var_row(local_coeffs, global_coeffs, parallel) call t_stopf ("pcg_tridiag_scatter_row") elseif (tridiag_solver_flag == 'col') then call t_startf("pcg_tridiag_scatter_col") - call distributed_scatter_var_col(local_coeffs, global_coeffs, parallel) + call scatter_var_col(local_coeffs, global_coeffs, parallel) call t_stopf ("pcg_tridiag_scatter_col") endif From 0aae0b7816593a975feb824aecc49ac8efca53cb Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Fri, 19 Dec 2025 16:34:12 -0700 Subject: [PATCH 04/21] More 
an option to call subroutine parallel_reduce_reprosum in lieu of parallel_reduce_sum.
These subroutines take an input array of dp variables, convert each variable to a character string corresponding to the 64-bit binary representation, and write that string to a file. This can be useful for debugging if we suspect that the values of some array elements depend on processor count, but we don't know which elements. The real8_3d version has an optional argument to make it work for either (k,i,j) arrays or (i,j,k) arrays. --- libglide/glide_diagnostics.F90 | 26 +- libglide/glide_setup.F90 | 3 + libglimmer/cism_reprosum_mod.F90 | 131 +----- libglimmer/glimmer_utils.F90 | 85 ++++ libglimmer/parallel_mpi.F90 | 570 +++++++++++++++++++---- libglissade/glissade.F90 | 28 +- libglissade/glissade_basal_water.F90 | 13 + libglissade/glissade_utils.F90 | 115 ++++- libglissade/glissade_velo_higher.F90 | 447 +++++++++++++----- libglissade/glissade_velo_higher_pcg.F90 | 176 +++---- utils/build/generate_ncvars.py | 8 +- 11 files changed, 1142 insertions(+), 460 deletions(-) diff --git a/libglide/glide_diagnostics.F90 b/libglide/glide_diagnostics.F90 index cd72557c..8aec8052 100644 --- a/libglide/glide_diagnostics.F90 +++ b/libglide/glide_diagnostics.F90 @@ -45,6 +45,8 @@ module glide_diagnostics implicit none + logical, parameter :: verbose_diagnostics = .false. + contains subroutine glide_write_diagnostics (model, time, & @@ -64,8 +66,6 @@ subroutine glide_write_diagnostics (model, time, & ! local arguments - logical, parameter :: verbose_diagnostics = .false. - ! debug if (main_task .and. verbose_diagnostics) then write(iulog,*) ' ' @@ -234,6 +234,9 @@ subroutine glide_write_diag (model, time) grounded_mask, & ! = 1 where ice is present and grounded, else = 0 glacier_ice_mask ! = 1 where glacier ice is present, initially and/or currently + integer, dimension(model%general%ewn-1,model%general%nsn-1) :: & + stag_ice_mask ! staggered mask; = 1 if ice_mask = 1 for any of the four neighbors + real(dp), dimension(model%general%upn) :: & temp_diag, & ! 
Note: sfc temp not included if temps are staggered ! (use artm instead) @@ -275,9 +278,11 @@ subroutine glide_write_diag (model, time) real(dp), dimension(model%general%ewn, model%general%nsn) :: & mass_above_flotation,& ! ice mass above flotation (kg) - velo_sfc, & ! surface ice speed (m/s) thck_obs ! observed ice thickness (m), derived from usrf_obs and topg + real(dp), dimension(model%general%ewn-1, model%general%nsn-1) :: & + velo_sfc ! surface ice speed (m/s) + real(dp), dimension(:,:,:), allocatable :: & local_energy ! internal energy (J) per layer in a column @@ -355,6 +360,17 @@ subroutine glide_write_diag (model, time) enddo enddo + do j = 1, nsn-1 + do i = 1, ewn-1 + if (ice_mask(i,j+1) == 1 .or. ice_mask(i+1,j+1) == 1 .or. & + ice_mask(i,j) == 1 .or. ice_mask(i+1,j) == 1) then + stag_ice_mask(i,j) = 1 + else + stag_ice_mask(i,j) = 0 + endif + enddo + enddo + !----------------------------------------------------------------- ! Compute and write global diagnostics !----------------------------------------------------------------- @@ -897,8 +913,8 @@ subroutine glide_write_diag (model, time) + model%velocity%vvel(1,:,:)**2) call glissade_rms_error(& - ewn, nsn, & - ice_mask, & + ewn-1, nsn-1, & + stag_ice_mask, & parallel, & velo_sfc * scyr, & ! m/yr model%velocity%velo_sfc_obs * scyr, & ! 
m/yr diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90 index 3b2aa1e4..d28088be 100644 --- a/libglide/glide_setup.F90 +++ b/libglide/glide_setup.F90 @@ -2699,6 +2699,9 @@ subroutine print_parameters(model) if (model%options%which_ho_babc == HO_BABC_BETA_CONSTANT) then write(message,*) 'uniform beta (Pa yr/m) : ',model%basal_physics%ho_beta_const call write_log(message) + elseif (model%options%which_ho_babc == HO_BABC_BETA_LARGE) then + write(message,*) 'large beta (Pa yr/m) : ',model%basal_physics%ho_beta_large + call write_log(message) elseif (model%options%which_ho_babc == HO_BABC_BETA_BPMP) then write(message,*) 'large (frozen) beta (Pa yr/m) : ',model%basal_physics%ho_beta_large call write_log(message) diff --git a/libglimmer/cism_reprosum_mod.F90 b/libglimmer/cism_reprosum_mod.F90 index 55a034c1..2f071f82 100644 --- a/libglimmer/cism_reprosum_mod.F90 +++ b/libglimmer/cism_reprosum_mod.F90 @@ -42,16 +42,16 @@ module cism_reprosum_mod use glimmer_global, only: r8 => dp use glimmer_global, only: i8 use glimmer_paramets, only: iulog -!! use cism_parallel, only: main_task + use glimmer_utils, only: double_to_binary ! use shr_log_mod, only: s_loglev => shr_log_Level ! use shr_log_mod, only: s_logunit => shr_log_Unit ! use shr_sys_mod, only: shr_sys_abort - use cism_infnan_mod,only: cism_infnan_inf_type, assignment(=), & cism_infnan_posinf, cism_infnan_neginf, & cism_infnan_nan, & cism_infnan_isnan, cism_infnan_isinf, & cism_infnan_isposinf, cism_infnan_isneginf + #ifdef TIMING use perf_mod #endif @@ -456,21 +456,14 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & real(r8) :: abs_diff ! absolute difference between ! fixed and floating point ! sums + character(len=64) :: binary_str ! string to represent 64 bits of i8 integer + integer :: n #ifdef _OPENMP integer omp_get_max_threads external omp_get_max_threads #endif - !WHL - debug - logical :: test_sum = .false. 
- real(r8), dimension(dsummands,nflds) :: arr_test - arr_test(:,:) = 0.0d0 - call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) 'In cism_reprosum_mod' - endif -! !----------------------------------------------------------------------- ! ! initialize local statistics variables @@ -505,10 +498,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & !TODO - Remove the inf_nan option; assume abort_inf_nan = T, as in CICE if (abort_inf_nan) then - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) ' abort_inf_nan check' - endif - ! check whether input contains NaNs or INFs, and abort if so nan_check = any(cism_infnan_isnan(arr)) inf_check = any(cism_infnan_isinf(arr)) @@ -587,9 +576,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & ! get number of MPI tasks call mpi_comm_size(mpi_comm, tasks, ierr) - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) 'Starting reprosum, tasks =', tasks - endif ! get number of OpenMP threads #ifdef _OPENMP omp_nthreads = omp_get_max_threads() @@ -847,12 +833,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & enddo endif - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) ' max_nsummands =', max_nsummands - write(iulog,*) ' max_levels(1), max_level =', max_levels(1), max_level - write(iulog,*) ' call reprosum_int, pid =', mypid - endif - ! calculate sum validate = .false. call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & @@ -944,13 +924,20 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & repro_sum_stats(6) = repro_sum_stats(6) + gbl_lor_red endif - !WHL - debug - if (verbose_reprosum .and. 
mypid == 0) then - write(iulog,*) 'Exiting cism_reprosum_calc, arr_gsum =', arr_gsum(:) + if (verbose_reprosum) then + call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) + if (mypid == 0) then + write(iulog,*) 'Exit reprosum, nflds =', nflds + write(iulog,*) ' n, arr_gsum, binary_str:' +!! do n = 1, nflds + do n = 1, min(2,nflds) + call double_to_binary(arr_gsum(n), binary_str) + write(iulog,*) n, arr_gsum(n), binary_str + enddo + endif endif end subroutine cism_reprosum_calc - ! !======================================================================== ! @@ -1079,13 +1066,6 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & !WHL - debug integer :: mypid, k call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) - - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) 'In cism_reprosum_int, pid, arr =', mypid, arr(:,:) - write(iulog,*) 'dsummands, nsummands =', dsummands, nsummands - write(iulog,*) 'size(arr) =', size(arr,1), size(arr,2) - endif - ! !----------------------------------------------------------------------- ! Save radix of i8 variables in an i8 variable @@ -1108,20 +1088,8 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & veclth = offset(nflds) + max_levels(nflds) ! split summand index range over OpenMP threads -! WHL - Should this be dsummands instead of nsummands? -!! call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) call split_indices(dsummands, omp_nthreads, isum_beg, isum_end) - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) 'pid, i8_radix, voffset, veclth, nflds =', mypid, i8_radix, voffset, veclth, nflds - write(iulog,*) ' max_levels =', max_levels(:) - write(iulog,*) ' dsummands, omp_nthreads =', dsummands, omp_nthreads - write(iulog,*) ' isum_beg/end =', isum_beg, isum_end -! write(iulog,*) ' size(i8_arr_lsum_level) = ', size(i8_arr_lsum_level) -! write(iulog,*) ' size(i8_arr_gsum_level) = ', size(i8_arr_gsum_level) - endif - - ! 
convert local summands to vector of integers and sum ! (Using scale instead of set_exponent because arr_remainder may not be ! "normal" after level 1 calculation) @@ -1152,27 +1120,12 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & do isum=isum_beg(ithread),isum_end(ithread) - if (verbose_reprosum .and. mypid == 1) then - write (iulog,*) ' pid, isum, ifld, arr(isum,ifld) =', mypid, isum, ifld, arr(isum,ifld) - endif - arr_remainder = 0.0_r8 - !WHL - debug - ! If isum_end > size(arr,1) = dsummands, then the next 'if' will try to access nonexistent memory. - ! Adding some logic to exit the do loop first. - ! Not necessary if split_indices is called with dsummands - !TODO: Check the split_indices logic for isum_end -!!!! if (isum > size(arr,1)) exit - if (arr(isum,ifld) .ne. 0.0_r8) then arr_exp = exponent(arr(isum,ifld)) arr_frac = fraction(arr(isum,ifld)) - if (verbose_reprosum) then -!!! write(iulog,*) ' pid, isum, arr_exp, arr_frac:', mypid, isum, arr_exp, arr_frac - endif - ! test that global maximum upper bound is an upper bound if (arr_exp > arr_gmax_exp(ifld)) then max_error(ifld,ithread) = 1 @@ -1197,10 +1150,6 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & ilevel = 1 endif - if (verbose_reprosum .and. mypid == 1) then - write (iulog,*) ' pid, ilevel, max_levels =', mypid, ilevel, max_levels(ifld) - endif - if (ilevel .le. max_levels(ifld)) then ! apply first shift/truncate, add it to the relevant running ! sum, and calculate the remainder. @@ -1209,30 +1158,17 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & i8_arr_tlsum_level(ilevel,ifld,ithread) = & i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level arr_remainder = arr_remainder - i8_arr_level - - if (verbose_reprosum .and. mypid <= 1) then - write(iulog,*) ' pid, arr_old, i8_arr_level, arr_new:', & - mypid, scale(arr_frac,arr_shift), i8_arr_level, arr_remainder - endif ! 
while the remainder is non-zero, continue to shift, truncate, ! sum, and calculate new remainder do while ((arr_remainder .ne. 0.0_r8) & .and. (ilevel < max_levels(ifld))) ilevel = ilevel + 1 - if (verbose_reprosum .and. mypid <= 1) then - write(iulog,*) ' pid, ilevel, arr_old =', mypid, ilevel, arr_remainder - endif arr_remainder = scale(arr_remainder,arr_max_shift) i8_arr_level = int(arr_remainder,i8) i8_arr_tlsum_level(ilevel,ifld,ithread) = & i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level - if (verbose_reprosum .and. mypid <= 1) then - write(iulog,*) ' pid, arr_scaled, i8_arr_level:', & - mypid, arr_remainder, i8_arr_level - endif - arr_remainder = arr_remainder - i8_arr_level enddo ! arr_remainder /= 0.0_r8 @@ -1272,22 +1208,12 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & ! sum contributions from different threads do ifld=1,nflds - if (verbose_reprosum .and. mypid == 0) then -!! write(iulog,*) 'ifld =', ifld - endif - ioffset = offset(ifld) do ithread = 1,omp_nthreads do ilevel = 0,max_levels(ifld) i8_arr_lsum_level(ioffset+ilevel) = & i8_arr_lsum_level(ioffset+ilevel) & + i8_arr_tlsum_level(ilevel,ifld,ithread) - - if (verbose_reprosum) then -!! write(iulog,*) ' pid, ilevel, ioffset, i8_arr_lsum_level =', & -!! mypid, ilevel, i8_arr_lsum_level(ioffset+ilevel) - endif - enddo enddo enddo @@ -1322,22 +1248,6 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr) - if (verbose_reprosum .and. mypid < 5) then - do k = 1, size(i8_arr_lsum_level) - if (i8_arr_lsum_level(k) /= 0) then - write(iulog,*) 'pid, k, i8_arr_lsum_level =', mypid, k, i8_arr_lsum_level(k) - endif - enddo - endif - - if (verbose_reprosum .and. 
mypid == 0) then - do k = 1, size(i8_arr_gsum_level) - if (i8_arr_gsum_level(k) /= 0) then - write(iulog,*) 'pid, k, i8_arr_gsum_level =', mypid, k, i8_arr_gsum_level(k) - endif - enddo - endif - #ifdef TIMING call t_stopf("repro_sum_allr_i8") #endif @@ -1477,10 +1387,6 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8)) endif - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) ' ifld, arr_gsum =', ifld, arr_gsum(ifld) - endif - ! if validate is .true. and some precision lost, test whether 'too much' ! was lost, due to too loose an upper bound, too stringent a limit on number ! of levels of expansion, cancellation, .... Calculated by comparing lower @@ -1513,13 +1419,6 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & enddo - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) ' Done in cism_reprosum_int, pid =', mypid - k = 1 - write(iulog,*) ' arr_gsum, exp, frac =', arr_gsum(k), & - exponent(arr_gsum(k)), fraction(arr_gsum(k)) - endif - end subroutine cism_reprosum_int ! diff --git a/libglimmer/glimmer_utils.F90 b/libglimmer/glimmer_utils.F90 index c21b05c9..00a3859e 100644 --- a/libglimmer/glimmer_utils.F90 +++ b/libglimmer/glimmer_utils.F90 @@ -589,6 +589,91 @@ subroutine point_diag_real8_2d(& end subroutine point_diag_real8_2d +!-------------------------------------------------------------------------- + + subroutine double_to_binary(& + x, binary_str, binary_full, binary_sign, binary_exponent, binary_mantissa) + + ! Find the internal binary representation of a double-precision floating point number + ! Based on the IEEE-754 standard + + use glimmer_global, only: dp, i8 + implicit none + + real(dp), intent(in) :: x + character(len=64), intent(out) :: binary_str ! string representation of the binary number + + integer(i8), intent(out), optional :: binary_full ! 64 bits + integer, intent(out), optional :: binary_sign ! 
integer :: sign_bit
+ integer :: exponent_bits
+ integer(i8) :: mantissa_bits
+
+ logical :: verbose_binary = .false.
+
+ ! Transfer the double value into a 64-bit integer
+ binary_number = transfer(x, binary_number)
+
+ ! Get the sign bit (bit 64); use standard iand, not the vendor-extension integer .and.
+ sign_bit = int(iand(ishft(binary_number, -63), 1_i8))
+
+ ! Get the exponent bits (bits 63–53)
+ exponent_bits = int(iand(ishft(binary_number, -52), int(Z'7FF', i8)))
+
+ ! Extract mantissa (fraction) bits (bits 52–1); needs i8 to hold all 52 bits
+ mantissa_bits = iand(binary_number, int(Z'FFFFFFFFFFFFF', i8))
Turn a character array into a string + + character(len=*), intent(in) :: arr(:) + character(len=size(arr)) :: str + integer :: k + + do k = 1, size(arr) + str(k:k) = arr(k) + end do + + end function concat + !**************************************************************************** end module glimmer_utils diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index 6afb1d9f..bab339cd 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -348,6 +348,8 @@ module cism_parallel end interface interface parallel_is_zero + module procedure parallel_is_zero_integer_1d + module procedure parallel_is_zero_real8_1d module procedure parallel_is_zero_integer_2d module procedure parallel_is_zero_real8_2d module procedure parallel_is_zero_real8_3d @@ -703,9 +705,7 @@ subroutine gather_var_integer_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -724,9 +724,7 @@ subroutine gather_var_integer_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -811,9 +809,7 @@ subroutine gather_var_logical_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! 
minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -832,9 +828,7 @@ subroutine gather_var_logical_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -918,9 +912,7 @@ subroutine gather_var_real4_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -939,9 +931,7 @@ subroutine gather_var_real4_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -1026,9 +1016,7 @@ subroutine gather_var_real4_3d(values, global_values, parallel, ld1, ud1) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) if (present(ld1)) then d1l = ld1 else @@ -1062,9 +1050,7 @@ subroutine gather_var_real4_3d(values, global_values, parallel, ld1, ud1) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1,1)) ! This prevents a problem with NULL pointers later. 
allocate(displs(1)) allocate(recvcounts(1)) @@ -1151,9 +1137,7 @@ subroutine gather_var_real8_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -1172,9 +1156,7 @@ subroutine gather_var_real8_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -1211,7 +1193,7 @@ subroutine gather_var_real8_3d(values, global_values, parallel, ld1, ud1) use mpi_mod implicit none - real(dp),dimension(:,:,:),intent(in) :: values + real(dp),dimension(:,:,:),intent(in) :: values ! i and j are indices 2 and 3 real(dp),dimension(:,:,:),allocatable,intent(inout) :: global_values integer,optional,intent(in) :: ld1, ud1 type(parallel_type) :: parallel @@ -1234,7 +1216,7 @@ subroutine gather_var_real8_3d(values, global_values, parallel, ld1, ud1) global_maxval_nsub => parallel%global_maxval_nsub & ) - if (uhalo==0 .and. size(values,1)==local_ewn-1) then + if (uhalo==0 .and. size(values,2)==local_ewn-1) then ! 
Fixing this would require some generalization as is done for distributed_put_var write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" @@ -1260,9 +1242,7 @@ subroutine gather_var_real8_3d(values, global_values, parallel, ld1, ud1) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) if (present(ld1)) then d1l = ld1 else @@ -1296,9 +1276,7 @@ subroutine gather_var_real8_3d(values, global_values, parallel, ld1, ud1) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -6112,7 +6090,7 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d) integer, dimension(:,:,:), intent(in), optional :: mask_3d integer :: i, j, k - integer :: kmax + integer :: nz integer, dimension(size(a,1),parallel%local_ewn,parallel%local_nsn) :: mask integer :: local_sum integer :: parallel_global_sum_integer_3d @@ -6121,7 +6099,7 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d) local_ewn => parallel%local_ewn, & local_nsn => parallel%local_nsn) - kmax = size(a,1) + nz = size(a,1) if (present(mask_3d)) then mask = mask_3d @@ -6132,7 +6110,7 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d) local_sum = 0 do j = nhalo+1, local_nsn-nhalo do i = nhalo+1, local_ewn-nhalo - do k = 1, kmax + do k = 1, nz if (mask(k,i,j) == 1) then local_sum = local_sum + a(k,i,j) endif @@ -6160,9 +6138,23 @@ function parallel_global_sum_real8_2d(a, parallel, mask_2d) real(dp) :: local_sum real(dp) :: parallel_global_sum_real8_2d + ! variables for computing reproductible sums + integer :: nsummands, nflds ! 
dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + associate( & local_ewn => parallel%local_ewn, & - local_nsn => parallel%local_nsn) + local_nsn => parallel%local_nsn, & + own_ewn => parallel%own_ewn, & + own_nsn => parallel%own_nsn) + + !WHL - debug + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'In parallel_global_sum_real8_2d, reprosum =', parallel%reprosum +! write(iulog,*) 'nhalo, local ewn, local_nsn =', nhalo, local_ewn, local_nsn + endif if (present(mask_2d)) then mask = mask_2d @@ -6170,10 +6162,44 @@ function parallel_global_sum_real8_2d(a, parallel, mask_2d) mask = 1 endif - if (parallel%reprosum) then ! compute using cism_reprosum_calc + if (parallel%reprosum) then ! compute using parallel_reduce_reprosum - !TODO - Add the code here - call parallel_stop(__FILE__,__LINE__) + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = own_ewn*own_nsn + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + count = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + count = count + 1 + if (mask(i,j) == 1) then + arr(count,1) = a(i,j) + else + arr(count,1) = 0.0d0 + endif + enddo + enddo + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_real8_2d = arr_gsum(1) + + !WHL - debug + if (verbose_reprosum .and. main_task) then +!! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) else ! 
compute using parallel_reduce_sum (not reproducible) @@ -6207,16 +6233,24 @@ function parallel_global_sum_real8_3d(a, parallel, mask_2d) integer, dimension(:,:), intent(in), optional :: mask_2d integer :: i, j, k - integer :: kmax + integer :: nz integer, dimension(parallel%local_ewn,parallel%local_nsn) :: mask real(dp) :: local_sum real(dp) :: parallel_global_sum_real8_3d + ! variables for computing reproductible sums + integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + associate( & local_ewn => parallel%local_ewn, & - local_nsn => parallel%local_nsn) + local_nsn => parallel%local_nsn, & + own_ewn => parallel%own_ewn, & + own_nsn => parallel%own_nsn) - kmax = size(a,1) + nz = size(a,1) ! Note: The mask is 2D, since typically all layers in a column are either masked in or masked out if (present(mask_2d)) then @@ -6227,8 +6261,47 @@ function parallel_global_sum_real8_3d(a, parallel, mask_2d) if (parallel%reprosum) then ! compute using cism_reprosum_calc - !TODO - Add the code here - call parallel_stop(__FILE__,__LINE__) + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = own_ewn*own_nsn*nz + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + count = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + do k = 1, nz + count = count + 1 + arr(count,1) = a(k,i,j) + enddo + else + do k = 1, nz + count = count + 1 + arr(count,1) = 0.0d0 + enddo + endif + enddo + enddo + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_real8_3d = arr_gsum(1) + + !WHL - debug + if (verbose_reprosum .and. 
main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) else ! compute using parallel_reduce_sum (not reproducible) @@ -6236,7 +6309,7 @@ function parallel_global_sum_real8_3d(a, parallel, mask_2d) do j = nhalo+1, local_nsn-nhalo do i = nhalo+1, local_ewn-nhalo if (mask(i,j) == 1) then - do k = 1, kmax + do k = 1, nz local_sum = local_sum + a(k,i,j) enddo endif @@ -6311,22 +6384,73 @@ function parallel_global_sum_patch_real8_2d(a, npatch, patch_id, parallel) real(dp), dimension(npatch) :: local_patch_sum real(dp), dimension(npatch) :: parallel_global_sum_patch_real8_2d + ! variables for computing reproductible sums + integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + associate( & local_ewn => parallel%local_ewn, & - local_nsn => parallel%local_nsn) + local_nsn => parallel%local_nsn, & + own_ewn => parallel%own_ewn, & + own_nsn => parallel%own_nsn) - local_patch_sum = 0.0d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc - do j = nhalo+1, local_nsn-nhalo - do i = nhalo+1, local_ewn-nhalo - np = patch_id(i,j) - if (np > 0) then - local_patch_sum(np) = local_patch_sum(np) + a(i,j) - endif + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = own_ewn*own_nsn + nflds = npatch + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + count = 0 + arr(:,:) = 0.0d0 + + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + count = count + 1 + np = patch_id(i,j) + if (np > 0) then + arr(count,np) = a(i,j) + endif + enddo enddo - enddo - parallel_global_sum_patch_real8_2d = parallel_reduce_sum(local_patch_sum) + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! 
Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_patch_real8_2d(:) = arr_gsum(:) + + if (verbose_reprosum .and. main_task) then +!! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_patch_sum = 0.0d0 + + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + np = patch_id(i,j) + if (np > 0) then + local_patch_sum(np) = local_patch_sum(np) + a(i,j) + endif + enddo + enddo + + parallel_global_sum_patch_real8_2d = parallel_reduce_sum(local_patch_sum) + + endif ! reprosum end associate @@ -6352,6 +6476,13 @@ function parallel_global_sum_stagger_real8_2d(arr1, parallel, arr2) staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + ! variables for computing reproductible sums + integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + + !TODO - associate staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo @@ -6359,8 +6490,54 @@ function parallel_global_sum_stagger_real8_2d(arr1, parallel, arr2) if (parallel%reprosum) then ! compute using cism_reprosum_calc - !TODO - Add the code here - call parallel_stop(__FILE__,__LINE__) + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + if (present(arr2)) then ! compute global sum of arr1 + arr2 + + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,1) = arr1(i,j) + arr2(i,j) + enddo + enddo + + else ! 
compute global sum of arr1 + + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,1) = arr1(i,j) + enddo + enddo + + endif + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_2d = arr_gsum(1) + + !WHL - debug + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) else ! compute using parallel_reduce_sum (not reproducible) @@ -6405,10 +6582,17 @@ function parallel_global_sum_stagger_real8_3d(arr1, parallel, arr2) real(dp) :: local_sum real(dp) :: parallel_global_sum_stagger_real8_3d + ! variables for computing reproductible sums + integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + !TODO - Associate these variables (and not the ones above) staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo @@ -6418,8 +6602,58 @@ function parallel_global_sum_stagger_real8_3d(arr1, parallel, arr2) if (parallel%reprosum) then ! compute using cism_reprosum_calc - !TODO - Add the code here - call parallel_stop(__FILE__,__LINE__) + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) * nz + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + if (present(arr2)) then ! 
compute global sum of arr1 + arr2 + + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + count = count + 1 + arr(count,1) = arr1(k,i,j) + arr2(k,i,j) + enddo + enddo + enddo + + else ! compute global sum of arr1 + + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + count = count + 1 + arr(count,1) = arr1(k,i,j) + enddo + enddo + enddo + + endif + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_3d = arr_gsum(1) + + !WHL - debug + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) else ! compute using parallel_reduce_sum (not reproducible) @@ -6459,6 +6693,7 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) ! Sum one or two local arrays on the staggered grid, then take the global sum. ! The final index is equal to the number of independent fields to be summed. + !TODO - Don't have to pass in nflds, since it equals size(a,3)? real(dp), dimension(:,:,:), intent(in) :: arr1 @@ -6479,6 +6714,13 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + ! variables for computing reproductible sums +!! integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: nsummands ! 
dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo @@ -6486,8 +6728,54 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) if (parallel%reprosum) then ! compute using cism_reprosum_calc - !TODO - Add the code here - call parallel_stop(__FILE__,__LINE__) + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) +! nflds = size(a,3) + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + do n = 1, nflds + + if (present(arr2)) then ! compute global sum of arr1 + arr2 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,n) = arr1(i,j,n) + arr2(i,j,n) + enddo + enddo + else ! compute global sum of arr1 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,n) = arr1(i,j,n) + enddo + enddo + endif + + enddo ! nflds + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_2d_nflds = arr_gsum(:) + + !WHL - debug + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) else ! compute using parallel_reduce_sum (not reproducible) @@ -6513,10 +6801,10 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) enddo ! nflds - endif ! reprosum + ! 
take the global sum + parallel_global_sum_stagger_real8_2d_nflds = parallel_reduce_sum(local_sum(:)) - ! take the global sum - parallel_global_sum_stagger_real8_2d_nflds = parallel_reduce_sum(local_sum(:)) + endif ! reprosum end function parallel_global_sum_stagger_real8_2d_nflds @@ -6547,6 +6835,14 @@ function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + ! variables for computing reproductible sums +!! integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: nsummands ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + + !TODO - Associate these variables staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo @@ -6556,8 +6852,63 @@ function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) if (parallel%reprosum) then ! compute using cism_reprosum_calc - !TODO - Add the code here - call parallel_stop(__FILE__,__LINE__) + !WHL - debug + if (verbose_reprosum .and. main_task) then + write(iulog,*) ' In global_sum_stagger_real8_3d_nflds' + endif + + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) * nz +! nflds = size(a,3) + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + do n = 1, nflds + + if (present(arr2)) then ! compute global sum of arr1 + arr2 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + count = count + 1 + arr(count,n) = arr1(k,i,j,n) + arr2(k,i,j,n) + enddo + enddo + enddo + else ! 
compute global sum of arr1 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + count = count + 1 + arr(count,n) = arr1(k,i,j,n) + enddo + enddo + enddo + endif + + enddo ! nflds + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_3d_nflds = arr_gsum(:) + + !WHL - debug + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) else ! compute using parallel_reduce_sum (not reproducible) @@ -6597,6 +6948,48 @@ end function parallel_global_sum_stagger_real8_3d_nflds !======================================================================= ! functions belonging to the parallel_is_zero interface + function parallel_is_zero_integer_1d(a) + + ! returns .true. if the field has all zero values, else returns .false. + + integer, dimension(:), intent(in) :: a + logical :: parallel_is_zero_integer_1d + + integer :: maxval_a + + maxval_a = maxval(abs(a)) + maxval_a = parallel_reduce_max(maxval_a) + if (maxval_a > 0) then + parallel_is_zero_integer_1d = .false. + else + parallel_is_zero_integer_1d = .true. + endif + + end function parallel_is_zero_integer_1d + +!======================================================================= + + function parallel_is_zero_real8_1d(a) + + ! returns .true. if the field has all zero values, else returns .false. + + real(dp), dimension(:), intent(in) :: a + logical :: parallel_is_zero_real8_1d + + real(dp) :: maxval_a + + maxval_a = maxval(abs(a)) + maxval_a = parallel_reduce_max(maxval_a) + if (maxval_a > 0.0d0) then + parallel_is_zero_real8_1d = .false. + else + parallel_is_zero_real8_1d = .true. 
+ endif + + end function parallel_is_zero_real8_1d + +!======================================================================= + function parallel_is_zero_integer_2d(a) ! returns .true. if the field has all zero values, else returns .false. @@ -6604,7 +6997,7 @@ function parallel_is_zero_integer_2d(a) integer, dimension(:,:), intent(in) :: a logical :: parallel_is_zero_integer_2d - real(dp) :: maxval_a + integer :: maxval_a maxval_a = maxval(abs(a)) maxval_a = parallel_reduce_max(maxval_a) @@ -8905,7 +9298,6 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) ! Compute a reproducible global sum for a floating-point variable or array. ! Can be called from parallel_global_sum, parallel_global_sum_patch, or ! parallel_global_sum_stagger. - ! Still under construction implicit none @@ -8965,21 +9357,11 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) repro_sum_validate ! flag enabling/disabling testing that gmax and max_levels ! are accurate/sufficient. Default is enabled. - ! Set parameters and allocate arrays dsummands = size(arr,1) nflds = size(arr,2) nsummands = dsummands - !WHL - debug - if (verbose_reprosum .and. main_task) then - write(iulog,*) 'In parallel_reduce_reprosum' - write(iulog,*) 'dsummands, nflds:', dsummands, nflds - endif - if (verbose_reprosum) then -!! write(iulog,*) 'rank, arr:', this_rank, arr(:,:) - endif - allocate (arr_gbl_max(nflds)) allocate (arr_gbl_max_out(nflds)) allocate (arr_max_levels(nflds)) @@ -8990,12 +9372,6 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) ddpdd_sum = .false. repro_sum_validate = .true. -#ifdef CCSMCOUPLED - -!! call shr_reprosum_calc - -#else - ! The following subroutine is adapted from shr_reprosum_calc in CESM shared code. call cism_reprosum_calc(& @@ -9013,15 +9389,13 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) repro_sum_stats = repro_sum_stats, & rel_diff = rel_diff) -#endif - if (verbose_reprosum .and. 
main_task) then - write(iulog,*) 'arr_gbl_max_out =', arr_gbl_max_out - write(iulog,*) 'arr_max_levels_out =', arr_max_levels_out - write(iulog,*) 'gbl_max_nsummands_out =', gbl_max_nsummands_out - write(iulog,*) 'rel diff =', rel_diff(1,:) - write(iulog,*) 'abs diff =', rel_diff(2,:) - write(iulog,*) 'stats =', repro_sum_stats(:) +! write(iulog,*) 'arr_gbl_max_out =', arr_gbl_max_out +! write(iulog,*) 'arr_max_levels_out =', arr_max_levels_out +! write(iulog,*) 'gbl_max_nsummands_out =', gbl_max_nsummands_out +! write(iulog,*) 'rel diff =', rel_diff(1,:) +! write(iulog,*) 'abs diff =', rel_diff(2,:) +! write(iulog,*) 'stats =', repro_sum_stats(:) endif deallocate(arr_gbl_max, arr_gbl_max_out) diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index 72bab6f4..6c020822 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -85,7 +85,7 @@ module glissade !======================================================================= ! Note: There is no glissade_config subroutine; glide_config works for all dycores. - +! glide_config is called from cism_init_dycore before glissade_initialise. !======================================================================= subroutine glissade_initialise(model, evolve_ice) @@ -366,6 +366,32 @@ subroutine glissade_initialise(model, evolve_ice) itest = model%numerics%idiag_local jtest = model%numerics%jdiag_local + ! Check whether x0 and y0 were read in. If not, then compute them from x1 and y1. + if (parallel_is_zero(model%general%x1)) then + if (main_task) write(iulog,*) 'Warning: model%general%x1 = 0' + else +! if (main_task) write(iulog,*) 'x1_global:', model%general%x1_global(:) +! if (main_task) write(iulog,*) 'y1_global:', model%general%y1_global(:) +! if (main_task) write(iulog,*) 'x1:', model%general%x1(:) +! 
if (main_task) write(iulog,*) 'y1:', model%general%y1(:) + endif + + if (parallel_is_zero(model%general%x0)) then + if (main_task) write(iulog,*) 'Initialize x0' + do i = 1, model%general%ewn-1 + model%general%x0(i) = 0.5d0 * (model%general%x1(i) + model%general%x1(i+1)) + enddo + endif + if (main_task) write(iulog,*) 'x0:', model%general%x0(:) + + if (parallel_is_zero(model%general%y0)) then + if (main_task) write(iulog,*) 'Initialize y0' + do j = 1, model%general%nsn-1 + model%general%y0(j) = 0.5d0 * (model%general%y1(j) + model%general%y1(j+1)) + enddo + endif + if (main_task) write(iulog,*) 'y0:', model%general%y0(:) + ! Check that lat and lon fields were read in, if desired !TODO - Use the parallel_is_nonzero function instead, here and below if (model%options%read_lat_lon) then diff --git a/libglissade/glissade_basal_water.F90 b/libglissade/glissade_basal_water.F90 index 18b03292..593b33b2 100644 --- a/libglissade/glissade_basal_water.F90 +++ b/libglissade/glissade_basal_water.F90 @@ -34,12 +34,16 @@ module glissade_basal_water use glide_types use cism_parallel, only: main_task, this_rank, nhalo, parallel_type, parallel_halo + !WHL - debug + use glimmer_utils, only: double_to_binary + implicit none private public :: glissade_basal_water_init, glissade_calcbwat, glissade_bwat_flux_routing logical, parameter :: verbose_bwat = .false. +!! logical, parameter :: verbose_bwat = .true. contains @@ -535,6 +539,9 @@ subroutine route_basal_water(& character(len=100) :: message + !WHL - debug + character(len=64) :: binary_str + ! Allocate the sorted_ij array nlocal = parallel%own_ewn * parallel%own_nsn @@ -842,8 +849,14 @@ subroutine route_basal_water(& if (verbose_bwat .and. 
this_rank == rtest) then write(iulog,*) 'Total bwatflx at margin (m^3/s):', total_flux_margin + call double_to_binary(total_flux_margin, binary_str) + write(iulog,*) ' ', binary_str write(iulog,*) 'Total bwatflx_refreeze (m^3/s)=', total_flux_refreeze + call double_to_binary(total_flux_refreeze, binary_str) + write(iulog,*) ' ', binary_str write(iulog,*) 'Total bwatflx (m^3/s)=', total_flux_out + call double_to_binary(total_flux_out, binary_str) + write(iulog,*) ' ', binary_str write(iulog,*) 'Difference between output and input =', total_flux_out - total_flux_in endif diff --git a/libglissade/glissade_utils.F90 b/libglissade/glissade_utils.F90 index eb5b629f..0a959f1e 100644 --- a/libglissade/glissade_utils.F90 +++ b/libglissade/glissade_utils.F90 @@ -44,7 +44,12 @@ module glissade_utils glissade_basin_sum, glissade_basin_average, & glissade_usrf_to_thck, glissade_thck_to_usrf, & glissade_edge_fluxes, glissade_input_fluxes, & - glissade_rms_error + glissade_rms_error, write_array_to_file + + interface write_array_to_file + module procedure write_array_to_file_real8_2d + module procedure write_array_to_file_real8_3d + end interface contains @@ -945,10 +950,116 @@ subroutine glissade_input_fluxes(& end subroutine glissade_input_fluxes + + ! subroutines belonging to the write_array_to_file interface + subroutine write_array_to_file_real8_2d(arr, fileunit, filename, parallel) + + ! Copy the input array into a global array and write all values to an output file. + ! This can be useful for debugging, if we want to find differences between two fields + ! (e.g., in two different runs). + ! This version writes out 64-bit character strings corresponding to the binary representation + ! of each floating-point variable. This can be useful for BFB comparisons. + ! Sometimes, two floating-point variables appear to have the same values in base 10, + ! when the last few bits actually vary. 
+ !TODO - Allow either float or binary output + + use glimmer_utils, only: double_to_binary + use cism_parallel, only: gather_var + + real(dp), dimension(:,:), intent(in) :: arr + integer, intent(in) :: fileunit + character(len=*), intent(in) :: filename + type(parallel_type), intent(in) :: parallel + + integer :: i, j + character(len=64) :: binary_str + real(dp), dimension(:,:), allocatable :: arr_global + + call gather_var(arr, arr_global, parallel) + if (main_task) then + open(unit=fileunit, file=trim(filename), status='replace', position='append') + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + call double_to_binary(arr_global(i,j), binary_str) + write (fileunit, '(2i6,a4,a64)') i, j, ' ', binary_str + enddo + enddo + close(unit=fileunit) + deallocate(arr_global) + endif + + end subroutine write_array_to_file_real8_2d + + + subroutine write_array_to_file_real8_3d(arr, fileunit, filename, parallel, cycle_indices) + + ! Copy the input array into a global array and write all values to an output file. + ! This can be useful for debugging, if we want to find differences between two fields + ! (e.g., in two different runs). + ! This version writes out 64-bit character strings corresponding to the binary representation + ! of each floating-point variable. This can be useful for BFB comparisons. + ! Sometimes, two floating-point variables appear to have the same values in base 10, + ! when the last few bits actually vary. + !TODO - Allow either float or binary output + + use glimmer_utils, only: double_to_binary + use cism_parallel, only: gather_var + + real(dp), dimension(:,:,:), intent(in) :: arr ! first two indices are i and j + integer, intent(in) :: fileunit + character(len=*), intent(in) :: filename + type(parallel_type), intent(in) :: parallel + logical, intent(in), optional :: cycle_indices ! 
if true, then index 3->1, 1->2, 2->3 + + integer :: i, j, k, kmax + character(len=64) :: binary_str + real(dp), dimension(:,:,:), allocatable :: arr_global + real(dp), dimension(:,:,:), allocatable :: arr_cycle + logical :: cycle_ind + + if (present(cycle_indices)) then + cycle_ind = cycle_indices + else + cycle_ind = .false. + endif + + if (cycle_ind) then + allocate(arr_cycle(size(arr,3), size(arr,1), size(arr,2))) + kmax = size(arr,3) + do j = 1, size(arr,2) + do i = 1, size(arr,1) + do k = 1, kmax + arr_cycle(k,i,j) = arr(i,j,k) + enddo + enddo + enddo + call gather_var(arr_cycle, arr_global, parallel) + deallocate(arr_cycle) + else + kmax = size(arr,1) + call gather_var(arr, arr_global, parallel) + endif + + if (main_task) then + open(unit=fileunit, file=trim(filename), status='unknown') + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + do k = 1, kmax + call double_to_binary(arr_global(k,i,j), binary_str) + write (fileunit, '(3i6,a4,a64)') i, j, k, ' ', binary_str + enddo + enddo + enddo + close(unit=fileunit) + deallocate(arr_global) + endif + + end subroutine write_array_to_file_real8_3d + !**************************************************************************** !TODO - Other utility subroutines to add here? -! E.g., tridiag; calclsrf; subroutines to zero out tracers +! E.g., calclsrf; subroutines to zero out tracers !**************************************************************************** diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 6ea89c0a..3b3ec31b 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -56,11 +56,11 @@ module glissade_velo_higher - use glimmer_global, only: dp + use glimmer_global, only: dp, i8 use glimmer_physcon, only: n_glen, rhoi, rhoo, grav, scyr, pi - use glimmer_paramets, only: iulog, eps08, eps10, eps11 + use glimmer_paramets, only: iulog, eps11, eps10 use glimmer_paramets, only: velo_scale, len_scale ! 
used for whichefvs = HO_EFVS_FLOWFACT - use glimmer_utils, only: point_diag + use glimmer_utils, only: point_diag, double_to_binary use glimmer_log use glimmer_sparse_type use glimmer_sparse @@ -92,7 +92,9 @@ module glissade_velo_higher use cism_parallel, only: this_rank, main_task, nhalo, tasks, & parallel_type, parallel_halo, staggered_parallel_halo, parallel_globalindex, & parallel_reduce_max, parallel_reduce_sum, not_parallel - + !WHL - debug + use cism_parallel, only: parallel_global_sum_stagger + use cism_reprosum_mod, only: verbose_reprosum implicit none private @@ -225,8 +227,8 @@ module glissade_velo_higher ! logical :: verbose_bfric = .true. logical :: verbose_trilinos = .false. ! logical :: verbose_trilinos = .true. - logical :: verbose_beta = .false. -! logical :: verbose_beta = .true. +! logical :: verbose_beta = .false. + logical :: verbose_beta = .true. logical :: verbose_efvs = .false. ! logical :: verbose_efvs = .true. logical :: verbose_tau = .false. @@ -261,6 +263,10 @@ module glissade_velo_higher dphi_dyr_3d_vav, &! vertical avg of dphi_dyr_3d dphi_dzr_3d_vav ! vertical avg of dphi_dzr_3d + !WHL - debug for reprosum + character(len=64) :: binary_str ! string representation of binary number (chain of 0's and 1's) + character(len=64) :: binary_str1, binary_str2 + contains !**************************************************************************** @@ -671,6 +677,9 @@ subroutine glissade_velo_higher_solve(model, & use glide_thck, only: glide_calclsrf use profile, only: t_startf, t_stopf + !WHL - debug + use glissade_utils, only: write_array_to_file + !---------------------------------------------------------------- ! Input-output arguments !---------------------------------------------------------------- @@ -1068,6 +1077,15 @@ subroutine glissade_velo_higher_solve(model, & integer :: itest, jtest ! coordinates of diagnostic point integer :: rtest ! 
task number for processor containing diagnostic point + !WHL - debug + real(dp), dimension(nNodeNeighbors_2d) :: & + sum_Auu, sum_Auv, sum_Avu, sum_Avv + real(dp) :: sum_uvel, sum_vvel + real(dp) :: sum_bu, sum_bv + real(dp) :: sum_flwa, sum_flwafact, sum_btrx, sum_btry, sum_stagusrf, sum_stagthck + real(dp) :: sum_betax, sum_betay, sum_omega, sum_stag_omega + real(dp), dimension(:,:), allocatable :: arr_global ! temporary global array + call t_startf('glissade_vhs_init') rtest = -999 itest = 1 @@ -1587,19 +1605,21 @@ subroutine glissade_velo_higher_solve(model, & ! unique local ID to each such node. !------------------------------------------------------------------------------ -!pw call t_startf('glissade_get_vertex_geom') - call get_vertex_geometry(nx, ny, & - nz, nhalo, & - parallel, & - dx, dy, & - itest, jtest, rtest, & - ice_mask, & - xVertex, yVertex, & - active_cell, active_vertex, & - nNodesSolve, nVerticesSolve, & - nodeID, vertexID, & - iNodeIndex, jNodeIndex, kNodeIndex, & - iVertexIndex, jVertexIndex) + !pw call t_startf('glissade_get_vertex_geom') + call get_vertex_geometry(& + nx, ny, & + nz, nhalo, & + parallel, & + model%general%x0, model%general%y0, & + dx, dy, & + itest, jtest, rtest, & + ice_mask, & + xVertex, yVertex, & + active_cell, active_vertex, & + nNodesSolve, nVerticesSolve, & + nodeID, vertexID, & + iNodeIndex, jNodeIndex, kNodeIndex, & + iVertexIndex, jVertexIndex) !pw call t_stopf('glissade_get_vertex_geom') ! Zero out the velocity for inactive vertices @@ -2302,6 +2322,10 @@ subroutine glissade_velo_higher_solve(model, & whichbeta_limit, & itest = itest, jtest = jtest, rtest = rtest) + if (verbose_basal) then + call point_diag(beta_internal, 'After calcbeta, beta', itest, jtest, rtest, 7, 7, '(f12.0)') + endif + ! if (verbose_beta) then ! maxbeta = maxval(beta_internal(:,:)) ! 
maxbeta = parallel_reduce_max(maxbeta) @@ -2329,6 +2353,36 @@ subroutine glissade_velo_higher_solve(model, & usav_2d(:,:) = uvel_2d(:,:) vsav_2d(:,:) = vvel_2d(:,:) + !WHL - debug - BFB check + sum_uvel = parallel_global_sum_stagger(uvel_2d, parallel) + sum_vvel = parallel_global_sum_stagger(vvel_2d, parallel) + sum_flwa = parallel_global_sum_stagger(flwa, parallel) + sum_flwafact = parallel_global_sum_stagger(flwafact, parallel) + sum_btrx = parallel_global_sum_stagger(btractx, parallel) + sum_btry = parallel_global_sum_stagger(btracty, parallel) + sum_stagusrf = parallel_global_sum_stagger(stagusrf, parallel) + sum_stagthck = parallel_global_sum_stagger(stagthck, parallel) +!! if (this_rank == rtest) then + if (0 == 1) then + write(iulog,*) ' ' + call double_to_binary(sum_uvel, binary_str) + write(iulog,*) 'Before assembly: sum_uvel, binary_str:', sum_uvel, binary_str + call double_to_binary(sum_vvel, binary_str) + write(iulog,*) 'Before assembly: sum_vvel, binary_str:', sum_vvel, binary_str + call double_to_binary(sum_flwa, binary_str) + write(iulog,*) 'Before assembly: sum_flwa, binary_str:', sum_flwa, binary_str + call double_to_binary(sum_flwafact, binary_str) + write(iulog,*) 'Before assembly: sum_flwafact, binary_str:', sum_flwafact, binary_str + call double_to_binary(sum_btrx, binary_str) + write(iulog,*) 'Before assembly: sum_btrx, binary_str:', sum_btrx, binary_str + call double_to_binary(sum_btry, binary_str) + write(iulog,*) 'Before assembly: sum_btry, binary_str:', sum_btry, binary_str + call double_to_binary(sum_stagusrf, binary_str) + write(iulog,*) 'Before assembly: sum_stagusrf, binary_str:', sum_stagusrf, binary_str + call double_to_binary(sum_stagthck, binary_str) + write(iulog,*) 'Before assembly: sum_stagthck, binary_str:', sum_stagthck, binary_str + endif + ! 
Assemble the matrix call assemble_stiffness_matrix_2d(nx, ny, & @@ -2353,6 +2407,53 @@ subroutine glissade_velo_higher_solve(model, & omega_k, omega, & efvs_qp_3d) + !WHL - debug - BFB check + if (0 == 1) then +!! if (verbose_reprosum .and. counter == 1) then + if (main_task) write(iulog,*) 'Write out matrices after assemble_stiffness_matrix' + call write_array_to_file(Auu_2d, 21, 'global_Auu1', parallel, cycle_indices = .true.) + call write_array_to_file(Auv_2d, 22, 'global_Auv1', parallel, cycle_indices = .true.) + call write_array_to_file(Avu_2d, 23, 'global_Avu1', parallel, cycle_indices = .true.) + call write_array_to_file(Avv_2d, 24, 'global_Avv1', parallel, cycle_indices = .true.) + call write_array_to_file(bu_2d, 25, 'global_bu1', parallel) + call write_array_to_file(bv_2d, 26, 'global_bv1', parallel) + endif + + !WHL - debug - BFB check +!! if (0 == 1) then + if (verbose_reprosum) then + sum_Auu(:) = parallel_global_sum_stagger(Auu_2d, nNodeNeighbors_2d, parallel) + sum_Auv(:) = parallel_global_sum_stagger(Auv_2d, nNodeNeighbors_2d, parallel) + sum_Avu(:) = parallel_global_sum_stagger(Avu_2d, nNodeNeighbors_2d, parallel) + sum_Avv(:) = parallel_global_sum_stagger(Avv_2d, nNodeNeighbors_2d, parallel) + if (main_task) then + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auu(n), binary_str) + write(iulog,*) n, sum_Auu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auv(n), binary_str) + write(iulog,*) n, sum_Auv(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avu(n), binary_str) + write(iulog,*) n, sum_Avu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avv(n), binary_str:' + do n = 1, nNodeNeighbors_2d 
+ call double_to_binary(sum_Avv(n), binary_str) + write(iulog,*) n, sum_Avv(n), binary_str + enddo + endif + endif ! verbose_reprosum + if (whichapprox == HO_APPROX_DIVA) then ! Halo update for omega @@ -2411,8 +2512,8 @@ subroutine glissade_velo_higher_solve(model, & beta_eff_x(:,:) = 0.d0 beta_eff_y(:,:) = 0.d0 - !Note: The 'if' is not strictly needed, since the corrected beta_eff is equal - ! to the uncorrected beta_eff whe slope_factor = 1.0 and theta_slope = 0.0 + !Note: The 'if diva_slope_correction' is not strictly needed, since the more complicated + ! equations reduce to the simpler ones when slope_factor = 1.0 and theta_slope = 0.0. if (diva_slope_correction) then ! compute a larger beta_eff at each vertex based on the slope if (whichbabc == HO_BABC_NO_SLIP) then @@ -2435,8 +2536,8 @@ subroutine glissade_velo_higher_solve(model, & beta_eff_y = 1.d0 / stag_omega endwhere else ! slip allowed at bed - beta_eff_x = beta_internal(:,:) / (1.d0 + beta_internal*stag_omega) - beta_eff_y = beta_internal(:,:) / (1.d0 + beta_internal*stag_omega) + beta_eff_x = beta_internal / (1.d0 + beta_internal*stag_omega) + beta_eff_y = beta_internal / (1.d0 + beta_internal*stag_omega) endif endif @@ -2454,6 +2555,36 @@ subroutine glissade_velo_higher_solve(model, & call point_diag(beta_eff_y, 'beta_eff_y', itest, jtest, rtest, 7, 7, '(e10.3)') endif + if (verbose_reprosum) then + sum_omega = parallel_global_sum_stagger(omega, parallel) + sum_stag_omega = parallel_global_sum_stagger(stag_omega, parallel) + sum_betax = parallel_global_sum_stagger(beta_eff_x, parallel) + sum_betay = parallel_global_sum_stagger(beta_eff_y, parallel) + if (main_task) then + call double_to_binary(sum_omega, binary_str) + write(iulog,*) 'Before bc_2d: sum_omega, binary_str:', sum_omega, binary_str + call double_to_binary(sum_stag_omega, binary_str) + write(iulog,*) ' sum_stag_omega, binary_str:', sum_stag_omega, binary_str + call double_to_binary(sum_betax, binary_str) + write(iulog,*) ' sum_betax, 
binary_str:', sum_betax, binary_str + call double_to_binary(sum_betay, binary_str) + write(iulog,*) ' sum_betay, binary_str:', sum_betay, binary_str + endif + if (this_rank == rtest) then + do j = jtest-2, jtest+2 + do i = itest-2, itest+2 + if (i >= staggered_ilo .and. i <= staggered_ihi .and. & + j >= staggered_jlo .and. j <= staggered_jhi) then + call double_to_binary(stag_omega(i,j), binary_str) + write(iulog,*) 'i, j, stag_omega:', i, j, binary_str + call double_to_binary(beta_eff_x(i,j), binary_str) + write(iulog,*) ' beta_eff_x:', i, j, binary_str + endif + enddo + enddo + endif + endif ! verbose_reprosum + if (diva_slope_correction) then ! Incorporate basal sliding boundary conditions with basal curvature, @@ -2568,6 +2699,18 @@ subroutine glissade_velo_higher_solve(model, & call staggered_parallel_halo(bv_2d(:,:), parallel) call t_stopf('glissade_halo_bxxs') + !WHL - debug - Write all the matrix elements and rhs elements (in binary form) to files +!! if (verbose_reprosum .and. counter == 1) then + if (0 == 1) then + if (main_task) write(iulog,*) 'Write out matrices after adding BC' + call write_array_to_file(Auu_2d, 21, 'global_Auu3', parallel, cycle_indices = .true.) + call write_array_to_file(Auv_2d, 22, 'global_Auv3', parallel, cycle_indices = .true.) + call write_array_to_file(Avu_2d, 23, 'global_Avu3', parallel, cycle_indices = .true.) + call write_array_to_file(Avv_2d, 24, 'global_Avv3', parallel, cycle_indices = .true.) + call write_array_to_file(bu_2d, 25, 'global_bu3', parallel) + call write_array_to_file(bv_2d, 26, 'global_bv3', parallel) + endif + !--------------------------------------------------------------------------- ! Check symmetry of assembled matrix ! 
@@ -2597,6 +2740,41 @@ subroutine glissade_velo_higher_solve(model, & active_vertex, & nNonzeros) + if (verbose_reprosum) then + sum_Auu(:) = parallel_global_sum_stagger(Auu_2d, nNodeNeighbors_2d, parallel) + sum_Auv(:) = parallel_global_sum_stagger(Auv_2d, nNodeNeighbors_2d, parallel) + sum_Avu(:) = parallel_global_sum_stagger(Avu_2d, nNodeNeighbors_2d, parallel) + sum_Avv(:) = parallel_global_sum_stagger(Avv_2d, nNodeNeighbors_2d, parallel) + sum_bu = parallel_global_sum_stagger(bu_2d, parallel) + sum_bv = parallel_global_sum_stagger(bv_2d, parallel) + if (main_task) then + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auu(n), binary_str) + write(iulog,*) n, sum_Auu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auv(n), binary_str) + write(iulog,*) n, sum_Auv(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avu(n), binary_str) + write(iulog,*) n, sum_Avu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avv(n), binary_str) + write(iulog,*) n, sum_Avv(n), binary_str + enddo + endif + endif ! verbose_reprosum + if (write_matrix) then if (counter == 1) then ! first outer iteration only call t_startf('glissade_wrt_mat') @@ -2610,7 +2788,7 @@ subroutine glissade_velo_higher_solve(model, & endif endif ! write_matrix - if (verbose_matrix .and. this_rank==rtest) then + if (verbose_matrix .and. 
main_task) then i = itest j = jtest write(iulog,*) ' ' @@ -3886,6 +4064,7 @@ end subroutine glissade_velo_higher_scale_output subroutine get_vertex_geometry(nx, ny, & nz, nhalo, & parallel, & + x0, y0, & dx, dy, & itest, jtest, rtest, & ice_mask, & @@ -3916,11 +4095,17 @@ subroutine get_vertex_geometry(nx, ny, & nhalo ! number of halo layers type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + parallel ! info for parallel communication + + real(dp), dimension(nx-1), intent(in) :: & + x0 ! x coordinates of vertices + + real(dp), dimension(ny-1), intent(in) :: & + y0 ! y coordinates of vertices real(dp), intent(in) :: & - dx, dy ! grid cell length and width (m) - ! assumed to have the same value for each grid cell + dx, dy ! grid cell length and width (m) + ! assumed to have the same value for each grid cell integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point @@ -3976,13 +4161,11 @@ subroutine get_vertex_geometry(nx, ny, & ! By convention, vertex (i,j) lies at the NE corner of cell(i,j). !---------------------------------------------------------------- - xVertex(:,:) = 0.d0 - yVertex(:,:) = 0.d0 do j = 1, ny-1 - do i = 1, nx-1 - xVertex(i,j) = dx * i - yVertex(i,j) = dy * j - enddo + do i = 1, nx-1 + xVertex(i,j) = x0(i) + yVertex(i,j) = y0(j) + enddo enddo ! Identify the active cells. @@ -4254,7 +4437,8 @@ subroutine load_vector_gravity(nx, ny, & loadv(kNode,iNode,jNode) = loadv(kNode,iNode,jNode) - & rhoi*grav * wqp_3d(p) * detJ/vol0 * dsdy_qp * phi_3d(n,p) - if (verbose_load .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest .and. p==ptest) then + if (verbose_load .and. this_rank==rtest .and. & + i==itest .and. j==jtest .and. k==ktest .and. p==ptest) then write(iulog,*) ' ' write(iulog,*) 'n, phi_3d(n), delta(loadu), delta(loadv):', n, phi_3d(n,p), & rhoi*grav*wqp_3d(p)*detJ/vol0 * dsdx_qp * phi_3d(n,p), & @@ -5122,7 +5306,7 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & thck ! 
ice thickness (m) real(dp), dimension(nx-1,ny-1), intent(in), optional :: & - btractx, btracty ! components of basal traction (Pa) + btractx, btracty ! components of basal traction (Pa) real(dp), dimension(nz,nx,ny), intent(out), optional :: & omega_k ! single integral, defined by Goldberg (2011) eq. 32 @@ -5881,6 +6065,11 @@ subroutine compute_3d_velocity_L1L2(nx, ny, & uvel(1:nz-1,:,:) = 0.d0 vvel(1:nz-1,:,:) = 0.d0 + !WHL - debug + call point_diag(xVertex, 'xVertex', itest, jtest, rtest, 7, 7) + call point_diag(yVertex, 'yVertex', itest, jtest, rtest, 7, 7) + call point_diag(active_cell, 'active_cell', itest, jtest, rtest, 7, 7) + ! Compute viscosity integral and strain rates in elements. ! Loop over all cells that border locally owned vertices. @@ -6280,9 +6469,9 @@ subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, prod = matmul(Jac,Jinv) do col = 1, 3 do row = 1, 3 - if (abs(prod(row,col) - identity3(row,col)) > 1.d-11) then - write(iulog,*) 'stopping, Jac * Jinv /= identity' - write(iulog,*) 'i, j, k, p:', i, j, k, p + if (abs(prod(row,col) - identity3(row,col)) > eps10) then + write(iulog,*) '3d Jacobian, stopping, Jac * Jinv /= identity' + write(iulog,*) 'rank, i, j, k, p:', this_rank, i, j, k, p write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) @@ -6478,8 +6667,8 @@ subroutine get_basis_function_derivatives_2d(xNode, yNode, & do col = 1, 2 do row = 1, 2 if (abs(prod(row,col) - identity3(row,col)) > 1.d-12) then - write(iulog,*) 'stopping, Jac * Jinv /= identity' - write(iulog,*) 'i, j, p:', i, j, p + write(iulog,*) '2d Jacobian, stopping, Jac * Jinv /= identity' + write(iulog,*) 'rank, i, j, p:', this_rank, i, j, p write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) @@ -6936,7 +7125,7 @@ subroutine compute_internal_stress (nx, ny, & integer :: i, j, k, n, p integer :: iNode, jNode, kNode - + ! 
initialize stresses tau_xz (:,:,:) = 0.d0 tau_yz (:,:,:) = 0.d0 @@ -7545,7 +7734,7 @@ subroutine compute_effective_viscosity_diva(whichefvs, efvs, & itest, jtest, rtest, & i, j, p ) - + ! Compute the effective viscosity at each layer of an ice column corresponding ! to a particular quadrature point, based on the depth-integrated formulation. ! See Goldberg(2011) for details. @@ -7917,7 +8106,7 @@ subroutine compute_element_matrix(whichapprox, nNodesPerElement, & (2.d0*dphi_dy(nr)*dphi_dx(nc) + dphi_dx(nr)*dphi_dy(nc)) Kvv(nr,nc) = Kvv(nr,nc) + efvs_factor * & - ( 4.d0*dphi_dy(nr)*dphi_dy(nc) + dphi_dx(nr)*dphi_dx(nc) & + ( 4.d0*dphi_dy(nr)*dphi_dy(nc) + dphi_dx(nr)*dphi_dx(nc) & + dphi_dz(nr)*dphi_dz(nc) ) enddo ! nr (rows) @@ -8050,7 +8239,7 @@ subroutine element_to_global_matrix_2d(nx, ny, & do nc = 1, nNodesPerElement_2d ! columns of K ! Determine column of A to be incremented - iA = ishift(nr,nc) ! similarly for i and j indices + iA = ishift(nr,nc) ! similarly for i and j indices jA = jshift(nr,nc) ! these indices can take values -1, 0 and 1 m = indxA_2d(iA,jA) @@ -8063,6 +8252,8 @@ subroutine element_to_global_matrix_2d(nx, ny, & if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest) then write(iulog,*) 'Increment Auu, element i, j, nr, nc =', iElement, jElement, nr, nc write(iulog,*) ' i, j, m, Kuu, new Auu:', i, j, m, Kuu(nr,nc), Auu(i,j,m) +!! write(iulog,*) 'Increment Avv, element i, j, nr, nc =', iElement, jElement, nr, nc +!! write(iulog,*) ' i, j, m, Kvv, new Avv:', i, j, m, Kvv(nr,nc), Avv(i,j,m) endif enddo ! nc @@ -8080,7 +8271,7 @@ end subroutine element_to_global_matrix_2d ! Set diva_slope_correction = F to reproduce older results. !TODO - Call this subroutine for both 2D and 3D solvers. - ! First need to switch the index order for 3D matrices. + ! First would need to switch the index order for 3D matrices. 
subroutine basal_sliding_bc_2d(nx, ny, & nNeighbors, nhalo, & parallel, & @@ -8093,7 +8284,6 @@ subroutine basal_sliding_bc_2d(nx, ny, & whichassemble_beta, & Auu, Avv) - !------------------------------------------------------------------------ ! Increment the Auu and Avv matrices with basal traction terms. ! Do a surface integral over all basal faces that contain at least one node with a stress BC. @@ -8166,6 +8356,7 @@ subroutine basal_sliding_bc_2d(nx, ny, & dphi_dx_2d, dphi_dy_2d, dphi_dz_2d ! derivatives of basis functions, evaluated at quad pts real(dp) :: & + increment, & ! incremental change in matrix element beta_qp, & ! beta evaluated at quadrature point detJ ! determinant of Jacobian for the transformation ! between the reference element and true element @@ -8173,24 +8364,23 @@ subroutine basal_sliding_bc_2d(nx, ny, & real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) :: & Kuu, Kvv ! components of element matrix associated with basal sliding - if (verbose_basal) then - call point_diag(beta, 'beta', itest, jtest, rtest, 7, 7, '(f10.0)') - endif - if (whichassemble_beta == HO_ASSEMBLE_BETA_LOCAL) then - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(0,0,0) - else ! 2D problem - m = indxA_2d(0,0) - endif - + m = indxA_2d(0,0) ! Sum over active vertices + !WHL, 12/13/25: The following minor change makes the results independent of processor count. + ! Without the commented code, the 1-core result differs from the 4-core result + ! for some cells near the 4-core processor boundary (i = 211 in a GrIS run). + ! The difference is in the last of 64 bits. + ! I don't know why the original results differ or why the change fixes it. do j = 1, ny-1 do i = 1, nx-1 if (active_vertex(i,j)) then - Auu(i,j,m) = Auu(i,j,m) + dx*dy/vol0 * beta(i,j) - Avv(i,j,m) = Avv(i,j,m) + dx*dy/vol0 * beta(i,j) +! Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) +! 
Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) + increment = (dx*dy/vol0) * beta(i,j) + Auu(i,j,m) = Auu(i,j,m) + increment + Avv(i,j,m) = Avv(i,j,m) + increment endif ! active_vertex enddo ! i enddo ! j @@ -8331,20 +8521,6 @@ subroutine basal_sliding_bc_2d(nx, ny, & endif ! whichassemble_beta - if (verbose_basal .and. this_rank==rtest) then - i = itest - j = jtest - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(0,0,0) - else - m = indxA_2d(0,0) - endif - write(iulog,*) ' ' - write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m - write(iulog,*) 'New Auu diagonal:', Auu(i,j,m) - write(iulog,*) 'New Avv diagonal:', Avv(i,j,m) - endif - end subroutine basal_sliding_bc_2d !**************************************************************************** @@ -10407,7 +10583,7 @@ subroutine check_symmetry_element_matrix(nNodesPerElement, & do j = 1, nNodesPerElement do i = j, nNodesPerElement - if (abs(Kuu(i,j) - Kuu(j,i)) > eps10) then + if (abs(Kuu(i,j) - Kuu(j,i)) > eps11) then write(iulog,*) 'Kuu is not symmetric' write(iulog,*) 'i, j, Kuu(i,j), Kuu(j,i):', i, j, Kuu(i,j), Kuu(j,i) stop @@ -10419,7 +10595,7 @@ subroutine check_symmetry_element_matrix(nNodesPerElement, & do j = 1, nNodesPerElement do i = j, nNodesPerElement - if (abs(Kvv(i,j) - Kvv(j,i)) > eps10) then + if (abs(Kvv(i,j) - Kvv(j,i)) > eps11) then write(iulog,*) 'Kvv is not symmetric' write(iulog,*) 'i, j, Kvv(i,j), Kvv(j,i):', i, j, Kvv(i,j), Kvv(j,i) stop @@ -10431,7 +10607,7 @@ subroutine check_symmetry_element_matrix(nNodesPerElement, & do j = 1, nNodesPerElement do i = 1, nNodesPerElement - if (abs(Kuv(i,j) - Kvu(j,i)) > eps10) then + if (abs(Kuv(i,j) - Kvu(j,i)) > eps11) then write(iulog,*) 'Kuv /= (Kvu)^T' write(iulog,*) 'i, j, Kuv(i,j), Kvu(j,i):', i, j, Kuv(i,j), Kvu(j,i) stop @@ -10483,12 +10659,14 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & real(dp) :: val1, val2 ! 
values of matrix coefficients - real(dp) :: maxdiff, diag_entry, avg_val + real(dp) :: maxdiff, global_maxdiff, diag_entry, avg_val integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + integer :: rmax, imax, jmax, kmax, mmax + staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo @@ -10503,6 +10681,7 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & ! and/or aborts. maxdiff = 0.d0 + rmax = 0; imax = 0; jmax = 0; kmax = 0; mmax = 0 ! Loop over locally owned vertices. ! Each active vertex is associate with 2*nz matrix rows belonging to this processor. @@ -10541,12 +10720,15 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Auu( m, k, i, j ) = avg_val Auu(mm, k+kA,i+iA,j+jA) = avg_val @@ -10568,12 +10750,15 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! 
else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Auv( m, k, i, j ) = avg_val Avu(mm, k+kA,i+iA,j+jA) = avg_val @@ -10623,12 +10808,15 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Avv( m, k, i, j ) = avg_val Avv(mm, k+kA,i+iA,j+jA) = avg_val @@ -10648,14 +10836,17 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & val1 = Avu( m, k, i, j) ! value of Avu(row,col) val2 = Auv(mm, k+kA, i+iA, j+jA) ! value of Auv(col,row) - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif if (val2 /= val1) then ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Avu( m, k, i, j ) = avg_val Auv(mm, k+kA,i+iA,j+jA) = avg_val @@ -10681,11 +10872,14 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & enddo ! i enddo ! j - if (verbose_matrix) maxdiff = parallel_reduce_max(maxdiff) - - if (verbose_matrix .and. main_task) then - write(iulog,*) ' ' - write(iulog,*) 'Max difference from symmetry =', maxdiff + if (verbose_matrix) then + global_maxdiff = parallel_reduce_max(maxdiff) + if (maxdiff == global_maxdiff) then + ! 
maxdiff is on this processor; compute and broadcast the global index + call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) + write(iulog,*) 'Max asymmetry =', global_maxdiff + write(iulog,*) ' i, j, ig, jg, k, m =', imax, jmax, iglobal, jglobal, kmax, mmax + endif endif end subroutine check_symmetry_assembled_matrix_3d @@ -10730,7 +10924,9 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & real(dp) :: val1, val2 ! values of matrix coefficients - real(dp) :: maxdiff, diag_entry, avg_val + real(dp) :: maxdiff, global_maxdiff, diag_entry, avg_val + + integer :: rmax, imax, jmax, mmax integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -10750,6 +10946,7 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & ! and/or aborts. maxdiff = 0.d0 + rmax = 0; imax = 0; jmax = 0; mmax = 0 ! Loop over locally owned vertices. ! Each active vertex is associate with 2*nz matrix rows belonging to this processor. @@ -10778,13 +10975,18 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & val1 = Auu(i, j, m ) ! value of Auu(row,col) val2 = Auu(i+iA, j+jA, mm) ! value of Auu(col,row) if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values + !WHL - Here and below, I commented out the code to average asymmetric values. + ! The hope is that the asymmetries are too small to matter. ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Auu(i, j, m ) = avg_val - Auu(i+iA, j+jA, mm) = avg_val + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Auu(i, j, m ) = avg_val +! 
Auu(i+iA, j+jA, mm) = avg_val else write(iulog,*) 'WARNING: Auu is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA write(iulog,*) 'Auu(row,col), Auu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry @@ -10798,13 +11000,16 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & val1 = Auv(i, j, m ) ! value of Auv(row,col) val2 = Avu(i+iA, j+jA, mm) ! value of Avu(col,row) if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Auv(i, j, m ) = avg_val - Avu(i+iA, j+jA, mm) = avg_val + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Auv(i, j, m ) = avg_val +! Avu(i+iA, j+jA, mm) = avg_val else write(iulog,*) 'WARNING: Auv is not equal to (Avu)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA write(iulog,*) 'Auv(row,col), Avu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry @@ -10822,14 +11027,16 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & val2 = Avv(i+iA, j+jA, mm) ! value of Avv(col,row) if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) - + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Avv(i, j, m ) = avg_val - Avv(i+iA, j+jA, mm) = avg_val + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Avv(i, j, m ) = avg_val +! 
Avv(i+iA, j+jA, mm) = avg_val else write(iulog,*) 'WARNING: Avv is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA write(iulog,*) 'Avv(row,col), Avv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry @@ -10844,16 +11051,17 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & val1 = Avu(i, j, m ) ! value of Avu(row,col) val2 = Auv(i+iA, j+jA, mm) ! value of Auv(col,row) - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) - if (val2 /= val1) then - + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Avu(i, j, m ) = avg_val - Auv(i+iA, j+jA, mm) = avg_val + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Avu(i, j, m ) = avg_val +! Auv(i+iA, j+jA, mm) = avg_val else write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry @@ -10870,11 +11078,14 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & enddo ! iA enddo ! jA - if (verbose_matrix) maxdiff = parallel_reduce_max(maxdiff) - - if (verbose_matrix .and. main_task) then - write(iulog,*) ' ' - write(iulog,*) 'Max difference from symmetry =', maxdiff + if (verbose_matrix) then + global_maxdiff = parallel_reduce_max(maxdiff) + if (maxdiff == global_maxdiff) then + ! 
maxdiff is on this processor; compute and broadcast the global index + call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) + write(iulog,*) 'Max asymmetry =', global_maxdiff + write(iulog,*) ' ig, jg, m =', iglobal, jglobal, mmax + endif endif end subroutine check_symmetry_assembled_matrix_2d diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 8d31e098..6d162af7 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -531,7 +531,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & if (niters == maxiters) then if (verbose_pcg .and. main_task) then - write(iulog,*) 'Glissade PCG solver not converged' + write(iulog,*) 'Glissade PCG solver not yet converged' write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance endif endif @@ -746,7 +746,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & iter_loop: do iter = 1, maxiters if (verbose_pcg .and. main_task) then - write(iulog,*) 'iter =', iter +! write(iulog,*) 'iter =', iter endif call t_startf("pcg_precond") @@ -933,7 +933,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & if (niters == maxiters) then if (verbose_pcg .and. main_task) then - write(iulog,*) 'Glissade PCG solver not converged' + write(iulog,*) 'Glissade PCG solver not yet converged' write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance endif endif @@ -1233,12 +1233,6 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ilocal = staggered_ihi - staggered_ilo + 1 jlocal = staggered_jhi - staggered_jlo + 1 - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) 'stag_ihi, stag_ilo, ilocal:', staggered_ihi, staggered_ilo, ilocal - write(iulog,*) 'stag_jhi, stag_jlo, jlocal:', staggered_jhi, staggered_jlo, jlocal - endif - !---- Set up matrices for preconditioning call t_startf("pcg_precond_init") @@ -1483,28 +1477,6 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & active_vertex, & Mvv, rv, zv) ! 
solve Mvv*zv = rv for zv - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) 'Standard SIA PC:' - write(iulog,*) ' ' - write(iulog,*) 'i, zu_sia(1), zu_sia(nz):' - do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(i4, 2f16.10)') i, zu(1,i,j), zu(nz,i,j) - enddo - enddo ! i - write(iulog,*) ' ' - write(iulog,*) 'i, zv_sia(1), zv_sia(nz):' - do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(i4, 2f16.10)') i, zv(1,i,j), zv(nz,i,j) - enddo - enddo ! i - endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! Use a local tridiagonal solver to find an approximate solution of A*z = r @@ -1618,30 +1590,14 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) call t_stopf("pcg_vecupdate") - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'alpha =', alpha - write(iulog,*) 'iter = 1: i, k, xu, xv, ru, rv:' - do i = staggered_ilo, staggered_ihi -!! do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) - enddo - enddo - endif - !--------------------------------------------------------------- ! Iterate to solution !--------------------------------------------------------------- iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'iter =', iter + if (verbose_pcg .and. main_task) then +! write(iulog,*) 'iter =', iter endif !---- Compute PC(r) = solution z of Mz = r @@ -1727,9 +1683,6 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & endif ! precond - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) write(iulog,*) 'L1' - call t_stopf("pcg_precond_iter") !---- Compute Az = A*z @@ -1832,7 +1785,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (mod(iter, linear_solve_ncheck) == 0 .or. 
iter == 5) then - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) ' ' write(iulog,*) 'Check convergence, iter =', iter endif @@ -1870,7 +1823,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & L2_resid = sqrt(rr) ! L2 norm of residual err = L2_resid/L2_rhs ! normalized error - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err endif @@ -1903,14 +1856,14 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (err < tolerance) then niters = iter - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Glissade PCG solver has converged, iter =', niters write(iulog,*) ' ' endif exit iter_loop elseif (niters == maxiters_chrongear) then - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) 'Glissade PCG solver not converged' + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver not yet converged' write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance write(iulog,*) ' ' endif @@ -2114,6 +2067,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max + real(dp) :: sum_temp staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi @@ -2133,7 +2087,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & maxiters_chrongear = maxiters endif - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (Chronopoulos-Gear)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters_chrongear, precond endif @@ -2148,7 +2102,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (precond == HO_PRECOND_NONE) then ! no preconditioner - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. 
main_task) then write(iulog,*) 'Using no preconditioner' endif @@ -2170,15 +2124,15 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - !WHL - debug - if (verbose_tridiag .and. this_rank==rtest) then - i = itest - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'r, i, j =', this_rank, i, j - write(iulog,*) 'Auu =', Auu(i,j,:) - write(iulog,*) 'Avv =', Avv(i,j,:) - endif + !WHL - debug + if (verbose_tridiag .and. this_rank==rtest) then + i = itest + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'r, i, j =', this_rank, i, j + write(iulog,*) 'Auu =', Auu(i,j,:) + write(iulog,*) 'Avv =', Avv(i,j,:) + endif allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) @@ -2282,7 +2236,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & endif ! precond !WHL - debug - if (verbose_pcg .and. this_rank == rtest) write(iulog,*) 'Done in PC setup' + if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' call t_stopf("pcg_precond_init") @@ -2421,7 +2375,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & enddo endif - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'call tridiag_solver_local_2d' endif @@ -2617,11 +2571,10 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest - write(iulog,*) ' ' - write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' -!! do i = itest-3, itest+3 - do i = staggered_ilo, staggered_ihi - write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) +!! write(iulog,*) ' ' +!! write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' + do i = itest-3, itest+3 +!! write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) enddo ! i endif @@ -2631,9 +2584,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - if (verbose_pcg .and. 
this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'iter =', iter + if (verbose_pcg .and. main_task) then +! write(iulog,*) 'iter =', iter endif !---- Compute PC(r) = solution z of Mz = r @@ -2740,17 +2692,16 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & enddo !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'Before global tridiag PC u solve, r, j =', rtest, jtest - write(iulog,*) ' ' - write(iulog,*) 'i, Adiag_u, Asubdiag_u, Asupdiag_u, b_u:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4e16.8)') i, Adiag_u(i,j), Asubdiag_u(i,j), Asupdiag_u(i,j), b_u(i,j) - enddo - endif - + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'Before global tridiag PC u solve, r, j =', rtest, jtest + write(iulog,*) ' ' + write(iulog,*) 'i, Adiag_u, Asubdiag_u, Asupdiag_u, b_u:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4e16.8)') i, Adiag_u(i,j), Asubdiag_u(i,j), Asupdiag_u(i,j), b_u(i,j) + enddo + endif call tridiag_solver_global_2d(ilocal, jlocal, & parallel, tasks_row, & @@ -2787,17 +2738,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & enddo enddo - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'Before global tridiag PC v solve, r, j =', rtest, jtest - write(iulog,*) ' ' - write(iulog,*) 'i, Adiag_v, Asubdiag_v, Asupdiag_v, b_v:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4e16.8)') i, Adiag_v(i,j), Asubdiag_v(i,j), Asupdiag_v(i,j), b_v(i,j) - enddo - endif - call tridiag_solver_global_2d(jlocal, ilocal, & parallel, tasks_col, & 'col', & ! tridiagonal solve for each column @@ -2911,16 +2851,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & call t_stopf("pcg_vecupdate") - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) 'i, xu, xv, ru, rv:' -!! 
do i = staggered_ihi, staggered_ilo, -1 - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) - enddo - endif - ! Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -2928,15 +2858,18 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! For convergence check, use r = b - Ax if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then -!! if (mod(iter, linear_solve_ncheck) == 0 .or. iter == linear_solve_ncheck/2) then - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) ' ' write(iulog,*) ' check convergence, iter =', iter endif !---- Compute z = A*x (use z as a temp vector for A*x) + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(xu, parallel) +!! call staggered_parallel_halo(xv, parallel) + call t_startf("pcg_matmult_resid") call matvec_multiply_structured_2d(nx, ny, & parallel, & @@ -2949,11 +2882,22 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute residual r = b - A*x + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(bu, parallel) +!! call staggered_parallel_halo(bv, parallel) + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(zu, parallel) +!! call staggered_parallel_halo(zv, parallel) + call t_startf("pcg_vecupdate") ru(:,:) = bu(:,:) - zu(:,:) rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(ru, parallel) +!! call staggered_parallel_halo(rv, parallel) + !---- Compute dot product (r, r) call t_startf("pcg_dotprod") @@ -2968,7 +2912,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & L2_resid = sqrt(rr) ! 
L2 norm of residual err = L2_resid/L2_rhs ! normalized error - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err endif @@ -3001,13 +2945,13 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (err < tolerance) then niters = iter - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Glissade PCG solver has converged, iter =', niters endif exit iter_loop elseif (niters == maxiters_chrongear) then - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) 'Glissade PCG solver not converged' + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver not yet converged' write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance endif endif diff --git a/utils/build/generate_ncvars.py b/utils/build/generate_ncvars.py index 993c2c0a..e791670d 100755 --- a/utils/build/generate_ncvars.py +++ b/utils/build/generate_ncvars.py @@ -193,9 +193,9 @@ def write(self,vars): self.print_warning() for l in self.infile.readlines(): for token in self.handletoken: - if l.find(token) is not -1: + if l.find(token) != -1: break - if l.find(token) is not -1: + if l.find(token) != -1: for v in vars.keys(): self.handletoken[token](vars[v]) else: @@ -267,9 +267,9 @@ def write(self,vars): for k in module.keys(): l = l.replace(k.upper(),module[k]) for token in self.handletoken: - if l.find(token) is not -1: + if l.find(token) != -1: break - if l.find(token) is not -1: + if l.find(token) != -1: for v in vars.keys(): self.handletoken[token](vars[v]) elif '!GENVAR_DIMS!' in l: From deb5384fcd189b3f6869d6921f04cfa7902a3927 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Mon, 5 Jan 2026 15:03:40 -0700 Subject: [PATCH 05/21] Change for reprosum testing This commit makes a very small change in glissade_velo_higher.F90, subroutine basal_sliding_bc_2d. 
A recent commit swaps the following lines: Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) for these: increment = (dx*dy/vol0) * beta(i,j) Auu(i,j,m) = Auu(i,j,m) + increment Avv(i,j,m) = Avv(i,j,m) + increment Before the swap, the reprosum capability was not working for a Greenland test case. The results differ for a 1-year run with 1 task v. 4 tasks. After the swap, the results for 1 task v. 4 tasks are BFB. I discovered this more or less by accident. I don't see why such a change would matter for the reprosum algorithm. This commit backs out the change above. With the change backed out, the 1-task and 4-task results differ. A working hypothesis is that the differences are due to a logic error in the original reprosum code by Pat Worley. In 2023, Pat introduced some corrections. The next step is to implement Pat's corrections and see if they fix the reprosum capability. --- libglissade/glissade_velo_higher.F90 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 3b3ec31b..73b45f52 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -8376,11 +8376,11 @@ subroutine basal_sliding_bc_2d(nx, ny, & do j = 1, ny-1 do i = 1, nx-1 if (active_vertex(i,j)) then -! Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) -! Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) - increment = (dx*dy/vol0) * beta(i,j) - Auu(i,j,m) = Auu(i,j,m) + increment - Avv(i,j,m) = Avv(i,j,m) + increment + Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) + Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) +!! increment = (dx*dy/vol0) * beta(i,j) +!! Auu(i,j,m) = Auu(i,j,m) + increment +!! Avv(i,j,m) = Avv(i,j,m) + increment endif ! active_vertex enddo ! i enddo ! 
j From b9d44df3aab71ccd30b722a15ed534cd1cac0d54 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Mon, 5 Jan 2026 17:14:02 -0700 Subject: [PATCH 06/21] Replaced the reprosum code with Pat Worley's 2023 version This commit replaces the old cism_reprosum_mod - which was based on the current copy of shr_reprosum_mod in the CESM repo - with a new cism_reprosum_mod which is based on a current copy of shr_reprosum_mod in the E3SM repo. The E3SM version includes changes made by Pat Worley in 2023. Pat's changes are contained in three PRs described here: * https://github.com/E3SM-Project/E3SM/pull/5534 * https://github.com/E3SM-Project/E3SM/pull/5549 * https://github.com/E3SM-Project/E3SM/pull/5560 The new copy of shr_reprosum_mod is here: * https://github.com/E3SM-Project/E3SM/blob/master/share/util/shr_reprosum_mod.F90 Many of Pat's changes were cosmetic, but he also changed some logic to prevent potential failures of reproducibility. I had hoped that these changes would fix the problem described in the previous commit ('Change for reprosum testing'). However, I am getting the same answers as with the previous reprosum code. The runs with 1 and 4 tasks still fail the BFB test with the unswapped code in glissade_velo_higher.F90, while passing the BFB test with the swapped code. This commit restores the code swap. I still don't understand why this swap is needed. Thanks to Bill Sacks for pointing me to Pat's changes. --- libglimmer/cism_reprosum_mod.F90 | 1613 +++++++++++++++++--------- libglissade/glissade_velo_higher.F90 | 10 +- 2 files changed, 1088 insertions(+), 535 deletions(-) diff --git a/libglimmer/cism_reprosum_mod.F90 b/libglimmer/cism_reprosum_mod.F90 index 2f071f82..0740f459 100644 --- a/libglimmer/cism_reprosum_mod.F90 +++ b/libglimmer/cism_reprosum_mod.F90 @@ -1,8 +1,5 @@ -!WHL, Nov. 2025: Adapted from shr_reprosum_mod.F90, part of CESM shared code -! 
Changed 'shr' to 'cism' to avoid name conflicts with shared code - module cism_reprosum_mod -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Purpose: ! Compute reproducible global sums of a set of arrays across an MPI @@ -12,105 +9,136 @@ module cism_reprosum_mod ! Compute using either or both a scalable, reproducible algorithm and a ! scalable, nonreproducible algorithm: ! * Reproducible (scalable): -! Convert to fixed point (integer vector representation) to enable -! reproducibility when using MPI_Allreduce +! Convert each floating point summand to an integer vector +! representation, to enable reproducibility when using +! MPI_Allreduce, then convert the resulting global sum back to a +! floating point representation locally; ! * Alternative usually reproducible (scalable): ! Use parallel double-double algorithm due to Helen He and -! Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm +! Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm; ! * Nonreproducible (scalable): ! Floating point and MPI_Allreduce based. ! If computing both reproducible and nonreproducible sums, compare ! these and report relative difference (if absolute difference ! less than sum) or absolute difference back to calling routine. ! -! Author: P. Worley (based on suggestions from J. White for fixed -! point algorithm and on He/Ding paper for ddpdd +! Author: P. Worley (based on suggestions from J. White for integer +! vector algorithm and on He/Ding paper for DDPDD ! algorithm) ! -!----------------------------------------------------------------------- - -!----------------------------------------------------------------------- -!- use statements ------------------------------------------------------ -!----------------------------------------------------------------------- -!!#if ( defined noI8 ) -!! ! Workaround for when shr_kind_i8 is not supported. -!! 
use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i4 -!!#else -!! use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i8 -!!#endif - !WHL - Use CISM modules instead of shared modules if possible +! William Lipscomb, Jan. 2026: +! Renamed this module to cism_reprosum_mod. Modified as follows to build +! without using CESM shared code: +! * Replaced some use statements +! * Changed 'shr' to 'cism' to avoid name conflicts with shared code +! * Output directed to iulog (declared in glimmer_paramets) +! * Added some optional debugging statements +! I started from a version that includes some logic fixes and code cleanup +! done by Pat Worley in 2023. Pat's revised version differs from the code +! in the CESM repo as of Jan. 2026. +!------------------------------------------------------------------------ + +!------------------------------------------------------------------------ +!- use statements ------------------------------------------------------- +!------------------------------------------------------------------------ + !WHL - Use CISM modules instead of shared modules +!#if ( defined noI8 ) +! ! Workaround for when shr_kind_i8 is not supported. +! use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i4 +!#else +! use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i8 +!#endif +! use shr_log_mod, only: s_loglev => shr_log_Level +! use shr_log_mod, only: s_logunit => shr_log_Unit +! use shr_sys_mod, only: shr_sys_abort +! use shr_infnan_mod,only: shr_infnan_inf_type, assignment(=), & +! shr_infnan_posinf, shr_infnan_neginf, & +! shr_infnan_nan, & +! shr_infnan_isnan, shr_infnan_isinf, & +! shr_infnan_isposinf, shr_infnan_isneginf +!#ifndef EAMXX_STANDALONE +! use perf_mod +!#endif use glimmer_global, only: r8 => dp use glimmer_global, only: i8 use glimmer_paramets, only: iulog use glimmer_utils, only: double_to_binary -! use shr_log_mod, only: s_loglev => shr_log_Level -! use shr_log_mod, only: s_logunit => shr_log_Unit -! 
use shr_sys_mod, only: shr_sys_abort use cism_infnan_mod,only: cism_infnan_inf_type, assignment(=), & - cism_infnan_posinf, cism_infnan_neginf, & - cism_infnan_nan, & - cism_infnan_isnan, cism_infnan_isinf, & - cism_infnan_isposinf, cism_infnan_isneginf - -#ifdef TIMING - use perf_mod -#endif -!----------------------------------------------------------------------- -!- module boilerplate -------------------------------------------------- -!----------------------------------------------------------------------- + cism_infnan_posinf, cism_infnan_neginf, & + cism_infnan_nan, & + cism_infnan_isnan, cism_infnan_isinf, & + cism_infnan_isposinf, cism_infnan_isneginf + use profile, only: t_startf, t_stopf !WHL - replace with perf_mod? + ! end WHL mods + + ! Import MPI fcns/types + use mpi + +!------------------------------------------------------------------------ +!- module boilerplate --------------------------------------------------- +!------------------------------------------------------------------------ implicit none private -!----------------------------------------------------------------------- -!- include statements -------------------------------------------------- -!----------------------------------------------------------------------- -#include - save -!----------------------------------------------------------------------- -! Public interfaces ---------------------------------------------------- -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ +! Public interfaces ----------------------------------------------------- +!------------------------------------------------------------------------ public :: & -!! shr_reprosum_setopts, &! set runtime options -!! shr_reprosum_calc, &! calculate distributed sum -!! shr_reprosum_tolExceeded ! utility function to check relative -!! ! differences against the tolerance cism_reprosum_setopts, &! 
set runtime options cism_reprosum_calc, &! calculate distributed sum cism_reprosum_tolExceeded ! utility function to check relative ! differences against the tolerance -!----------------------------------------------------------------------- -! Public data ---------------------------------------------------------- -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ +! Public data ----------------------------------------------------------- +!------------------------------------------------------------------------ logical, public :: cism_reprosum_recompute = .false. real(r8), public :: cism_reprosum_reldiffmax = -1.0_r8 - logical, parameter, public :: verbose_reprosum = .false. -!! logical, parameter, public :: verbose_reprosum = .true. + !WHL mod +!! logical, parameter, public :: verbose_reprosum = .false. + logical, parameter, public :: verbose_reprosum = .true. + ! end WHL mod -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! Private interfaces ---------------------------------------------------- -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ private :: & ddpdd, &! double-double sum routine split_indices ! split indices among OMP threads -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! Private data ---------------------------------------------------------- -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ - !---------------------------------------------------------------------------- + !--------------------------------------------------------------------- ! 
cism_reprosum_mod options - !---------------------------------------------------------------------------- + !--------------------------------------------------------------------- logical :: repro_sum_use_ddpdd = .false. logical :: repro_sum_allow_infnan = .false. - !WHLmod -!! integer :: s_logunit = 6 + !WHL - Should this code be declared? + ! Not sure what EAMXX_STANDALONE means +#ifdef EAMXX_STANDALONE + ! Declare the C function interface + interface + subroutine cism_reprosumx86_fix_start(arg) bind(c) + use iso_c_binding + integer, intent(out) :: arg + end subroutine cism_reprosumx86_fix_start + end interface + + interface + subroutine cism_reprosumx86_fix_end(arg) bind(c) + use iso_c_binding + integer, intent(in) :: arg + end subroutine cism_reprosumx86_fix_end + end interface +#endif CONTAINS @@ -124,55 +152,53 @@ subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & repro_sum_master, & repro_sum_logunit ) -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! Purpose: Set runtime options ! Author: P. Worley -!----------------------------------------------------------------------- -!------------------------------Arguments-------------------------------- - ! Use DDPDD algorithm instead of fixed precision algorithm +!------------------------------------------------------------------------ +!------------------------------Arguments--------------------------------- + ! Use DDPDD algorithm instead of integer vector algorithm logical, intent(in), optional :: repro_sum_use_ddpdd_in - ! Allow INF or NaN in summands logical, intent(in), optional :: repro_sum_allow_infnan_in - ! maximum permissible difference between reproducible and ! nonreproducible sums real(r8), intent(in), optional :: repro_sum_rel_diff_max_in - ! recompute using different algorithm when difference between ! 
reproducible and nonreproducible sums is too great logical, intent(in), optional :: repro_sum_recompute_in - - ! flag indicating whether this process should output log messages + ! flag indicating whether this MPI task should output + ! log messages logical, intent(in), optional :: repro_sum_master - ! unit number for log messages integer, intent(in), optional :: repro_sum_logunit - -!---------------------------Local Workspace----------------------------- +!---------------------------Local Workspace------------------------------ integer logunit ! unit number for log messages logical master ! local master? logical,save :: firstcall = .true. ! first call integer :: ierr ! MPI error return -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ if ( present(repro_sum_master) ) then master = repro_sum_master else - master = .false. !WHL master = main_task? + master = .false. endif if ( present(repro_sum_logunit) ) then logunit = repro_sum_logunit else !! logunit = s_logunit - logunit = iulog ! CISM default + logunit = iulog endif if (.not. firstcall) then -!! call shr_sys_abort('shr_reprosum_setopts ERROR: multiple calls') + !WHL mod +! write(logunit,*) 'shr_reprosum_setopts: ERROR can only be called once' +! call shr_sys_abort('shr_reprosum_setopts ERROR: multiple calls') write(logunit,*) 'cism_reprosum_setopts: ERROR can only be called once' call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + ! end WHL mod endif firstcall = .false. 
@@ -190,34 +216,34 @@ subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & endif if (master) then if ( repro_sum_use_ddpdd ) then - write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& 'Using double-double-based (scalable) usually reproducible ', & 'distributed sum algorithm' else - write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& - 'Using fixed-point-based (scalable) reproducible ', & + write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& + 'Using integer-vector-based (scalable) reproducible ', & 'distributed sum algorithm' endif if ( repro_sum_allow_infnan ) then - write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& 'Will calculate sum when INF or NaN are included in summands' else - write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& 'Will abort if INF or NaN are included in summands' endif - if (cism_reprosum_reldiffmax >= 0._r8) then + if (cism_reprosum_reldiffmax >= 0.0_r8) then write(logunit,*) ' ',& 'with a maximum relative error tolerance of ', & - cism_reprosum_reldiffmax + cism_reprosum_reldiffmax if (cism_reprosum_recompute) then write(logunit,*) ' ',& 'If tolerance exceeded, sum is recomputed using ', & 'a serial algorithm.' else write(logunit,*) ' ',& - 'If tolerance exceeded, fixed-precision is sum used ', & + 'If tolerance exceeded, integer-vector-based sum is used ', & 'but a warning is output.' endif else @@ -239,48 +265,192 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & gbl_max_nsummands, gbl_max_nsummands_out,& gbl_count, repro_sum_validate, & repro_sum_stats, rel_diff, commid ) -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Purpose: -! Compute the global sum of each field in "arr" using the indicated +! Compute the global sum of each field in 'arr' using the indicated ! 
communicator with a reproducible yet scalable implementation based -! on a fixed point algorithm. An alternative is to use an "almost -! always reproducible" floating point algorithm, as described below. -! -! The accuracy of the fixed point algorithm is controlled by the -! number of "levels" of integer expansion. The algorithm will calculate -! the number of levels that is required for the sum to be essentially -! exact. (The sum as represented by the integer expansion will be exact, -! but roundoff may perturb the least significant digit of the returned -! real*8 representation of the sum.) The optional parameter arr_max_levels -! can be used to override the calculated value. The optional parameter -! arr_max_levels_out can be used to return the values used. -! -! The algorithm also requires an upper bound on -! the maximum summand (in absolute value) for each field, and will -! calculate this internally. However, if the optional parameters +! on first converting each floating point summand into an equivalent +! representation using a vector of integers, summing the integer +! vectors, then converting the resulting sum back to a floating point +! representation. An alternative is to use an 'almost always +! reproducible' floating point algorithm (DDPDD), as described below. +! +! Description of integer vector algorithm: +!----------------------------------------- +! The basic idea is to represent the mantissa of each floating point +! value as an integer, add these integers, and then convert back to a +! floating point value. For a real*8 value, there are enough digits in +! an integer*8 variable to not lose any information (in the +! mantissa). However, each of these integers would have a different +! implicit exponent if done in a naive way, and so the sum would not +! be accurate. Also, even with the same 'normalization', the sum might +! exceed the maximum value representable by an integer*8, causing +! an overflow. 
Instead, a vector of integers is generated, where a +! given element (or level using the terminology used in the code) of +! the vector is associated with a particular exponent. The mantissa +! for a given floating point value is then converted to some number of +! integer values, depending on the exponent of the floating point +! value, the normalization of its mantissa, the maximum number of +! summands, the number of participating MPI tasks and of OpenMP +! threads, and the exponents associated with the levels of the integer +! vector, and added into the appropriate levels of the integer +! vector. Each MPI task has its own integer vector representing the +! local sum. This is then summed across all participating MPI tasks +! using an MPI_Allreduce, and, lastly, converted back to a floating +! point value. Note that the same approach works for a vector of +! integer*4 variables, simply requiring more levels, both for the full +! summation vector and for each individual real*8 summand. This is a +! compile time option in the code, in support of systems for which the +! compiler or MPI library has issues when using integer*8. As +! implemented, this algorithm should work for any floating point and +! integer type as long as they share the same base. The code is +! written as if for real*8 and integer*8 variables, but the only +! dependence is on the types 'r8' and 'i8', which are defined in the +! code, currently with reference to the corresponding types in +! shr_kind_mod. This is how integer*4 support is implemented, by +! defining i8 to be shr_kind_i4 instead of shr_kind_i8. + ! !WHL - The CISM types are declared in glimmer_global. +! For this to work, each MPI task must have the same number of levels +! and same implicit exponent for each level. These levels must be +! sufficient to represent the smallest and largest nonzero individual +! summands (in absolute value) and the largest possible intermediate +! sum, including the final sum. 
Most of the complexity in the +! algorithm is in identifying the number of levels, the exponent +! associated with each level, and the appropriate levels to target +! when converting a floating point value into its integer vector +! representation. There are also some subtleties in reconstructing the +! final sum from the integer vector, as described below. For each +! floating point value, the exponent and mantissa are extracted using +! the fortran intrinsics 'exponent' and 'fraction'. The mantissa is +! then 'shifted' to match the exponent for a target level in the +! integer vector using the 'scale' intrinsic. 'int(X,i8)' is used +! for the conversion for the given level, and subtraction between +! this integer and the original 'shifted' value identifies the +! remainder that will be converted to an integer for the next level +! in the vector. The logic continues until the remainder is zero. As +! mentioned above, the only requirement, due to the implementation +! using these fortran intrinsics, is that floating point and integer +! models use the same base, e.g. +! radix(1.0_r8) == radix(1_i8) +! for real*8 and integer*8. If not, then the alternative algorithm +! DDPDD mentioned above and described below is used instead. The +! integer representation must also have enough digits for the +! potential growth of the sum for each level, so could conceivably be +! too small for a large number of summands. +! +! Upper bounds on the total number of summands and on all intermediate +! sums are calculated as +! * +! and +! * +! * +! respectively. The maximum number of summands per MPI task and the +! maximum absolute value over all nonzero summands are global +! information that need to be determined with additional MPI +! collectives. The minimum nonzero absolute value summand is also +! global information. Fortunately, all of these can be determined with +! a single MPI_Allreduce call, so only one more than that required for +! the sum itself. 
(Note that, in actuality, the exponents of max and +! min summands are determined, and these are used to calculate bounds +! on the maximum and minimum, allowing the use of an MPI_INTEGER +! vector in the MPI_Allreduce call.) +! +! The actual code is made a little messier by (a) supporting summation +! of multiple fields without increasing the number of MPI_Allreduce +! calls, (b) supporting OpenMP threading of the local arithmetic, (c) +! allowing the user to specify estimates for the global information +! (to avoid the additional MPI_Allreduce), (d) including a check of +! whether user specified bounds were sufficient and, if not, +! determining the actual bounds and recomputing the sum, and (e) +! allowing the user to specify the maximum number of levels to use, +! potentially losing accuracy but still preserving reproducibility and +! being somewhat cheaper to compute. +! +! The conversion of the local summands to vectors of integers, the +! summation of the local vectors of integers, and the summation of the +! distributed vectors of integers will be exact (if optional parameters +! are not used to decrease the accuracy - see below). However, the +! conversion of the vector of integer representation to a floating +! point value may be subject to rounding errors. Before the +! conversion, the vector of integers is adjusted so that all elements +! have the same sign, and so that the value, in absolute value, at a +! given level is strictly less than what can be represented at the +! next lower level (larger exponent) and strictly greater than what +! can be represented at the next higher level (smaller exponent). Since +! all elements have the same sign, the sign is set to positive +! temporarily and then restored when the conversion to floating point +! is complete. These are all integer operations, so no accuracy is +! lost. These adjustments eliminate the possibility of catastrophic +! cancellation. Also, when converting the individual elements to +! 
floating point values and summing them, the summation is now +! equivalent to concatenating the digits in the mantissas for the +! component summands. In consequence, in the final step when each +! element of this modified vector of integers is converted to a +! floating point value and added into the intermediate sum, any +! rounding is limited to the least significant digit representable +! in the final floating point sum. +! +! Any such rounding error will be sensitive to the particular floating +! values generated from the integer vector, and so will be +! sensitive to the number of levels in the vector and the implicit +! exponent associated with each level, which are themselves functions +! of the numbers of MPI tasks and OpenMP threads and the number of +! digits representable in an integer. To avoid this sensitivity, +! (effectively) generate a new integer vector in which each component +! integer has a fixed number of significant digits (e.g., +! digits(1.0_r8)) and generate the floating point values from these +! before summing. (See comments in code for more details.) This +! creates a sequence of floating point values to be summed that are +! independent of, for example, the numbers of MPI tasks and OpenMP +! threads or whether using integer*8 or integer*4 internal +! representations in the integer vector, and thus ensure +! reproducibility with respect to these options. +! +! Description of optional parameters for integer vector algorithm: +!----------------------------------------------------------------- +! The accuracy of the integer vector algorithm is controlled by the +! total number of levels of integer expansion. The algorithm +! calculates the number of levels that is required for the sum to be +! essentially exact. (The sum as represented by the integer expansion +! is exact, but roundoff may perturb the least significant digit of +! the returned floating point representation of the sum.) The optional +! 
parameter arr_max_levels can be used to override the calculated +! value for each field. The optional parameter arr_max_levels_out can +! be used to return the values used. +! +! The algorithm requires an upper bound on the maximum summand +! (in absolute value) for each field, and will calculate this internally +! using an MPI_Allreduce. However, if the optional parameters ! arr_max_levels and arr_gbl_max are both set, then the algorithm will -! use the values in arr_gbl_max for the upper bounds instead. If these -! are not upper bounds, or if the upper bounds are not tight enough -! to achieve the requisite accuracy, and if the optional parameter -! repro_sum_validate is NOT set to .false., the algorithm will repeat the -! computation with appropriate upper bounds. If only arr_gbl_max is present, -! then the maxima are computed internally (and the specified values are -! ignored). The optional parameter arr_gbl_max_out can be -! used to return the values used. -! -! Finally, the algorithm requires an upper bound on the number of -! local summands across all processes. This will be calculated internally, -! using an MPI collective, but the value in the optional argument -! gbl_max_nsummands will be used instead if (1) it is present, (2) -! it is > 0, and (3) the maximum value and required number of levels -! are also specified. (If the maximum value is calculated, the same -! MPI collective is used to determine the maximum number of local -! summands.) The accuracy of the user-specified value is not checked. -! However, if set to < 1, the value will instead be calculated. If the -! optional parameter gbl_max_nsummands_out is present, then the value -! used (gbl_max_nsummands if >= 1; calculated otherwise) will be -! returned. +! use the values in arr_gbl_max for the upper bounds instead. If only +! arr_gbl_max is present, then the maxima are computed internally +! (and the specified values are ignored). The optional parameter +! 
arr_gbl_max_out can be used to return the values used. +! +! The algorithm also requires an upper bound on the number of +! local summands across all MPI tasks. (By definition, the number of +! local summands is the same for each field on a given MPI task, i.e., +! the input parameter nsummands.) This will be calculated internally, +! using an MPI_Allreduce, but the value in the optional argument +! gbl_max_nsummands will be used instead if (1) it is present, +! (2) the value is > 0, and (3) the maximum values and required number +! of levels are also specified. (If the maximum values are calculated, +! then the same MPI_Allreduce is used to determine the maximum numbers +! of local summands.) The accuracy of the user-specified value is not +! checked. However, if set to < 1, the value will instead be calculated. +! If the optional parameter gbl_max_nsummands_out is present, +! then the value used (gbl_max_nsummands if >= 1; calculated otherwise) +! will be returned. +! +! If the user-specified upper bounds on maximum summands are +! inaccurate or if the user-specified upper bounds (maximum summands +! and number of local summands) and numbers of levels causes +! any of the global sums to have fewer than the expected +! number of significant digits, and if the optional parameter +! repro_sum_validate is NOT set to .false., then the algorithm will +! repeat the computations with internally calculated values for +! arr_max_levels, arr_gbl_max, and gbl_max_nsummands. ! ! If requested (by setting cism_reprosum_reldiffmax >= 0.0 and passing in ! the optional rel_diff parameter), results are compared with a @@ -288,32 +458,40 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & ! ! Note that the cost of the algorithm is not strongly correlated with ! the number of levels, which primarily shows up as a (modest) increase -! in cost of the MPI_Allreduce as a function of vector length. Rather the -! 
cost is more a function of (a) the number of integers required to -! represent an individual summand and (b) the number of MPI_Allreduce -! calls. The number of integers required to represent an individual -! summand is 1 or 2 when using 8-byte integers for 8-byte real summands -! when the number of local summands is not too large. As the number of -! local summands increases, the number of integers required increases. -! The number of MPI_Allreduce calls is either 2 (specifying nothing) or -! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max -! correctly). When specifying arr_max_levels and arr_gbl_max +! in the cost of the MPI_Allreduce as a function of vector length. +! Rather the cost is more a function of (a) the number of integers +! required to represent an individual summand and (b) the number of +! MPI_Allreduce calls. The number of integers required to represent an +! individual summand is 1 or 2 when using 8-byte integers for 8-byte +! real summands when the number of local summands and number of MPI +! tasks are not too large. As the magnitude of either of these increases, +! the number of integers required increases. The number of +! MPI_Allreduce calls is either 2 (specifying nothing or just +! arr_max_levels and arr_gbl_max correctly) or 1 (specifying +! gbl_max_nsummands, arr_max_levels, and arr_gbl_max correctly). +! When specifying gbl_max_nsummands, arr_max_levels, or arr_gbl_max ! incorrectly, 3 or 4 MPI_Allreduce calls will be required. ! +! Description of alternative (DDPDD) algorithm: +!---------------------------------------------- ! The alternative algorithm is a minor modification of a parallel ! implementation of David Bailey's routine DDPDD by Helen He -! and Chris Ding. Bailey uses the Knuth trick to implement quadruple -! precision summation of double precision values with 10 double -! precision operations. The advantage of this algorithm is that +! and Chris Ding. See, for example, +! Y. He, and C. 
Ding, 'Using Accurate Arithmetics to Improve +! Numerical Reproducibility and Stability in Parallel Applications,' +! J. Supercomputing, vol. 18, no. 3, 2001, pp. 259–277 +! and the citations therein. Bailey uses the Knuth trick to implement +! quadruple precision summation of double precision values with 10 +! double precision operations. The advantage of this algorithm is that ! it requires a single MPI_Allreduce and is less expensive per summand -! than is the fixed precision algorithm. The disadvantage is that it +! than is the integer vector algorithm. The disadvantage is that it ! is not guaranteed to be reproducible (though it is reproducible -! much more often than is the standard algorithm). This alternative -! is used when the optional parameter ddpdd_sum is set to .true. It is -! also used if the fixed precision algorithm radix assumption does not -! hold. +! much more often than is the standard floating point algorithm). +! This alternative is used when the optional parameter ddpdd_sum is +! set to .true. It is also used if the integer vector algorithm radix +! assumption does not hold. ! -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Arguments ! @@ -328,7 +506,7 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & logical, intent(in), optional :: ddpdd_sum ! use ddpdd algorithm instead - ! of fixed precision algorithm + ! of integer vector algorithm logical, intent(in), optional :: allow_infnan ! if .true., allow INF or NaN input values. @@ -351,11 +529,11 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & integer, intent(in), optional :: gbl_max_nsummands ! maximum of nsummand over all - ! processes + ! MPI tasks integer, intent(out), optional :: gbl_max_nsummands_out ! calculated maximum nsummands - ! over all processes + ! over all MPI tasks integer, intent(in), optional :: gbl_count ! 
was total number of summands; @@ -377,8 +555,8 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & real(r8), intent(out), optional :: rel_diff(2,nflds) ! relative and absolute - ! differences between fixed - ! and floating point sums + ! differences between integer + ! vector and floating point sums integer, intent(in), optional :: commid ! MPI communicator @@ -418,11 +596,11 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & ! input array integer :: omp_nthreads ! number of OpenMP threads integer :: mpi_comm ! MPI subcommunicator - integer :: mypid ! MPI process ID (COMM_WORLD) - integer :: tasks ! number of MPI processes + integer :: mypid ! MPI task ID (COMM_WORLD) + integer :: tasks ! number of MPI tasks integer :: ierr ! MPI error return integer :: ifld, isum, ithread ! loop variables - integer :: max_nsummands ! max nsummands over all processes + integer :: max_nsummands ! max nsummands over all MPI tasks ! or threads (used in both ways) integer, allocatable :: isum_beg(:), isum_end(:) @@ -447,26 +625,30 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & integer :: max_levels(nflds) ! maximum number of levels of ! integer expansion to use integer :: max_level ! maximum value in max_levels + integer :: extra_levels ! number of extra levels needed + ! to guarantee that sum over threads + ! or tasks does not cause overflow - real(r8) :: xmax_nsummands ! dble of max_nsummands + real(r8) :: xmax_nsummands ! real(max_nsummands,r8) real(r8) :: arr_lsum(nflds) ! local sums real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using ! fast, nonreproducible, ! floating point alg. real(r8) :: abs_diff ! absolute difference between - ! fixed and floating point + ! integer vector and floating point ! sums + !WHL mod character(len=64) :: binary_str ! 
string to represent 64 bits of i8 integer integer :: n - + !end WHL mod #ifdef _OPENMP integer omp_get_max_threads external omp_get_max_threads #endif - -!----------------------------------------------------------------------- ! -! initialize local statistics variables +!------------------------------------------------------------------------ +! +! Initialize local statistics variables gbl_lor_red = 0 gbl_max_red = 0 repro_sum_fast = 0 @@ -474,31 +656,40 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & repro_sum_both = 0 nonrepro_sum = 0 -! set MPI communicator +! Set MPI communicator if ( present(commid) ) then mpi_comm = commid else mpi_comm = MPI_COMM_WORLD endif -#ifdef TIMING - call t_barrierf('sync_repro_sum',mpi_comm) +#ifndef EAMXX_STANDALONE +!WHL - commented out since the profile mod does not include tbarrier_f +! call t_barrierf('sync_repro_sum',mpi_comm) #endif -! check whether should abort if input contains NaNs or INFs + +! Check whether should abort if input contains NaNs or INFs abort_inf_nan = .not. repro_sum_allow_infnan if ( present(allow_infnan) ) then abort_inf_nan = .not. allow_infnan endif -#ifdef TIMING + +! With Fujitsu always abort on NaNs or INFs in input +#ifdef CPRFJ + abort_inf_nan = .true. +#endif + +#ifndef EAMXX_STANDALONE call t_startf('cism_reprosum_INF_NaN_Chk') #endif -! initialize flags to indicate that no NaNs or INFs are present in the input data + +! Initialize flags to indicate that no NaNs or INFs are present in the input data inf_nan_gchecks = .false. arr_gsum_infnan = .false. !TODO - Remove the inf_nan option; assume abort_inf_nan = T, as in CICE if (abort_inf_nan) then -! check whether input contains NaNs or INFs, and abort if so +! Check whether input contains NaNs or INFs, and abort if so nan_check = any(cism_infnan_isnan(arr)) inf_check = any(cism_infnan_isinf(arr)) @@ -509,20 +700,23 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & if ((nan_count > 0) .or. 
(inf_count > 0)) then call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) -!! write(s_logunit,37) real(nan_count,r8), real(inf_count,r8), mypid +! write(s_logunit,37) real(nan_count,r8), real(inf_count,r8), mypid write(iulog,37) real(nan_count,r8), real(inf_count,r8), mypid -37 format("CISM_REPROSUM_CALC: Input contains ",e12.5, & - " NaNs and ", e12.5, " INFs on process ", i7) -!! call shr_sys_abort("shr_reprosum_calc ERROR: NaNs or INFs in input") - write(iulog,*) "cism_reprosum_calc ERROR: NaNs or INFs in input" +37 format("cism_REPROSUM_CALC: Input contains ",e12.5, & + " NaNs and ", e12.5, " INFs on MPI task ", i7) + !WHL mod +! call shr_sys_abort("shr_reprosum_calc ERROR: NaNs or INFs in input") + write(iulog,*) 'cism_reprosum_calc ERROR: NaNs or INFs in input' call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + !end WHL mod endif endif +#ifndef CPRFJ else -! determine whether any fields contain NaNs or INFs, and avoid processing them +! Determine whether any fields contain NaNs or INFs, and avoid processing them ! via integer expansions inf_nan_lchecks = .false. @@ -531,69 +725,79 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & inf_nan_lchecks(2,ifld) = any(cism_infnan_isposinf(arr(:,ifld))) inf_nan_lchecks(3,ifld) = any(cism_infnan_isneginf(arr(:,ifld))) end do -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_startf("repro_sum_allr_lor") #endif call mpi_allreduce (inf_nan_lchecks, inf_nan_gchecks, 3*nflds, & MPI_LOGICAL, MPI_LOR, mpi_comm, ierr) gbl_lor_red = 1 -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("repro_sum_allr_lor") #endif + do ifld=1,nflds arr_gsum_infnan(ifld) = any(inf_nan_gchecks(:,ifld)) enddo +#endif endif -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_stopf('cism_reprosum_INF_NaN_Chk') #endif -! check whether should use cism_reprosum_ddpdd algorithm + +! Check whether should use cism_reprosum_ddpdd algorithm use_ddpdd_sum = repro_sum_use_ddpdd if ( present(ddpdd_sum) ) then use_ddpdd_sum = ddpdd_sum endif -! 
check whether intrinsic-based algorithm will work on this system +! Check whether intrinsic-based algorithm will work on this system ! (requires floating point and integer bases to be the same) ! If not, always use ddpdd. - use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8)) + use_ddpdd_sum = use_ddpdd_sum .or. (radix(1.0_r8) /= radix(1_i8)) if ( use_ddpdd_sum ) then -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_startf('cism_reprosum_ddpdd') #endif + call cism_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & - nflds, mpi_comm) + nflds, mpi_comm) repro_sum_fast = 1 -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_stopf('cism_reprosum_ddpdd') #endif + else -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_startf('cism_reprosum_int') #endif -! get number of MPI tasks + +! Get number of MPI tasks call mpi_comm_size(mpi_comm, tasks, ierr) -! get number of OpenMP threads +! Get number of OpenMP threads #ifdef _OPENMP omp_nthreads = omp_get_max_threads() #else omp_nthreads = 1 #endif -! see if have sufficient information to not require max/min allreduce +! See if have sufficient information to not require max/min allreduce recompute = .true. validate = .false. if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then recompute = .false. -! setting lower bound on max_level*nflds to be 64 to improve OpenMP +! Setting lower bound on max_level*nflds to be 64 to improve OpenMP ! performance for loopb in cism_reprosum_int max_level = (64/nflds) + 1 do ifld=1,nflds - if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. & + if ((arr_gbl_max(ifld) >= 0.0_r8) .and. & (arr_max_levels(ifld) > 0)) then arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld)) @@ -607,9 +811,9 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & if (.not. recompute) then -! determine maximum number of summands in local phases of the +! Determine maximum number of summands in local phases of the ! 
algorithm -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_startf("repro_sum_allr_max") #endif if ( present(gbl_max_nsummands) ) then @@ -625,40 +829,63 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & MPI_INTEGER, MPI_MAX, mpi_comm, ierr) gbl_max_red = 1 endif -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("repro_sum_allr_max") #endif -! determine maximum shift. Shift needs to be small enough that summation -! does not exceed maximum number of digits in i8. -! if requested, return max_nsummands before it is redefined +! Determine maximum shift. Shift needs to be small enough that summation, +! in absolute value, does not exceed maximum value representable by i8. + +! If requested, return max_nsummands before it is redefined if ( present( gbl_max_nsummands_out) ) then gbl_max_nsummands_out = max_nsummands endif -! summing within each thread first +! Summing within each thread first (adding 1 to max_nsummands +! to ensure that integer division rounds up) max_nsummands = (max_nsummands/omp_nthreads) + 1 ! then over threads and tasks max_nsummands = max(max_nsummands, tasks*omp_nthreads) ! A 'max' is used in the above calculation because the partial sum for ! each thread, calculated in cism_reprosum_int, is postprocessed so that ! each integer in the corresponding vector of integers is reduced in -! magnitude to be less than (radix(IX_8)**arr_max_shift). Therefore, +! magnitude to be less than (radix(1_i8)**arr_max_shift). Therefore, ! the maximum shift can be calculated separately for per thread sums ! and sums over threads and tasks, and the smaller value used. This is ! equivalent to using max_nsummands as defined above. - xmax_nsummands = dble(max_nsummands) - arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1) + xmax_nsummands = real(max_nsummands,r8) + arr_max_shift = digits(1_i8) - (exponent(xmax_nsummands) + 1) if (arr_max_shift < 2) then -!! call shr_sys_abort('repro_sum failed: number of summands too '// & -!! 
'large for fixed precision algorithm' ) - write(iulog,*) & - 'repro_sum failed: number of summands too large for fixed precision algorithm' + !WHL mod +! call shr_sys_abort('repro_sum failed: number of summands too '// & +! 'large for integer vector algorithm' ) + write(iulog,*) 'repro_sum failed: number of summands too '// & + 'large for integer vector algorithm' call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + ! end WHL mod endif - -! calculate sum +! Note: by construction, each floating point value will be decomposed +! into a vector of integers each component of which will be strictly +! less than radix(1_i8)**arr_max_shift in absolute value, and the +! summation of max_nsummands of these, again in absolute value, will +! then be less than +! radix(1_i8)**(arr_max_shift + exponent(xmax_nsummands)) +! or radix(1_i8)**(digits(1_i8) - 1). This is more conservative than +! necessary, but it also allows the postprocessing mentioned above +! (and described later) to proceed without danger of introducing +! overflow. + +! Determine additional number of levels needed to support the +! postprocessing that reduces the magnitude of each component +! of the integer vector of the partial sum for each thread +! to be less than (radix(1_i8)**arr_max_shift). + extra_levels = (digits(1_i8) - 1)/arr_max_shift +! Extra levels are indexed by (-(extra_levels-1):0) +! Derivation of this is described in the comments in +! cism_reprosum_int. + +! Calculate sum if (present(repro_sum_validate)) then validate = repro_sum_validate else @@ -666,15 +893,16 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & endif call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & nflds, arr_max_shift, arr_gmax_exp, & - arr_max_levels, max_level, arr_gsum_infnan, & - validate, recompute, omp_nthreads, mpi_comm) + arr_max_levels, max_level, extra_levels, & + arr_gsum_infnan, validate, recompute, & + omp_nthreads, mpi_comm) -! record statistics, etc. +! Record statistics, etc. 
repro_sum_fast = 1 if (recompute) then repro_sum_both = 1 else -! if requested, return specified levels and upper bounds on maxima +! If requested, return specified levels and upper bounds on maxima if ( present(arr_max_levels_out) ) then do ifld=1,nflds arr_max_levels_out(ifld) = arr_max_levels(ifld) @@ -689,40 +917,38 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & endif endif -! do not have sufficient information; calculate global max/min and +! Do not have sufficient information; calculate global max/min and ! use to compute required number of levels if (recompute) then -! record statistic +! Record statistic repro_sum_slow = 1 -! determine maximum and minimum (non-zero) summand values and +! Determine maximum and minimum (non-zero) summand values and ! maximum number of local summands -! allocate thread-specific work space +! Allocate thread-specific work space allocate(arr_tlmax_exp(nflds,omp_nthreads)) allocate(arr_tlmin_exp(nflds,omp_nthreads)) allocate(isum_beg(omp_nthreads)) allocate(isum_end(omp_nthreads)) -! split summand index range over OpenMP threads -!! call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) - !WHL - debug - call split_indices(dsummands, omp_nthreads, isum_beg, isum_end) +! Split summand index range over OpenMP threads + call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) !$omp parallel do & !$omp default(shared) & !$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax) do ithread=1,omp_nthreads -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_startf('repro_sum_loopa') #endif do ifld=1,nflds - arr_exp_tlmin = MAXEXPONENT(1._r8) - arr_exp_tlmax = MINEXPONENT(1._r8) + arr_exp_tlmin = MAXEXPONENT(1.0_r8) + arr_exp_tlmax = MINEXPONENT(1.0_r8) if (.not. arr_gsum_infnan(ifld)) then do isum=isum_beg(ithread),isum_end(ithread) - if (arr(isum,ifld) .ne. 
0.0_r8) then + if (arr(isum,ifld) /= 0.0_r8) then arr_exp = exponent(arr(isum,ifld)) arr_exp_tlmin = min(arr_exp,arr_exp_tlmin) arr_exp_tlmax = max(arr_exp,arr_exp_tlmax) @@ -732,7 +958,7 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax end do -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf('repro_sum_loopa') #endif end do @@ -746,77 +972,100 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & arr_lextremes(0,:) = -nsummands arr_lextremes(1:nflds,1) = -arr_lmax_exp(:) arr_lextremes(1:nflds,2) = arr_lmin_exp(:) -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_startf("repro_sum_allr_minmax") #endif call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), & MPI_INTEGER, MPI_MIN, mpi_comm, ierr) -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("repro_sum_allr_minmax") #endif max_nsummands = -arr_gextremes(0,1) arr_gmax_exp(:) = -arr_gextremes(1:nflds,1) arr_gmin_exp(:) = arr_gextremes(1:nflds,2) -! if a field is identically zero or contains INFs or NaNs, arr_gmin_exp -! still equals MAXEXPONENT and arr_gmax_exp still equals MINEXPONENT. -! In this case, set arr_gmin_exp = arr_gmax_exp = MINEXPONENT +! If a field is identically zero or contains INFs or NaNs, arr_gmin_exp +! still equals MAXEXPONENT and arr_gmax_exp still equals MINEXPONENT. +! In this case, set arr_gmin_exp = arr_gmax_exp = MINEXPONENT do ifld=1,nflds arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld)) enddo -! if requested, return upper bounds on observed maxima +! If requested, return upper bounds on observed maxima if ( present(arr_gbl_max_out) ) then do ifld=1,nflds arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld)) enddo endif -! if requested, return max_nsummands before it is redefined +! If requested, return max_nsummands before it is redefined if ( present( gbl_max_nsummands_out) ) then gbl_max_nsummands_out = max_nsummands endif -! 
determine maximum shift (same as in previous branch, but with calculated -! max_nsummands). Shift needs to be small enough that summation does not -! exceed maximum number of digits in i8. +! Determine maximum shift (same as in previous branch, but with calculated +! max_nsummands). Shift needs to be small enough that summation, in absolute +! value, does not exceed maximum value representable by i8. -! summing within each thread first +! Summing within each thread first (adding 1 to max_nsummands +! to ensure that integer division rounds up) max_nsummands = (max_nsummands/omp_nthreads) + 1 ! then over threads and tasks max_nsummands = max(max_nsummands, tasks*omp_nthreads) ! A 'max' is used in the above calculation because the partial sum for ! each thread, calculated in cism_reprosum_int, is postprocessed so that ! each integer in the corresponding vector of integers is reduced in -! magnitude to be less than (radix(IX_8)**arr_max_shift). Therefore, +! magnitude to be less than (radix(1_i8)**arr_max_shift). Therefore, ! the maximum shift can be calculated separately for per thread sums ! and sums over threads and tasks, and the smaller value used. This is ! equivalent to using max_nsummands as defined above. - xmax_nsummands = dble(max_nsummands) - arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1) + xmax_nsummands = real(max_nsummands,r8) + arr_max_shift = digits(1_i8) - (exponent(xmax_nsummands) + 1) if (arr_max_shift < 2) then -!! call shr_sys_abort('repro_sum failed: number of summands too '// & -!! 'large for fixed precision algorithm' ) - write(iulog,*) & - 'repro_sum failed: number of summands too large for fixed precision algorithm' + !WHL mod +! call shr_sys_abort('repro_sum failed: number of summands too '// & +! 'large for integer vector algorithm' + write(iulog,*) 'repro_sum failed: number of summands too '// & + 'large for integer vector algorithm' call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + ! end WHL mod endif - -! 
determine maximum number of levels required for each field -! ((digits(0.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift) -! + 1 because first truncation probably does not involve a maximal shift -! + 1 to guarantee that the integer division rounds up (not down) +! Note: by construction, each floating point value will be decomposed +! into a vector of integers each component of which will be strictly +! less than radix(1_i8)**arr_max_shift in absolute value, and the +! summation of max_nsummands of these, again in absolute value, will +! then be less than +! radix(1_i8)**(arr_max_shift + exponent(xmax_nsummands)) +! or radix(1_i8)**(digits(1_i8) - 1). This is more conservative than +! necessary, but it also allows the postprocessing mentioned above +! (and described later) to proceed without danger of introducing +! overflow. + +! Determine maximum number of levels required for each field. +! Need enough levels to represent both the smallest and largest +! nonzero summands (in absolute value), and any values in between. +! The number of digits from the most significant digit in the +! largest summand to the most significant digit in the smallest +! summand is (arr_gmax_exp(ifld)-arr_gmin_exp(ifld)), and the maximum +! number of digits needed to represent the smallest value is +! digits(1.0_r8). Divide this total number of digits by the number of +! digits per level (arr_max_shift) to get the number of levels +! ((digits(1.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift) +! with some tweaks: +! + 1 because first truncation for any given summand probably does +! not involve a maximal shift (but this adds only one to the total) +! + 1 to guarantee that the integer division rounds up (not down) ! (setting lower bound on max_level*nflds to be 64 to improve OpenMP ! 
performance for loopb in cism_reprosum_int) max_level = (64/nflds) + 1 do ifld=1,nflds max_levels(ifld) = 2 + & - ((digits(0.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) & + ((digits(1.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) & / arr_max_shift) if ( present(arr_max_levels) .and. (.not. validate) ) then -! if validate true, then computation with arr_max_levels failed -! previously +! If validate true, then computation with arr_max_levels failed +! previously if ( arr_max_levels(ifld) > 0 ) then max_levels(ifld) = & min(arr_max_levels(ifld),max_levels(ifld)) @@ -826,37 +1075,52 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & max_level = max_levels(ifld) enddo -! if requested, return calculated levels +! If requested, return calculated levels if ( present(arr_max_levels_out) ) then do ifld=1,nflds arr_max_levels_out(ifld) = max_levels(ifld) enddo endif -! calculate sum +! Determine additional number of levels needed to support the +! postprocessing that reduces the magnitude of each component +! of the integer vector of the partial sum for each thread +! to be less than (radix(1_i8)**arr_max_shift). + extra_levels = (digits(1_i8) - 1)/arr_max_shift +! Extra levels are indexed by (-(extra_levels-1):0) +! Derivation of this is described in the comments in +! cism_reprosum_int. + +! Calculate sum validate = .false. call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & nflds, arr_max_shift, arr_gmax_exp, & - max_levels, max_level, arr_gsum_infnan, & - validate, recompute, omp_nthreads, mpi_comm) + max_levels, max_level, extra_levels, & + arr_gsum_infnan, validate, recompute, & + omp_nthreads, mpi_comm) endif -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_stopf('cism_reprosum_int') #endif + endif -! compare fixed and floating point results +! 
Compare integer vector and floating point results if ( present(rel_diff) ) then if (cism_reprosum_reldiffmax >= 0.0_r8) then -#ifdef TIMING - call t_barrierf('sync_nonrepro_sum',mpi_comm) +#ifndef EAMXX_STANDALONE + !WHL - commented out since the profile mod does not include tbarrier_f +! call t_barrierf('sync_nonrepro_sum',mpi_comm) +#endif +#ifndef EAMXX_STANDALONE call t_startf('nonrepro_sum') #endif -! record statistic +! Record statistic nonrepro_sum = 1 -! compute nonreproducible sum - arr_lsum(:) = 0._r8 +! Compute nonreproducible sum + arr_lsum(:) = 0.0_r8 !$omp parallel do & !$omp default(shared) & !$omp private(ifld, isum) @@ -867,17 +1131,21 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & end do endif end do -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_startf("nonrepro_sum_allr_r8") #endif call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, & MPI_REAL8, MPI_SUM, mpi_comm, ierr) -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("nonrepro_sum_allr_r8") +#endif +#ifndef EAMXX_STANDALONE call t_stopf('nonrepro_sum') #endif -! determine differences + +! Determine differences !$omp parallel do & !$omp default(shared) & !$omp private(ifld, abs_diff) @@ -914,7 +1182,7 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & endif end do -! return statistics +! Return statistics if ( present(repro_sum_stats) ) then repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow @@ -924,38 +1192,44 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & repro_sum_stats(6) = repro_sum_stats(6) + gbl_lor_red endif + !WHL mod: optional diagnostics if (verbose_reprosum) then call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) - if (mypid == 0) then - write(iulog,*) 'Exit reprosum, nflds =', nflds - write(iulog,*) ' n, arr_gsum, binary_str:' + if (mypid == 0) then + write(iulog,*) 'Exit reprosum, nflds =', nflds + write(iulog,*) ' n, arr_gsum, binary_str:' !! 
do n = 1, nflds - do n = 1, min(2,nflds) - call double_to_binary(arr_gsum(n), binary_str) - write(iulog,*) n, arr_gsum(n), binary_str - enddo - endif - endif + do n = 1, min(2,nflds) + call double_to_binary(arr_gsum(n), binary_str) + write(iulog,*) n, arr_gsum(n), binary_str + enddo + endif + endif + ! end WHL mod + + end subroutine cism_reprosum_calc - end subroutine cism_reprosum_calc ! !======================================================================== ! - subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & + subroutine cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, & arr_max_shift, arr_gmax_exp, max_levels, & - max_level, skip_field, validate, recompute, & - omp_nthreads, mpi_comm ) -!---------------------------------------------------------------------- + max_level, extra_levels, skip_field, & + validate, recompute, omp_nthreads, mpi_comm ) +!------------------------------------------------------------------------ ! ! Purpose: -! Compute the global sum of each field in "arr" using the indicated +! Compute the global sum of each field in 'arr' using the indicated ! communicator with a reproducible yet scalable implementation based -! on a fixed point algorithm. The accuracy of the fixed point algorithm -! is controlled by the number of "levels" of integer expansion, the -! maximum value of which is specified by max_level. +! on first converting each floating point summand into an equivalent +! representation using a vector of integers, summing the integer +! vectors, then converting the resulting sum back to a floating point +! representation. The accuracy of the integer vector algorithm is +! controlled by the number of 'levels' of integer expansion, the maximum +! value of which is specified by max_level. ! -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Arguments ! 
@@ -972,6 +1246,10 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & ! of integer expansion integer, intent(in) :: max_level ! maximum value in ! max_levels + integer, intent(in) :: extra_levels ! number of extra levels + ! needed to guarantee that + ! sum over threads or tasks + ! does not cause overflow integer, intent(in) :: omp_nthreads ! number of OpenMP threads integer, intent(in) :: mpi_comm ! MPI subcommunicator @@ -996,25 +1274,27 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & ! ! Local workspace ! - integer, parameter :: max_jlevel = & - 1 + (digits(0_i8)/digits(0.0_r8)) + integer, parameter :: max_svlevel_factor = & + 1 + (digits(1_i8)/digits(1.0_r8)) - integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads) + integer(i8) :: i8_arr_tlsum_level(-(extra_levels-1):max_level,nflds,omp_nthreads) ! integer vector representing local ! sum (per thread, per field) - integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds) + integer(i8) :: i8_arr_lsum_level((max_level+extra_levels+2)*nflds) ! integer vector representing local ! sum integer(i8) :: i8_arr_level ! integer part of summand for current ! expansion level - integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds) + integer(i8) :: i8_arr_gsum_level((max_level+extra_levels+2)*nflds) ! integer vector representing global ! sum - integer(i8) :: IX_8 ! integer representation of current - ! jlevels of X_8 ('part' of - ! i8_arr_gsum_level) + integer(i8) :: i8_gsum_level(-(extra_levels-1):max_level) + ! integer vector representing global + ! sum for one field + integer(i8) :: IX_8 ! integer representation of r8 value integer(i8) :: i8_sign ! sign global sum - integer(i8) :: i8_radix ! radix for i8 variables + integer(i8) :: i8_radix ! radix for i8 variables (and r8 + ! variables by earlier if-test) integer :: max_error(nflds,omp_nthreads) ! accurate upper bound on data? 
@@ -1024,7 +1304,7 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads) ! range of summand indices for each ! OpenMP thread - integer :: ifld, isum, ithread + integer :: ifld, isum, ithread, jlevel ! loop variables integer :: arr_exp ! exponent of summand integer :: arr_shift ! exponent used to generate integer @@ -1035,14 +1315,31 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & ! expansion of current ifld integer :: voffset ! modification to offset used to ! include validation metrics + integer :: min_level ! index of minimum levels (including + ! extra levels) for i8_arr_tlsum_level integer :: ioffset ! offset(ifld) - integer :: jlevel ! number of floating point 'pieces' - ! extracted from a given i8 integer + integer :: svlevel ! number of summands in summand_vector integer :: ierr ! MPI error return - integer :: LX(max_jlevel) ! exponent of X_8 (see below) + integer :: LX ! exponent of X_8 (see below) integer :: veclth ! total length of i8_arr_lsum_level - integer :: sum_digits ! lower bound on number of significant - ! in integer expansion of sum + integer :: i8_digit_count ! number of digits in integer + ! expansion of sum + integer :: i8_begin_level ! level starting from in + ! creating next 'exactly representable' + ! floating point value from modified + ! integer expansion of the sum + integer :: i8_trunc_level ! level at which the number of digits in + ! the modified integer expansion of the + ! sum exceeds the number of representable + ! digits in the floating point sum + integer :: i8_trunc_loc ! location of last digit at i8_trunc_level + ! in the modified integer expansion of the + ! sum that is representable in the floating + ! point sum + integer(i8) :: i8_trunc_level_rem + ! truncated digits at i8_trunc_level + ! in the modified integer expansion + ! of the sum integer :: curr_exp ! exponent of partial sum during ! 
reconstruction from integer vector integer :: corr_exp ! exponent of current summand in @@ -1051,35 +1348,38 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & real(r8) :: arr_frac ! fraction of summand real(r8) :: arr_remainder ! part of summand remaining after ! current level of integer expansion - real(r8) :: X_8(max_jlevel) ! r8 vector representation of current + real(r8) :: X_8 ! r8 representation of current ! i8_arr_gsum_level - real(r8) :: RX_8 ! r8 representation of difference - ! between current i8_arr_gsum_level - ! and current jlevels of X_8 - ! (== IX_8). Also used in final - ! scaling step - - logical :: first ! flag used to indicate that just - ! beginning reconstruction of sum - ! from integer vector - - !WHL - debug - integer :: mypid, k - call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) -! -!----------------------------------------------------------------------- + real(r8) :: RX_8 ! r8 representation of (other) + ! integers used in calculation. + real(r8) :: summand_vector((max_level+extra_levels)*max_svlevel_factor) + ! vector of r8 values generated from + ! integer vector representation to be + ! summed to generate global sum + + logical :: first_stepd_iteration + ! flag used to indicate whether first + ! time through process of converting + ! vector of integers into a floating + ! point value, as it requires + ! special logic +! +!------------------------------------------------------------------------ ! Save radix of i8 variables in an i8 variable i8_radix = radix(IX_8) ! If validating upper bounds, reserve space for validation metrics -! In both cases, reserve an extra level for overflow from the top level +! In both cases, reserve extra levels for overflows from the top level if (validate) then - voffset = 3 + voffset = extra_levels + 2 else - voffset = 1 + voffset = extra_levels endif - ! compute offsets for each field +! 
For convenience, define minimum level index for i8_arr_tlsum_level + min_level = -(extra_levels-1) + +! Compute offsets for each field offset(1) = voffset do ifld=2,nflds offset(ifld) = offset(ifld-1) & @@ -1087,29 +1387,23 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & enddo veclth = offset(nflds) + max_levels(nflds) -! split summand index range over OpenMP threads - call split_indices(dsummands, omp_nthreads, isum_beg, isum_end) +! Split summand index range over OpenMP threads + call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) -! convert local summands to vector of integers and sum +! Convert local summands to vector of integers and sum ! (Using scale instead of set_exponent because arr_remainder may not be -! "normal" after level 1 calculation) +! 'normal' after level 1 calculation) i8_arr_lsum_level(:) = 0_i8 - !WHLmod - initialized the gsum also. - ! This avoids having some garbage (lots of nonzero values) in the vector returned from mpi_reduce_sum. - ! (Though I don't think the nonzero values do any harm - i8_arr_gsum_level(:) = 0_i8 - !end WHLmod - !$omp parallel do & !$omp default(shared) & !$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, & !$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8) do ithread=1,omp_nthreads -#ifdef TIMING - call t_startf('repro_sum_loopb') +#ifndef EAMXX_STANDALONE + call t_startf('repro_sum_loopb') #endif - do ifld=1,nflds + do ifld=1,nflds ioffset = offset(ifld) max_error(ifld,ithread) = 0 @@ -1119,25 +1413,24 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & if (skip_field(ifld)) cycle do isum=isum_beg(ithread),isum_end(ithread) - arr_remainder = 0.0_r8 - if (arr(isum,ifld) .ne. 0.0_r8) then + if (arr(isum,ifld) /= 0.0_r8) then arr_exp = exponent(arr(isum,ifld)) arr_frac = fraction(arr(isum,ifld)) -! test that global maximum upper bound is an upper bound +! 
Test that global maximum upper bound is an upper bound if (arr_exp > arr_gmax_exp(ifld)) then max_error(ifld,ithread) = 1 exit endif -! calculate first shift +! Calculate first shift arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) -! determine first (probably) nonzero level (assuming initial fraction is -! 'normal' - algorithm still works if this is not true) -! NOTE: this is critical; scale will set to zero if min exponent is too small. +! Determine first (probably) nonzero level (assuming initial fraction is +! 'normal' - algorithm still works if this is not true) +! NOTE: this is critical; scale will set to zero if min exponent is too small. if (arr_shift < 1) then ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) @@ -1150,67 +1443,118 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & ilevel = 1 endif - if (ilevel .le. max_levels(ifld)) then -! apply first shift/truncate, add it to the relevant running + if (ilevel <= max_levels(ifld)) then +! Apply first shift/truncate, add it to the relevant running ! sum, and calculate the remainder. arr_remainder = scale(arr_frac,arr_shift) i8_arr_level = int(arr_remainder,i8) i8_arr_tlsum_level(ilevel,ifld,ithread) = & i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level arr_remainder = arr_remainder - i8_arr_level - -! while the remainder is non-zero, continue to shift, truncate, + +! While the remainder is non-zero, continue to shift, truncate, ! sum, and calculate new remainder - do while ((arr_remainder .ne. 0.0_r8) & + do while ((arr_remainder /= 0.0_r8) & .and. (ilevel < max_levels(ifld))) ilevel = ilevel + 1 arr_remainder = scale(arr_remainder,arr_max_shift) i8_arr_level = int(arr_remainder,i8) i8_arr_tlsum_level(ilevel,ifld,ithread) = & - i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level - + i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level arr_remainder = arr_remainder - i8_arr_level - - enddo ! 
arr_remainder /= 0.0_r8 + enddo endif - endif ! arr(isum,ifld) /= 0.0_r8 + endif - if (arr_remainder .ne. 0.0_r8) then + if (arr_remainder /= 0.0_r8) then not_exact(ifld,ithread) = 1 endif - enddo ! isum - -! postprocess integer vector to eliminate potential for overlap in the following -! sums over threads and processes: if value larger than or equal to -! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in -! vector, resulting in nonoverlapping ranges for each component. Note that -! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums -! over threads and processes do not overflow for ilevel==1. - do ilevel=max_levels(ifld),1,-1 - RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8 = int(scale(RX_8,-arr_max_shift),i8) - if (IX_8 .ne. 0_i8) then + enddo +! Postprocess integer vector to eliminate possibility of overflow +! during subsequent sum over threads and tasks, as per earlier +! comment on logic behind definition of max_nsummands. If value at a +! given level is larger than or equal to +! (radix(1_i8)**arr_max_shift), subtract this 'overlap' from the +! current value and add it (appropriately shifted) to the value at +! the next smaller level in the vector. +! (a) As described earlier, prior to this postprocessing the integer +! components are each strictly less than +! radix(1_i8)**(digits(1_i8) - 1) in absolute value. So, after +! shifting, the absolute value of the amount added to level +! max_levels(ifld)-1 from level max_levels(ifld) is less than +! radix(1_i8)**(digits(1_i8) - 1 - arr_max_shift) with the +! resulting sum, in absolute value, being less than +! (radix(1_i8)**(digits(1_i8) - 1))*(1 + radix(1_i8)**(-arr_max_shift)). +! Any overlap from this component is then added to the level +! max_levels(ifld)-2, etc., with resulting intermediate sums, in +! absolute value, for levels 1 to max_levels(ifld) being bounded +! from above by +! 
(radix(1_i8)**(digits(1_i8) - 1))*sum{i=0,inf}(radix(1_i8)**(-i*arr_max_shift)). +! Since radix(1_i8) >= 2 and arr_max_shift is also required to be +! >= 2 (otherwise the code exits with an error) this is less than +! or equal to +! (radix(1_i8)**(digits(1_i8) - 1))*sum{i=0,inf}(2**(-2i)), +! or +! (radix(1_i8)**(digits(1_i8) - 1))*(4/3). +! In summary, this shows that no absolute value generated during +! this process will exceed the maximum value representable in i8, +! i.e. (radix(1_i8)**(digits(1_i8)) - 1), as long as +! digits(1_i8) >= 2. +! (b) 'ilevel==0,...,-(extra_levels-1)' correspond to extra levels +! used to continue the above process until values at all levels +! are less than radix(1_i8)**arr_max_shift in absolute value +! (except level -(extra_levels-1), as described below). The +! result of shifting the overlap from level 1 to level 0, which +! is initially zero, is bounded in absolute value by +! (radix(1_i8)**(digits(1_i8) - 1 - arr_max_shift))*(4/3). +! After removing any overlap from level 0, the upper bound for +! level -1, which is also initially zero, is +! (radix(1_i8)**(digits(1_i8) - 1 - 2*arr_max_shift))*(4/3). +! Continuing the process, when get to level -(extra_levels-1), +! the upper bound is +! (radix(1_i8)**(digits(1_i8) - 1 - extra_levels*arr_max_shift))*(4/3). +! If we define +! extra_levels = ceiling[(digits(1_i8) - 1)/arr_max_shift - 1] +! then the upper bound is +! (radix(1_i8)**(arr_max_shift))*(4/3). +! Setting +! extra_levels = (digits(1_i8) - 1)/arr_max_shift +! is then a slightly conservative estimate that achieves the same +! upper bound. While the above upper bound at level +! -(extra_levels-1)is a factor of (4/3) larger than the target +! radix(1_i8)**arr_max_shift, it is still small enough so that +! the sum over threads and tasks, bounded from above in absolute +! value by +! (radix(1_i8)**(digits(1_i8) - 1))*(4/3), +! will not cause an overflow at level -(extra_levels-1) as long as +! digits(1_i8) >= 2. 
+ do ilevel=max_levels(ifld),min_level+1,-1 + if (abs(i8_arr_tlsum_level(ilevel,ifld,ithread)) >= & + (i8_radix**arr_max_shift)) then + + IX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread) & + / (i8_radix**arr_max_shift) i8_arr_tlsum_level(ilevel-1,ifld,ithread) = & i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8 + IX_8 = IX_8*(i8_radix**arr_max_shift) i8_arr_tlsum_level(ilevel,ifld,ithread) = & i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8 endif enddo enddo -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf('repro_sum_loopb') #endif - enddo ! ithread + enddo -! sum contributions from different threads +! Sum contributions from different threads do ifld=1,nflds - ioffset = offset(ifld) do ithread = 1,omp_nthreads - do ilevel = 0,max_levels(ifld) + do ilevel = min_level,max_levels(ifld) i8_arr_lsum_level(ioffset+ilevel) = & i8_arr_lsum_level(ioffset+ilevel) & + i8_arr_tlsum_level(ilevel,ifld,ithread) @@ -1218,7 +1562,7 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & enddo enddo -! record if upper bound was inaccurate or if level expansion stopped +! Record if upper bound was inaccurate or if level expansion stopped ! before full accuracy was achieved if (validate) then do ifld=1,nflds @@ -1228,188 +1572,396 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & enddo endif -! sum integer vector element-wise +! Sum integer vector element-wise #if ( defined noI8 ) ! Workaround for when shr_kind_i8 is not supported. 
-#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_startf("repro_sum_allr_i4") #endif call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr) - -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("repro_sum_allr_i4") #endif #else -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_startf("repro_sum_allr_i8") #endif - call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr) - -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("repro_sum_allr_i8") #endif #endif +#ifndef EAMXX_STANDALONE + call t_startf('repro_sum_finalsum') +#endif ! Construct global sum from integer vector representation: ! 1) arr_max_shift is the shift applied to fraction(arr_gmax) . -! When shifting back, need to "add back in" true arr_gmax exponent. This was -! removed implicitly by working only with the fraction . -! 2) want to add levels into sum in reverse order (smallest to largest). However, -! even this can generate floating point rounding errors if signs of integers -! alternate. To avoid this, do some arithmetic with integer vectors so that all -! components have the same sign. This should keep relative difference between -! using different integer sizes (e.g. i8 and i4) to machine epsilon -! 3) assignment to X_8 will usually lose accuracy since maximum integer -! size is greater than the max number of 'digits' in r8 value (if xmax_nsummands -! correction not very large). Calculate remainder and add in first (since -! smaller). One correction is sufficient for r8 (53 digits) and i8 (63 digits). -! For r4 (24 digits) may need to correct twice. Code is written in a general -! fashion, to work no matter how many corrections are necessary (assuming -! max_jlevel parameter calculation is correct). +! When shifting back, need to 'add back in' the true arr_gmax exponent. +! This was removed implicitly by working only with the fraction. +! 
2) To avoid the possibility of catastrophic cancellation, and +! an unacceptable floating point rounding error, can do some arithmetic +! with the integer vector so that all components have the same sign. +! 3) If convert each integer in the integer vector to a floating +! point value and then add these together, smallest to largest, to +! calculate the final sum, there may be roundoff error in the least +! significant digit. This error will be sensitive to the particular +! floating values generated from the integer vector, and so will be +! sensitive to the number of levels in the vector and the implicit +! exponent associated with each level. So this approach is not +! guaranteed to be reproducible with respect to a change in the +! number of MPI tasks and OpenMP threads (as this changes the +! definition of max_nsummands, and thus also arr_max_shift). It is +! also not guaranteed to be reproducible with respect to changing +! the integer size, e.g. from i8 to i4, as this also changes +! arr_max_shift. However, can eliminate this potential loss of +! reproducibility by taking the following steps. +! a) Manipulate the integer vector so that +! i) the component values do not 'overlap', that is, the value +! represented by a component is strictly less than the value +! represented by the least significant digit in the previous +! component, and +! ii) all components are positive (saving the sign to be restored +! to the final result). +! b) Identify the digit in the resulting integer vector that is the +! last representable in the floating point representation, then +! truncate the vector at this point, i.e., all digits of lesser +! significance in the given component and all components +! representing digits of lesser significance (call this the +! remainder). +! c) Convert each integer component in the modified integer vector +! to its corresponding floating point value and sum the +! sequence. (Order is unimportant, as explained below, but here +! 
add largest to smallest.) +! d) Repeat (b) and (c) for the remainder (recursively, as +! necessary). +! e) Sum all floating point numbers generated by step (c), smallest +! to largest. +! f) Restore the sign. +! With the manipulations in (a) and (b), the summation in (c) is +! equivalent to concatenating the digits in the mantissas for the +! component summands, so rounding is irrelevant (so far). Repeating +! this with the remainder(s) generates a sequence of 'exact' +! floating point numbers. Summing these can still generate a +! rounding error in the least significant digit in the largest +! floating point value (which is the last representable digit in the +! final result), but the floating point values being summed and +! order of summation are independent of the number of levels and +! implicit exponents, so reproducibility is ensured. +! +! Note that assignment of an i8 integer value to an r8 floating point +! variable in step (c) can lead to a loss of accuracy because the +! maximum number of digits in the i8 integer can be greater than the +! maximum number of digits representable in the r8 variable (if the +! xmax_nsummands correction is not very large). With the same sign +! and nonoverlapping properties of the integer components, these lost +! digits will also not be representable in the final sum. The process +! described above of truncating at this last representable digit, and +! then separately generating floating point value(s) for the +! remainder, takes care of this automatically. Similar reasoning +! applies to r4 floating point values with either i8 or i4 integer +! components. recompute = .false. do ifld=1,nflds arr_gsum(ifld) = 0.0_r8 ioffset = offset(ifld) + svlevel = 0 -! if validate is .true., test whether the summand upper bound -! was exceeded on any of the processes +! If validate is .true., test whether the summand upper bound +! was exceeded on any of the MPI tasks if (validate) then - if (i8_arr_gsum_level(ioffset-voffset+1) .ne. 
0_i8) then + if (i8_arr_gsum_level(ioffset-voffset+1) /= 0_i8) then recompute = .true. endif endif if (.not. recompute) then +! Copy integer vector for current field from i8_arr_gsum_level, so that +! can be modified without changing i8_arr_gsum_level. (Preserving +! i8_arr_gsum_level unchanged is not necessary, but is convenient for debugging +! and makes indexing clearer and less error prone.) + i8_gsum_level(:) = 0_i8 + do ilevel=min_level,max_levels(ifld) + i8_gsum_level(ilevel) = i8_arr_gsum_level(ioffset+ilevel) + enddo -! preprocess integer vector: -! a) if value larger than or equal to (radix(IX_8)**arr_max_shift), add this 'overlap' -! to next larger integer in vector, resulting in nonoverlapping ranges for each -! component. Note that have "ilevel-1=0" level here as described above. - do ilevel=max_levels(ifld),1,-1 - RX_8 = i8_arr_gsum_level(ioffset+ilevel) - IX_8 = int(scale(RX_8,-arr_max_shift),i8) - if (IX_8 .ne. 0_i8) then - i8_arr_gsum_level(ioffset+ilevel-1) = i8_arr_gsum_level(ioffset+ilevel-1) & - + IX_8 - IX_8 = IX_8*(i8_radix**arr_max_shift) - i8_arr_gsum_level(ioffset+ilevel) = i8_arr_gsum_level(ioffset+ilevel) & - - IX_8 - endif - enddo -! b) subtract +/- 1 from larger and add +/- 1 to smaller when necessary -! so that all vector components have the same sign (eliminating loss -! of accuracy arising from difference of large values when -! reconstructing r8 sum from integer vector) - ilevel = 0 - do while ((i8_arr_gsum_level(ioffset+ilevel) .eq. 0_i8) & - .and. (ilevel < max_levels(ifld))) - ilevel = ilevel + 1 - enddo -! - if (ilevel < max_levels(ifld)) then - if (i8_arr_gsum_level(ioffset+ilevel) > 0_i8) then - i8_sign = 1_i8 - else - i8_sign = -1_i8 - endif - do jlevel=ilevel,max_levels(ifld)-1 - if (sign(1_i8,i8_arr_gsum_level(ioffset+jlevel)) & - .ne. 
sign(1_i8,i8_arr_gsum_level(ioffset+jlevel+1))) then - i8_arr_gsum_level(ioffset+jlevel) = i8_arr_gsum_level(ioffset+jlevel) & - - i8_sign - i8_arr_gsum_level(ioffset+jlevel+1) = i8_arr_gsum_level(ioffset+jlevel+1) & - + i8_sign*(i8_radix**arr_max_shift) - endif - enddo +! Preprocess integer vector (as described in 3(a) above): +! i) If value larger than or equal to (radix(1_i8)**arr_max_shift), +! add this 'overlap' to the value at the next smaller level +! in the vector, resulting in nonoverlapping ranges for each +! component. +! +! As before, no intermediate sums for levels +! max_levels(ifld) to -(extra_levels-2), in absolute value, +! will exceed the the maximum value representable in i8, but the +! upper bound on the final sum, in absolute value, at +! level -(extra_levels-1) is now +! (radix(1_i8)**(digits(1_i8) - 1))*(4/3) + +! + sum{i=1,inf}(radix(1_i8)**(-i*arr_max_shift)) +! = (radix(1_i8)**(digits(1_i8) - 1))* +! ((4/3) + sum{i=1,inf}(radix(1_i8)**(-i*arr_max_shift)). +! which is less than or equal to +! (radix(1_i8)**(digits(1_i8) - 1))*((4/3) + (1/3)) +! or +! (radix(1_i8)**(digits(1_i8) - 1))*(5/3) +! which will not cause an overflow at level -(extra_levels-1) +! as long as digits(1_i8) >= 3. +! +! Since the exponents associated with each successive level +! differ by arr_max_shift, monotonically decreasing with +! increasing level, the absolute value at each level after this +! preprocessing is strictly less than what can be represented at +! the next lower level (larger exponent). If nonzero, it is also +! strictly greater than what is represented at the next higher +! level (smaller exponent). Note that the smallest level, +! -(extra_levels-1), does not have to be less than +! (radix(1_i8)**arr_max_shift) for this 'nonoverlap' property to +! hold. 
+ do ilevel=max_levels(ifld),min_level+1,-1 + if (abs(i8_gsum_level(ilevel)) >= & + (i8_radix**arr_max_shift)) then + + IX_8 = i8_gsum_level(ilevel) & + / (i8_radix**arr_max_shift) + i8_gsum_level(ilevel-1) = & + i8_gsum_level(ilevel-1) + IX_8 + + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_gsum_level(ilevel) = & + i8_gsum_level(ilevel) - IX_8 + endif + enddo + +! ii) Working consecutively from the first level with a nonzero value +! up to level max_levels(ifld), subtract +/- 1 from level with +! larger exponent (e.g., ilevel) and add +/- +! (i8_radix**arr_max_shift) to level with smaller exponent +! (ilevel+1), when necessary, so that the value at ilevel+1 +! has the same sign as the value at ilevel. Treat a zero value at +! ilevel+1 as always a different sign from the value at ilevel so +! that the process always makes this nonzero. (Otherwise, the +! wrong sign could be reintroduced by subtracting from a zero +! value at the next step.) When finished with the process values +! at all levels are either greater than or equal to zero or all +! are less than or equal to zero. Note that this can decrease +! (but not increase) the absolute value at level +! -(extra_levels-1) by 1. All other levels are now less than or +! equal to (radix(1_i8)**arr_max_shift) in absolute value rather +! than strictly less than. + ilevel = min_level + do while ((i8_gsum_level(ilevel) == 0_i8) & + .and. (ilevel < max_levels(ifld))) + ilevel = ilevel + 1 + enddo +! + if (i8_gsum_level(ilevel) < 0_i8) then + i8_sign = -1_i8 + else + i8_sign = 1_i8 + endif +! + if (ilevel < max_levels(ifld)) then + do jlevel=ilevel,max_levels(ifld)-1 + if ((sign(1_i8,i8_gsum_level(jlevel)) & + /= sign(1_i8,i8_gsum_level(jlevel+1)))& + .or. (i8_gsum_level(jlevel+1) == 0_i8)) then + i8_gsum_level(jlevel) = & + i8_gsum_level(jlevel) - i8_sign + i8_gsum_level(jlevel+1) = & + i8_gsum_level(jlevel+1) & + + i8_sign*(i8_radix**arr_max_shift) + endif + enddo endif -! 
start with maximum shift, and work up to larger values - arr_shift = arr_gmax_exp(ifld) & - - max_levels(ifld)*arr_max_shift - curr_exp = 0 - first = .true. - do ilevel=max_levels(ifld),0,-1 - - if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then - jlevel = 1 - -! r8 representation of higher order bits in integer - X_8(jlevel) = i8_arr_gsum_level(ioffset+ilevel) - LX(jlevel) = exponent(X_8(jlevel)) - -! calculate remainder - IX_8 = int(X_8(jlevel),i8) - RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8) - -! repeat using remainder - do while (RX_8 .ne. 0.0_r8) - jlevel = jlevel + 1 - X_8(jlevel) = RX_8 - LX(jlevel) = exponent(RX_8) - IX_8 = IX_8 + int(RX_8,i8) - RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8) - enddo +! iii) If 'same sign' is negative, then change to positive +! temporarily. + if (i8_sign < 0_i8) then + do jlevel=ilevel,max_levels(ifld) + i8_gsum_level(jlevel) = -i8_gsum_level(jlevel) + enddo + endif + +! iv) Nonoverlap property can be lost after imposition of same sign +! over components. Reintroduce this property (retaining same sign +! property). Note that carryover is never more than '1' to the +! next smaller level, so, again, no intermediate or final sums +! will exceed the maximum value representable in i8, including +! level -(extra_levels-1) as long as digits(1_i8) >= 4. + do ilevel=max_levels(ifld),min_level+1,-1 + if (abs(i8_gsum_level(ilevel)) >= & + (i8_radix**arr_max_shift)) then + + IX_8 = i8_gsum_level(ilevel)/ & + (i8_radix**arr_max_shift) + i8_gsum_level(ilevel-1) = & + i8_gsum_level(ilevel-1) + IX_8 + + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_gsum_level(ilevel) = & + i8_gsum_level(ilevel) - IX_8 + endif + enddo -! add in contributions, smaller to larger, rescaling for each -! addition to guarantee that exponent of working summand is always -! larger than minexponent - do while (jlevel > 0) - if (first) then - curr_exp = LX(jlevel) + arr_shift - arr_gsum(ifld) = fraction(X_8(jlevel)) - first = .false. +! 
Step 3(d): iterate over steps 3(b) and 3(c), truncating integer +! vector to 'fit' into a floating point value, then repeating with +! remainder + first_stepd_iteration = .true. + arr_shift = arr_gmax_exp(ifld) - (min_level)*arr_max_shift + i8_digit_count = 0 + i8_begin_level = min_level + do while (i8_begin_level <= max_levels(ifld)) + +! Determine at which level the total number of integer digits equals +! or exceeds the number of digits representable in the floating point +! sum. Then determine which digit at this level is the last +! representable in the floating point sum. Note that this location +! (i8_trunc_loc) is zero-based, i.e. smallest digit is at location +! 0. Note that the exponent is a count of the number of digits for the +! first nonzero level. All subsequent levels contribute arr_max_shift +! digits. + i8_trunc_loc = 0 + i8_trunc_level = max_levels(ifld) + do ilevel=i8_begin_level,max_levels(ifld) + if (first_stepd_iteration) then +! Special logic for first time through. Subsequent iterations treat +! leading zeroes as significant. + if (i8_digit_count == 0) then + if (i8_gsum_level(ilevel) /= 0_i8) then + X_8 = i8_gsum_level(ilevel) + LX = exponent(X_8) +! Note that even if i8_gsum_level(ilevel) is truncated when assigned +! to X_8, the exponent LX will still capture the original number of +! digits. + else + LX = 0 + endif else - corr_exp = curr_exp - (LX(jlevel) + arr_shift) - arr_gsum(ifld) = fraction(X_8(jlevel)) & - + scale(arr_gsum(ifld),corr_exp) - curr_exp = LX(jlevel) + arr_shift + LX = arr_max_shift endif - jlevel = jlevel - 1 - enddo + else +! If i8_digit_count /= 0 during the first iteration +! (ilevel == i8_begin_level), then there is a remainder left at the +! previous i8_trunc_level and LX should be set to zero for this +! iteration. + if ((ilevel == i8_begin_level) .and. 
(i8_digit_count /= 0)) then + LX = 0 + else + LX = arr_max_shift + endif + endif + if (i8_digit_count + LX >= digits(1.0_r8)) then + i8_trunc_level = ilevel + i8_trunc_loc = (i8_digit_count + LX) - digits(1.0_r8) + exit + else + i8_digit_count = i8_digit_count + LX + endif + enddo + first_stepd_iteration = .false. + +! Truncate at i8_trunc_loc as needed and determine what the remainder +! is. + if (i8_trunc_loc == 0) then +! No truncation is necessary, and remainder is just the components +! for the remaining levels + i8_trunc_level_rem = 0 + else +! Shift right to identify the digits to be preserved and truncate +! there + IX_8 = i8_gsum_level(i8_trunc_level)/ & + (i8_radix**i8_trunc_loc) +! Shift left to put digits in the correct location (right fill with +! zeroes) + IX_8 = IX_8*(i8_radix**i8_trunc_loc) +! Calculate local remainder + i8_trunc_level_rem = (i8_gsum_level(i8_trunc_level) - IX_8) +! Update level with the truncated value + i8_gsum_level(i8_trunc_level) = IX_8 + endif + +! Calculate floating point value corresponding to modified integer +! vector. Note that, by construction, i8 integer value will fit into +! r8 floating point value, so do not need to test for this. + svlevel = svlevel + 1 + summand_vector(svlevel) = 0.0_r8 + do ilevel=i8_begin_level,i8_trunc_level + if (i8_gsum_level(ilevel) /= 0_i8) then + +! Convert integer to floating point representation + X_8 = i8_gsum_level(ilevel) + LX = exponent(X_8) + +! Add to vector of floating point summands, scaling first if exponent +! is too small to apply directly + curr_exp = LX + arr_shift + if (curr_exp >= MINEXPONENT(1.0_r8)) then + summand_vector(svlevel) = & + summand_vector(svlevel) + set_exponent(X_8,curr_exp) + else + RX_8 = set_exponent(X_8, & + curr_exp-MINEXPONENT(1.0_r8)) + summand_vector(svlevel) = & + summand_vector(svlevel) + scale(RX_8,MINEXPONENT(1.0_r8)) + endif + + endif +! Note that same arr_shift should be used for next 'step 3(d)' +! iteration if i8_trunc_loc > 0. 
+ if ((ilevel < i8_trunc_level) .or. (i8_trunc_loc == 0)) then + arr_shift = arr_shift - arr_max_shift + endif + + enddo + + if (i8_trunc_loc == 0) then + i8_digit_count = 0 + i8_begin_level = i8_trunc_level + 1 + else + i8_digit_count = i8_trunc_loc + i8_begin_level = i8_trunc_level + i8_gsum_level(i8_trunc_level) = i8_trunc_level_rem endif - arr_shift = arr_shift + arr_max_shift enddo -! apply final exponent correction, scaling first if exponent is too small -! to apply directly - corr_exp = curr_exp + exponent(arr_gsum(ifld)) - if (corr_exp .ge. MINEXPONENT(1._r8)) then - arr_gsum(ifld) = set_exponent(arr_gsum(ifld),corr_exp) - else - RX_8 = set_exponent(arr_gsum(ifld), & - corr_exp-MINEXPONENT(1._r8)) - arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8)) - endif +! Step 3(e): sum vector of floating point values, smallest to largest + arr_gsum(ifld) = 0.0_r8 + do jlevel=svlevel,1,-1 + arr_gsum(ifld) = arr_gsum(ifld) + summand_vector(jlevel) + enddo -! if validate is .true. and some precision lost, test whether 'too much' -! was lost, due to too loose an upper bound, too stringent a limit on number -! of levels of expansion, cancellation, .... Calculated by comparing lower -! bound on number of sigificant digits with number of digits in 1.0_r8 . +! Step 3(f): restore the sign + arr_gsum(ifld) = i8_sign*arr_gsum(ifld) + +! If validate is .true. and some precision lost, test whether 'too +! much' was lost, due to too loose an upper bound, too stringent a +! limit on number of levels of expansion, cancellation, ... +! Calculated by comparing lower bound on number of significant digits +! with number of digits in 1.0_r8 . if (validate) then - if (i8_arr_gsum_level(ioffset-voffset+2) .ne. 0_i8) then - -! find first nonzero level and use exponent for this level, then assume all -! subsequent levels contribute arr_max_shift digits. - sum_digits = 0 - do ilevel=0,max_levels(ifld) - if (sum_digits .eq. 0) then - if (i8_arr_gsum_level(ioffset+ilevel) .ne. 
0_i8) then - X_8(1) = i8_arr_gsum_level(ioffset+ilevel) - LX(1) = exponent(X_8(1)) - sum_digits = LX(1) + if (i8_arr_gsum_level(ioffset-voffset+2) /= 0_i8) then + +! Find first nonzero level and use exponent for this level, then +! assume all subsequent levels contribute arr_max_shift digits. + i8_digit_count = 0 + do ilevel=min_level,max_levels(ifld) + if (i8_digit_count == 0) then + if (i8_arr_gsum_level(ioffset+ilevel) /= 0_i8) then + X_8 = i8_arr_gsum_level(ioffset+ilevel) + LX = exponent(X_8) + i8_digit_count = LX endif else - sum_digits = sum_digits + arr_max_shift + i8_digit_count = i8_digit_count + arr_max_shift endif enddo - if (sum_digits < digits(1.0_r8)) then + if (i8_digit_count < digits(1.0_r8)) then recompute = .true. endif endif @@ -1418,6 +1970,9 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & endif enddo +#ifndef EAMXX_STANDALONE + call t_stopf('repro_sum_finalsum') +#endif end subroutine cism_reprosum_int @@ -1425,28 +1980,28 @@ end subroutine cism_reprosum_int !======================================================================== ! - logical function cism_reprosum_tolExceeded (name, nflds, master, & + logical function cism_reprosum_tolExceeded(name, nflds, master, & logunit, rel_diff ) -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Purpose: ! Test whether distributed sum exceeds tolerance and print out a ! warning message. ! -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Arguments ! character(len=*), intent(in) :: name ! distributed sum identifier integer, intent(in) :: nflds ! number of fields - logical, intent(in) :: master ! process that will write + logical, intent(in) :: master ! MPI task that will write ! warning messages? integer, optional, intent(in) :: logunit! unit warning messages ! 
written to real(r8), intent(in) :: rel_diff(2,nflds) ! relative and absolute - ! differences between fixed - ! and floating point sums + ! differences between integer + ! vector and floating point sums ! ! Local workspace @@ -1460,7 +2015,7 @@ logical function cism_reprosum_tolExceeded (name, nflds, master, & real(r8) :: max_abs_diff ! maximum absolute difference integer :: max_abs_diff_idx ! field index for max. abs. diff. ! -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! cism_reprosum_tolExceeded = .false. if (cism_reprosum_reldiffmax < 0.0_r8) return @@ -1472,7 +2027,7 @@ logical function cism_reprosum_tolExceeded (name, nflds, master, & llogunit = iulog endif - ! check that "fast" reproducible sum is accurate enough. +! Check that 'fast' reproducible sum is accurate enough. exceeds_limit = 0 max_rel_diff = 0.0_r8 max_abs_diff = 0.0_r8 @@ -1494,7 +2049,7 @@ logical function cism_reprosum_tolExceeded (name, nflds, master, & if (exceeds_limit > 0) then if (master) then write(llogunit,*) trim(name), & - ': difference in fixed and floating point sums ', & + ': difference between integer vector and floating point sums ', & ' exceeds tolerance in ', exceeds_limit, & ' fields.' write(llogunit,*) ' Maximum relative diff: (rel)', & @@ -1507,23 +2062,22 @@ logical function cism_reprosum_tolExceeded (name, nflds, master, & cism_reprosum_tolExceeded = .true. endif - - end function cism_reprosum_tolExceeded + end function cism_reprosum_tolExceeded ! !======================================================================== ! - subroutine cism_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & + subroutine cism_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & nflds, mpi_comm ) -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Purpose: -! 
Compute the global sum of each field in "arr" using the indicated +! Compute the global sum of each field in 'arr' using the indicated ! communicator with a reproducible yet scalable implementation based ! on He and Ding's implementation of the double-double algorithm. ! -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Arguments ! @@ -1555,12 +2109,10 @@ subroutine cism_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & logical, save :: first_time = .true. ! -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! - !WHL - This is a C routine in the CESM shared code: shr_reprosumx86.c - ! Commented out the call. - ! Not sure whether this breaks the algorithm. -!! call cism_reprosumx86_fix_start (old_cw) + !WHL - commented out +! call cism_reprosumx86_fix_start (old_cw) if (first_time) then call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr) @@ -1572,8 +2124,7 @@ subroutine cism_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & do isum=1,nsummands - ! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s - ! trick. +! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s trick. t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld)) e = t1 - arr(isum,ifld) t2 = ((real(arr_lsum_dd(ifld)) - e) & @@ -1585,33 +2136,35 @@ subroutine cism_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & enddo enddo -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_startf("repro_sum_allr_c16") #endif call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, & MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr) -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("repro_sum_allr_c16") #endif + do ifld=1,nflds arr_gsum(ifld) = real(arr_gsum_dd(ifld)) enddo - !WHL - commented out; see comment above -!! call cism_reprosumx86_fix_end (old_cw) + !WHL - commented out +! 
call cism_reprosumx86_fix_end (old_cw) end subroutine cism_reprosum_ddpdd ! -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! subroutine DDPDD (dda, ddb, len, itype) -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Purpose: ! Modification of original codes written by David H. Bailey ! This subroutine computes ddb(i) = dda(i)+ddb(i) ! -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Arguments ! @@ -1625,31 +2178,31 @@ subroutine DDPDD (dda, ddb, len, itype) real(r8) e, t1, t2 integer i ! -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! do i = 1, len -! Compute dda + ddb using Knuth's trick. + +! Compute dda + ddb using Knuth's trick. t1 = real(dda(i)) + real(ddb(i)) e = t1 - real(dda(i)) t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) & + aimag(dda(i)) + aimag(ddb(i)) -! The result is t1 + t2, after normalization. +! The result is t1 + t2, after normalization. ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) enddo - end subroutine DDPDD ! -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! subroutine split_indices(total,num_pieces,ibeg,iend) -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Purpose: ! Split range into 'num_pieces' ! -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Arguments ! 
@@ -1661,7 +2214,7 @@ subroutine split_indices(total,num_pieces,ibeg,iend) ! integer :: itmp1, itmp2, ioffset, i ! -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! itmp1 = total/num_pieces itmp2 = mod(total,num_pieces) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 73b45f52..3b3ec31b 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -8376,11 +8376,11 @@ subroutine basal_sliding_bc_2d(nx, ny, & do j = 1, ny-1 do i = 1, nx-1 if (active_vertex(i,j)) then - Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) - Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) -!! increment = (dx*dy/vol0) * beta(i,j) -!! Auu(i,j,m) = Auu(i,j,m) + increment -!! Avv(i,j,m) = Avv(i,j,m) + increment +! Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) +! Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) + increment = (dx*dy/vol0) * beta(i,j) + Auu(i,j,m) = Auu(i,j,m) + increment + Avv(i,j,m) = Avv(i,j,m) + increment endif ! active_vertex enddo ! i enddo ! j From 3be7b25169439f164222d9d00ccc3d2141726474 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Thu, 8 Jan 2026 14:23:26 -0700 Subject: [PATCH 07/21] Reprosum fixes for glaciers This commit modifies the calculation of the glacier boundary mask to support reproducible sums. Previously, this mask (which enforces high values of Cp at vertices that border two different glaciers) could be computed incorrectly near block boundaries. Other changes: * I added an optional argument, zero_global_boundary_no_ice_bc, to subroutine parallel_halo_real8_2d. By default, this subroutine zeroes out the values of scalars in cells adjacent to the global boundary. (The reasons are a bit complicated; they are related to the issue of communicating scalar values to the halo cells of diagonal block neighbors when some blocks are inactive.) Passing in '.false.' 
for this argument preserves the original value in these cells. For topg and relx, it is better to keep the correct physical values than to set them to zero. * The call to glissade_glacier_init now takes place after thck, topg, lsrf, and usrf have been computed in halo cells. This prevents erroneous nonzero thickness targets near block boundaries. * I added a subroutine called parallel_halo_extrapolate_1d. This subroutine is used to set the correct values of (x0,y0) and (x1,y1) in halo cells. Note: CISM requires (x1,y1) in the input file. If (x0,y0) are not present, they are computed from (x1,y1). * If glacier%length_scale_factor /= 1, then (x0,y0) and (x1,y1) are now scaled at initialization, along with dew and dns. * Removed circular dependencies by deleting 'use cism_parallel' and 'use glimmer_log' from the profile module * Extended the write_array_to_file subroutines to allow either floating-point or binary-string output * Eliminated some glacier diagnostic output for the case that the diagnostic cell belongs to no glacier --- libglide/glide_diagnostics.F90 | 73 +++++++------ libglide/glide_nc_custom.F90 | 7 +- libglimmer/cism_reprosum_mod.F90 | 17 +-- libglimmer/parallel_mpi.F90 | 136 +++++++++++++++++------ libglimmer/profile.F90 | 10 +- libglissade/glissade.F90 | 155 ++++++++++++++------------- libglissade/glissade_glacier.F90 | 61 +++++------ libglissade/glissade_inversion.F90 | 2 +- libglissade/glissade_utils.F90 | 66 +++++++++--- libglissade/glissade_velo_higher.F90 | 35 +++--- 10 files changed, 330 insertions(+), 232 deletions(-) diff --git a/libglide/glide_diagnostics.F90 b/libglide/glide_diagnostics.F90 index 8aec8052..a4571ed1 100644 --- a/libglide/glide_diagnostics.F90 +++ b/libglide/glide_diagnostics.F90 @@ -1266,48 +1266,57 @@ subroutine glide_write_diag (model, time) ng = model%glacier%ngdiag - write(message,'(a35,i14)') 'Diagnostic glacier index (RGI) ', & - model%glacier%cism_to_rgi_glacier_id(ng) - call write_log(trim(message), type = 
GM_DIAGNOSTIC) + if (ng > 0) then - write(message,'(a35,i14)') 'Diagnostic glacier index (CISM) ', ng - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,i14)') 'Diagnostic glacier index (RGI) ', & + model%glacier%cism_to_rgi_glacier_id(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier area_init (km^2) ', & - model%glacier%area_init(ng) / 1.0d6 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,i14)') 'Diagnostic glacier index (CISM) ', ng + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier area (km^2) ', & - model%glacier%area(ng) / 1.0d6 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier area_init (km^2) ', & + model%glacier%area_init(ng) / 1.0d6 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier area_init_extent (km^2) ', & - model%glacier%area_init_extent(ng) / 1.0d6 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier area (km^2) ', & + model%glacier%area(ng) / 1.0d6 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier volume (km^3) ', & - model%glacier%volume(ng) / 1.0d9 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier area_init_extent (km^2) ', & + model%glacier%area_init_extent(ng) / 1.0d6 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier volume_init (km^3) ', & - model%glacier%volume_init(ng) / 1.0d9 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier volume (km^3) ', & + model%glacier%volume(ng) / 1.0d9 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier volume_init_extent (km^3) ', & - model%glacier%volume_init_extent(ng) / 1.0d9 - call write_log(trim(message), type = GM_DIAGNOSTIC) 
+ write(message,'(a35,f14.6)') 'Glacier volume_init (km^3) ', & + model%glacier%volume_init(ng) / 1.0d9 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'mu_star (mm/yr w.e./deg C) ', & - model%glacier%mu_star(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier volume_init_extent (km^3) ', & + model%glacier%volume_init_extent(ng) / 1.0d9 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'alpha_snow ', & - model%glacier%alpha_snow(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'mu_star (mm/yr w.e./deg C) ', & + model%glacier%mu_star(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'beta_artm (deg C) ', & - model%glacier%beta_artm(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'alpha_snow ', & + model%glacier%alpha_snow(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) + + write(message,'(a35,f14.6)') 'beta_artm (deg C) ', & + model%glacier%beta_artm(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) + + else ! glacier ID = 0 + + write(message,'(a35,i14)') 'Diagnostic glacier index (CISM) ', ng + call write_log(trim(message), type = GM_DIAGNOSTIC) + + endif call write_log(' ') diff --git a/libglide/glide_nc_custom.F90 b/libglide/glide_nc_custom.F90 index a672203a..bd464a4f 100644 --- a/libglide/glide_nc_custom.F90 +++ b/libglide/glide_nc_custom.F90 @@ -116,7 +116,8 @@ subroutine glide_nc_filldvars(outfile, model) ! This does not work, in general, when computing on active blocks only, because the local versions ! of model%general%x1 may not span the global domain. ! The revised code calls parallel_put_var to write (x0,y0) and (x1,y1) to the output file. - ! This assumes that x1_global and y1_global were read from the input file and saved in a global array. + ! 
This assumes that x1_global and y1_global were read from the input file and saved in a global array + ! (e.g., in subroutine glide_io_read). status = parallel_inq_varid(NCO%id,'x1',varid) status = parallel_put_var(NCO%id,varid,model%general%x1_global) @@ -126,8 +127,8 @@ subroutine glide_nc_filldvars(outfile, model) status = parallel_put_var(NCO%id,varid,model%general%y1_global) call nc_errorhandle(__FILE__,__LINE__,status) - ! create the x0 and y0 grids from x1 and y1 - + ! create the x0 and y0 grids from x1 and y1; + ! this does not require model%general%x0_global and y0_global to have been filled status = parallel_inq_varid(NCO%id,'x0',varid) do i = 1, global_ewn-1 x0_global(i) = (model%general%x1_global(i) + model%general%x1_global(i+1)) / 2.0d0 diff --git a/libglimmer/cism_reprosum_mod.F90 b/libglimmer/cism_reprosum_mod.F90 index 0740f459..ec202273 100644 --- a/libglimmer/cism_reprosum_mod.F90 +++ b/libglimmer/cism_reprosum_mod.F90 @@ -62,7 +62,6 @@ module cism_reprosum_mod use glimmer_global, only: r8 => dp use glimmer_global, only: i8 use glimmer_paramets, only: iulog - use glimmer_utils, only: double_to_binary use cism_infnan_mod,only: cism_infnan_inf_type, assignment(=), & cism_infnan_posinf, cism_infnan_neginf, & cism_infnan_nan, & @@ -85,6 +84,7 @@ module cism_reprosum_mod !------------------------------------------------------------------------ ! Public interfaces ----------------------------------------------------- !------------------------------------------------------------------------ + !WHL - cism_reprosum_setopts is not currently called public :: & cism_reprosum_setopts, &! set runtime options cism_reprosum_calc, &! calculate distributed sum @@ -98,10 +98,7 @@ module cism_reprosum_mod real(r8), public :: cism_reprosum_reldiffmax = -1.0_r8 - !WHL mod -!! logical, parameter, public :: verbose_reprosum = .false. - logical, parameter, public :: verbose_reprosum = .true. - ! end WHL mod + logical, parameter, public :: verbose_reprosum = .false. 
!------------------------------------------------------------------------ ! Private interfaces ---------------------------------------------------- @@ -1192,20 +1189,12 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & repro_sum_stats(6) = repro_sum_stats(6) + gbl_lor_red endif - !WHL mod: optional diagnostics if (verbose_reprosum) then call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) if (mypid == 0) then - write(iulog,*) 'Exit reprosum, nflds =', nflds - write(iulog,*) ' n, arr_gsum, binary_str:' -!! do n = 1, nflds - do n = 1, min(2,nflds) - call double_to_binary(arr_gsum(n), binary_str) - write(iulog,*) n, arr_gsum(n), binary_str - enddo + write(iulog,*) 'Exit reprosum, nflds, arr_gsum =', nflds, arr_gsum endif endif - ! end WHL mod end subroutine cism_reprosum_calc diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index bab339cd..57c2b20e 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -30,15 +30,7 @@ module cism_parallel use glimmer_global, only : dp, sp use glimmer_paramets, only: iulog -!TODO - Not sure setopts is needed -!TODO - Remove coupled ifdefs and always use the CISM version? 
-!TODO - Use cism_reprosum_mod from individual functions -#ifdef CCSMCOUPLED - use shr_reprosum_mod, only: shr_reprosum_setopts, shr_reprosum_calc -#else - use cism_reprosum_mod, only: cism_reprosum_setopts, cism_reprosum_calc -#endif - use cism_reprosum_mod, only: verbose_reprosum + use cism_reprosum_mod, only: cism_reprosum_calc, verbose_reprosum implicit none @@ -332,6 +324,7 @@ module cism_parallel end interface interface parallel_halo_extrapolate + module procedure parallel_halo_extrapolate_real8_1d module procedure parallel_halo_extrapolate_integer_2d module procedure parallel_halo_extrapolate_real8_2d end interface @@ -2206,6 +2199,7 @@ function distributed_get_var_real8_1d(ncid, varid, values, parallel, start) global_values(:) = 0.0d0 distributed_get_var_real8_1d = & nf90_get_var(ncid,varid,global_values(1:myn),start) + allocate(displs(tasks+1)) allocate(sendcounts(tasks)) sendcounts(:) = bounds(2,:)-bounds(1,:)+1 @@ -7598,26 +7592,35 @@ subroutine parallel_halo_real4_2d(a, parallel) end subroutine parallel_halo_real4_2d - subroutine parallel_halo_real8_2d(a, parallel, periodic_offset_ew, periodic_offset_ns) + subroutine parallel_halo_real8_2d(a, parallel, & + periodic_offset_ew, periodic_offset_ns, zero_global_boundary_no_ice_bc) - !WHL - added optional arguments for periodic offsets, to support ismip-hom test cases + ! Added optional arguments for periodic offsets, to support ismip-hom test cases + ! Also added an optional argument related to the no_ice BCs use mpi_mod implicit none real(dp),dimension(:,:) :: a type(parallel_type) :: parallel + real(dp), intent(in), optional :: & - periodic_offset_ew, &! offset halo values by this amount - ! if positive, the offset is positive for W halo, negative for E halo - periodic_offset_ns ! offset halo values by this amount - ! if positive, the offset is positive for S halo, negative for N halo + periodic_offset_ew, &! offset halo values by this amount + ! 
if positive, the offset is positive for W halo, negative for E halo + periodic_offset_ns ! offset halo values by this amount + ! if positive, the offset is positive for S halo, negative for N halo + logical, intent(in), optional :: & + zero_global_boundary_no_ice_bc ! if true, then zero out values in grid cells adjacent + ! to the global boundary when using no_ice BCs + integer :: erequest,ierror,nrequest,srequest,wrequest real(dp),dimension(lhalo, parallel%local_nsn-lhalo-uhalo) :: esend,wrecv real(dp),dimension(uhalo, parallel%local_nsn-lhalo-uhalo) :: erecv,wsend real(dp),dimension(parallel%local_ewn, lhalo) :: nsend,srecv real(dp),dimension(parallel%local_ewn, uhalo) :: nrecv,ssend + logical :: zero_global_boundary_no_ice ! local version of zero_global_boundary_no_ice_bc + ! begin associate( & outflow_bc => parallel%outflow_bc, & @@ -7634,6 +7637,12 @@ subroutine parallel_halo_real8_2d(a, parallel, periodic_offset_ew, periodic_offs northwest_corner => parallel%northwest_corner & ) + if (present(zero_global_boundary_no_ice_bc)) then + zero_global_boundary_no_ice = zero_global_boundary_no_ice_bc + else + zero_global_boundary_no_ice = .true. + endif + ! staggered grid if (size(a,1)==local_ewn-1.and.size(a,2)==local_nsn-1) return @@ -7723,31 +7732,53 @@ subroutine parallel_halo_real8_2d(a, parallel, periodic_offset_ew, periodic_offs elseif (no_ice_bc) then - ! Set values to zero in cells adjacent to the global boundary; - ! includes halo cells and one row of locally owned cells + if (zero_global_boundary_no_ice) then - if (this_rank >= east) then ! at east edge of global domain - a(local_ewn-uhalo:,:) = 0.d0 - endif + ! Set values to zero in cells adjacent to the global boundary; + ! includes halo cells and one row of locally owned cells. - if (this_rank <= west) then ! at west edge of global domain - a(:lhalo+1,:) = 0.d0 - endif + if (this_rank >= east) then ! at east edge of global domain + a(local_ewn-uhalo:,:) = 0.d0 + endif - if (this_rank >= north) then ! 
at north edge of global domain - a(:,local_nsn-uhalo:) = 0.d0 - endif + if (this_rank <= west) then ! at west edge of global domain + a(:lhalo+1,:) = 0.d0 + endif - if (this_rank <= south) then ! at south edge of global domain - a(:,:lhalo+1) = 0.d0 - endif + if (this_rank >= north) then ! at north edge of global domain + a(:,local_nsn-uhalo:) = 0.d0 + endif - ! Some interior blocks have a single cell at a corner of the global boundary. - ! Set values in corner cells to zero, along with adjacent halo cells. - if (southwest_corner) a(:lhalo+1,:lhalo+1) = 0.d0 - if (southeast_corner) a(local_ewn-lhalo:,:lhalo+1) = 0.d0 - if (northeast_corner) a(local_ewn-lhalo:,local_nsn-lhalo:) = 0.d0 - if (northwest_corner) a(:lhalo+1,local_nsn-lhalo:) = 0.d0 + if (this_rank <= south) then ! at south edge of global domain + a(:,:lhalo+1) = 0.d0 + endif + + ! Some interior blocks have a single cell at a corner of the global boundary. + ! Set values in corner cells to zero, along with adjacent halo cells. + if (southwest_corner) a(:lhalo+1,:lhalo+1) = 0.d0 + if (southeast_corner) a(local_ewn-lhalo:,:lhalo+1) = 0.d0 + if (northeast_corner) a(local_ewn-lhalo:,local_nsn-lhalo:) = 0.d0 + if (northwest_corner) a(:lhalo+1,local_nsn-lhalo:) = 0.d0 + + else ! set values to zero in halo cells but not in locally owned cells + + if (this_rank >= east) then ! at east edge of global domain + a(local_ewn-uhalo+1:,:) = 0.d0 + endif + + if (this_rank <= west) then ! at west edge of global domain + a(:lhalo,:) = 0.d0 + endif + + if (this_rank >= north) then ! at north edge of global domain + a(:,local_nsn-uhalo+1:) = 0.d0 + endif + + if (this_rank <= south) then ! at south edge of global domain + a(:,:lhalo) = 0.d0 + endif + + endif ! zero_global_boundary_no_ice endif ! outflow or no_ice bc @@ -8003,6 +8034,41 @@ subroutine parallel_halo_real8_4d(a, parallel) end subroutine parallel_halo_real8_4d +!======================================================================= + + ! 
subroutines for 1D halo updates + + subroutine parallel_halo_extrapolate_real8_1d(a, parallel, interval_in) + + !Note: Extrapolate a 1D real8 variable into halo cells to the east and west. + ! Currently used only to compute halo values for grid cell coordinates. + + use mpi_mod + implicit none + real(dp),dimension(:) :: a + type(parallel_type) :: parallel + real(dp),intent(in), optional :: & + interval_in ! uniform difference between adjacent values, e.g. grid cell size dew or dns + + integer :: i + real(dp) :: interval ! local version of interval_in + + if (present(interval_in)) then + interval = interval_in + else + interval = 0.0d0 + endif + + do i = 1, lhalo + a(i) = a(lhalo+1) - interval*(lhalo+1-i) + enddo + + do i = size(a)-uhalo+1, size(a) + a(i) = a(size(a)-uhalo) + interval*(uhalo+i-size(a)) + enddo + + end subroutine parallel_halo_extrapolate_real8_1d + !======================================================================= ! subroutines belonging to the parallel_halo_extrapolate interface diff --git a/libglimmer/profile.F90 b/libglimmer/profile.F90 index f88ee51c..92840b14 100644 --- a/libglimmer/profile.F90 +++ b/libglimmer/profile.F90 @@ -35,8 +35,6 @@ module profile #if (defined CCSMCOUPLED || defined CESMTIMERS) use perf_mod - !TODO - Add an 'only' for 'use cism_parallel'? 
- use cism_parallel #endif use glimmer_global, only: dp @@ -89,16 +87,18 @@ end subroutine profile_init !> register a new series of meassurements function profile_register(prof,msg) - use glimmer_log + use glimmer_paramets, only: iulog + use mpi implicit none type(profile_type) :: prof !< structure storing profile definitions character(len=*), intent(in) :: msg !< the message to be associated integer profile_register + integer :: ierr prof%nump = prof%nump+1 if (prof%nump > max_prof) then - call write_log('Maximum number of profiles reached',type=GM_FATAL, & - file=__FILE__,line=__LINE__) + write(iulog,*) ('Maximum number of profiles reached') + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) end if profile_register = prof%nump prof%pname(prof%nump) = trim(msg) diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index 6c020822..cfcfec39 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -94,10 +94,10 @@ subroutine glissade_initialise(model, evolve_ice) use cism_parallel, only: parallel_type, parallel_finalise, & distributed_grid, distributed_grid_active_blocks, parallel_global_edge_mask, & - parallel_halo, parallel_halo_extrapolate, parallel_reduce_max, & + parallel_halo, parallel_halo_extrapolate, & staggered_parallel_halo_extrapolate, staggered_no_penetration_mask, & parallel_create_comm_row, parallel_create_comm_col, & - parallel_is_zero, not_parallel + parallel_reduce_max, parallel_is_zero, not_parallel use glide_setup use glimmer_ncio, only: openall_in, openall_out, glimmer_nc_get_var, glimmer_nc_get_dimlength @@ -366,34 +366,47 @@ subroutine glissade_initialise(model, evolve_ice) itest = model%numerics%idiag_local jtest = model%numerics%jdiag_local - ! Check whether x0 and y0 were read in. If not, then compute them from x1 and y1. + ! Make sure the grid coordinates (x1,y1) and (x0,y0) have been read in. + ! If (x1,y1) have not been read in, then abort the run. + ! If (x0,y0) have not been read in, then compute them from (x1,y1). 
+ ! Extrapolate these coordinates to halo cells as needed. + ! Note: The extrapolation works only on a regular grid. + !TODO - Put the following code in a subroutine. + if (parallel_is_zero(model%general%x1)) then - if (main_task) write(iulog,*) 'Warning: model%general%x1 = 0' - else -! if (main_task) write(iulog,*) 'x1_global:', model%general%x1_global(:) -! if (main_task) write(iulog,*) 'y1_global:', model%general%y1_global(:) -! if (main_task) write(iulog,*) 'x1:', model%general%x1(:) -! if (main_task) write(iulog,*) 'y1:', model%general%y1(:) + call write_log('model%general%x1 = 0.0 everywhere', GM_FATAL) + else ! extrapolate x1 to halo cells + call parallel_halo_extrapolate(model%general%x1, parallel, model%numerics%dew) + endif + + if (parallel_is_zero(model%general%y1)) then + call write_log('model%general%y1 = 0.0 everywhere', GM_FATAL) + else ! extrapolate y1 to halo cells + call parallel_halo_extrapolate(model%general%y1, parallel, model%numerics%dns) endif + ! Check whether x0 and y0 were read in. If not, then compute them from x1 and y1. if (parallel_is_zero(model%general%x0)) then - if (main_task) write(iulog,*) 'Initialize x0' + if (main_task) write(iulog,*) 'x0 not read in; initialize from x1' do i = 1, model%general%ewn-1 model%general%x0(i) = 0.5d0 * (model%general%x1(i) + model%general%x1(i+1)) enddo + else + ! extrapolate x0 to halo cells + call parallel_halo_extrapolate(model%general%x0, parallel, model%numerics%dew) endif - if (main_task) write(iulog,*) 'x0:', model%general%x0(:) if (parallel_is_zero(model%general%y0)) then - if (main_task) write(iulog,*) 'Initialize y0' + if (main_task) write(iulog,*) 'y0 not read in; initialize from y1' do j = 1, model%general%nsn-1 model%general%y0(j) = 0.5d0 * (model%general%y1(j) + model%general%y1(j+1)) enddo + else + ! extrapolate y0 to halo cells + call parallel_halo_extrapolate(model%general%y0, parallel, model%numerics%dns) endif - if (main_task) write(iulog,*) 'y0:', model%general%y0(:) - ! 
Check that lat and lon fields were read in, if desired - !TODO - Use the parallel_is_nonzero function instead, here and below + ! Check that lat and lon fields were read in if (model%options%read_lat_lon) then if (parallel_is_zero(model%general%lat)) then call write_log('Failed to read latitude (lat) field from input file', GM_FATAL) @@ -612,50 +625,9 @@ subroutine glissade_initialise(model, evolve_ice) endif ! geothermal heat flux - ! If running with glaciers, then process the input glacier data - ! On start-up, this subroutine counts the glaciers. It should be called before glide_io_createall, - ! which needs to know nglacier to set up glacier output files with the right dimensions. - ! On restart, most of the required glacier arrays are in the restart file, and this subroutine - ! computes a few remaining variable. - - if (model%options%enable_glaciers) then - - ! Glaciers are run with a no-ice BC to allow removal of inactive regions. - ! This can be problematic when running in a sub-region that has glaciers along the global boundary. - ! A halo update here for 'thck' will remove ice from cells along the global boundary. - ! It is best to do this before initializing glaciers, so that ice that initially exists - ! in these cells is removed before computing the area and thickness targets. - !TODO - These calls are repeated a few lines below. Try moving them up, before the call - ! to glissade_glacier_init. I don't think it's possible to move the glissade_glacier_init call - ! down, because we need to compute nglacier before setting up output files. - - call parallel_halo(model%geometry%thck, parallel) - ! calculate the lower and upper ice surface - call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus, model%geometry%lsrf) - model%geometry%usrf = max(0.d0, model%geometry%thck + model%geometry%lsrf) - - ! Initialize glaciers - ! Note: This subroutine can return modified values of model%numerics%dew, model%numerics%dns, - ! 
and model%geometry%cell_area. - ! This is a fix to deal with the fact that actual grid cell dimensions can be different - ! from the nominal dimensions on a projected grid. - ! See comments near the top of glissade_glacier_init. - - call glissade_glacier_init(model, model%glacier) - - endif - - ! open all output files - call openall_out(model) - - ! create glide I/O variables - call glide_io_createall(model, model) - - ! initialize glissade components - ! Set some variables in halo cells ! Note: We need thck and artm in halo cells so that temperature will be initialized correctly - ! (if not read from input file). + ! (if not read from the input file). ! We do an update here for temp in case temp is read from an input file. ! If temp is computed below in glissade_init_therm (based on the value of options%temp_init), ! then the halos will receive the correct values. @@ -664,37 +636,36 @@ subroutine glissade_initialise(model, evolve_ice) call parallel_halo(model%climate%artm, parallel) call parallel_halo(model%temper%temp, parallel) call parallel_halo(model%temper%tempunstag, parallel) - - ! calculate the lower and upper ice surface - call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus, model%geometry%lsrf) - model%geometry%usrf = max(0.d0, model%geometry%thck + model%geometry%lsrf) + if (model%options%whichtemp == TEMP_ENTHALPY) & + call parallel_halo(model%temper%waterfrac, parallel) ! Note: For outflow BCs, most fields (thck, usrf, temp, etc.) are set to zero in the global halo, ! to create ice-free conditions. However, we might not want to set topg = 0 in the global halo, ! because then the global halo will be interpreted as ice-free land, whereas we may prefer to ! treat it as ice-free ocean. For this reason, topg is extrapolated from adjacent cells. - ! Similarly, for no_ice BCs, we want to zero out ice state variables adjacent to the global boundary, + ! 
For no_ice BCs, we want to zero out ice state variables adjacent to the global boundary, ! but we do not want to zero out the topography. - ! Note: For periodic BCs, there is an optional argument periodic_offset_ew for topg. - ! This is for ismip-hom experiments. A positive EW offset means that + ! For periodic BCs, there are optional periodic_offset arguments for topg. + ! These are for ismip-hom experiments or similar geometries. A positive EW offset means that ! the topography in west halo cells will be raised, and the topography ! in east halo cells will be lowered. This ensures that the topography ! and upper surface elevation are continuous between halo cells ! and locally owned cells at the edge of the global domain. - ! In other cases (anything but ismip-hom), periodic_offset_ew = periodic_offset_ns = 0, - ! and this argument will have no effect. + ! After this call, topg does not need another halo update unless isostasy is active. - if (model%general%global_bc == GLOBAL_BC_OUTFLOW .or. & - model%general%global_bc == GLOBAL_BC_NO_ICE) then + if (model%general%global_bc == GLOBAL_BC_OUTFLOW) then call parallel_halo_extrapolate(model%geometry%topg, parallel) + elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then + call parallel_halo(model%geometry%topg, parallel, zero_global_boundary_no_ice_bc = .false.) else ! other global BCs, including periodic call parallel_halo(model%geometry%topg, parallel, & periodic_offset_ew = model%numerics%periodic_offset_ew, & periodic_offset_ns = model%numerics%periodic_offset_ns) endif - if (model%options%whichtemp == TEMP_ENTHALPY) & - call parallel_halo(model%temper%waterfrac, parallel) + ! calculate the lower and upper ice surface (will be correct in halos following the halo updates above) + call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus, model%geometry%lsrf) + model%geometry%usrf = max(0.d0, model%geometry%thck + model%geometry%lsrf) ! 
halo update for kinbcmask (= 1 where uvel and vvel are prescribed, elsewhere = 0) ! Note: Instead of assuming that kinbcmask is periodic, we extrapolate it into the global halo @@ -702,6 +673,34 @@ subroutine glissade_initialise(model, evolve_ice) ! on the global staggered grid). call staggered_parallel_halo_extrapolate (model%velocity%kinbcmask, parallel) ! = 1 for Dirichlet BCs + if (model%options%enable_glaciers) then + + ! If running with glaciers, then process the input glacier data and initialize glacier arrays + + ! Note: On start-up, this subroutine counts the glaciers. It should be called before glide_io_createall, + ! which needs to know nglacier to set up glacier output files with the right dimensions. + ! On restart, most of the required glacier arrays are in the restart file, and this subroutine + ! computes a few remaining variables. + ! Note: Glaciers are usually run with a no-ice BC to allow removal of inactive regions. + ! This means that any grid cells adjacent to the global boundary are not handled correctly. + ! The preceding halo update for 'thck' removes ice from these grid cells. + ! Note: If glacier%length_scale_factor /= 1, This subroutine modifies the values of model%numerics%dew, + ! model%numerics%dns, model%geometry%cell_area, and the grid coordinates (x0,y0) and (x1,y1). + ! This is done if the true grid cell dimensions differ from the nominal dimensions on a projected grid. + ! See comments near the top of glissade_glacier_init. + + call glissade_glacier_init(model, model%glacier) + + endif + + ! open all output files + call openall_out(model) + + ! create glide I/O variables + call glide_io_createall(model, model) + + ! initialize glissade components + !TODO - Remove call to init_velo in glissade_initialise? ! Most of what's done in init_velo is needed for SIA only, but still need velowk for call to wvelintg call init_velo(model) @@ -861,10 +860,11 @@ subroutine glissade_initialise(model, evolve_ice) ! 
adjacent to or beyond the global boundary. This is an appropriate treatment for ! ice state variables, but not for bed topography and related fields (like relx). !TODO - Is this halo update necessary? - if (model%general%global_bc == GLOBAL_BC_OUTFLOW .or. & - model%general%global_bc == GLOBAL_BC_NO_ICE) then + if (model%general%global_bc == GLOBAL_BC_OUTFLOW) then call parallel_halo_extrapolate(model%isostasy%relx, parallel) - else + elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then + call parallel_halo(model%isostasy%relx, parallel, zero_global_boundary_no_ice_bc = .false.) + else ! other global BCs, including periodic call parallel_halo(model%isostasy%relx, parallel) endif @@ -2232,7 +2232,6 @@ subroutine glissade_thickness_tracer_solve(model) ! pre-transport halo updates for thickness and tracers call parallel_halo(model%geometry%thck, parallel) - call parallel_halo(model%geometry%topg, parallel) call parallel_halo_tracers(model%geometry%tracers, parallel) call parallel_halo_tracers(model%geometry%tracers_usrf, parallel) call parallel_halo_tracers(model%geometry%tracers_lsrf, parallel) @@ -3070,12 +3069,14 @@ subroutine glissade_isostasy_solve(model) ! but the argument is included to be on the safe side. ! TODO: Do we need similar logic for halo updates of relx? - if (model%general%global_bc == GLOBAL_BC_OUTFLOW .or. & - model%general%global_bc == GLOBAL_BC_NO_ICE) then + if (model%general%global_bc == GLOBAL_BC_OUTFLOW) then call parallel_halo_extrapolate(model%geometry%topg, parallel) + elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then + call parallel_halo(model%geometry%topg, parallel, zero_global_boundary_no_ice_bc = .false.) else ! other global BCs, including periodic call parallel_halo(model%geometry%topg, parallel, & - periodic_offset_ew = model%numerics%periodic_offset_ew) + periodic_offset_ew = model%numerics%periodic_offset_ew, & + periodic_offset_ns = model%numerics%periodic_offset_ns) endif ! 
update the marine connection mask, which depends on topg diff --git a/libglissade/glissade_glacier.F90 b/libglissade/glissade_glacier.F90 index 67744610..3d6d2641 100644 --- a/libglissade/glissade_glacier.F90 +++ b/libglissade/glissade_glacier.F90 @@ -91,7 +91,7 @@ subroutine glissade_glacier_init(model, glacier) integer :: i, j, nc, ng, count integer :: iglobal, jglobal - integer :: ng_west, ng_east, ng_south, ng_north + integer :: ng_ne, ng_nw, ng_se, ng_sw integer :: min_id, max_id real(dp) :: max_glcval real(dp) :: theta_rad ! latitude in radians @@ -155,9 +155,15 @@ subroutine glissade_glacier_init(model, glacier) if (glacier%scale_area) then - ! Optionally, rescale the grid cell dimensions dew and dns + ! Optionally, rescale the grid cell dimensions and coordinates ! This is answer-changing throughout the code. + ! Note: The global arrays model%general%x1_global, etc., which are written to output files, are not rescaled. + ! These arrays are computed from the input file, which typically ignores the scale factor. if (glacier%length_scale_factor /= 1.0d0) then + model%general%x0 = model%general%x0 * glacier%length_scale_factor + model%general%y0 = model%general%y0 * glacier%length_scale_factor + model%general%x1 = model%general%x1 * glacier%length_scale_factor + model%general%y1 = model%general%y1 * glacier%length_scale_factor model%numerics%dew = model%numerics%dew * glacier%length_scale_factor model%numerics%dns = model%numerics%dns * glacier%length_scale_factor dew = model%numerics%dew @@ -661,35 +667,25 @@ subroutine glissade_glacier_init(model, glacier) endif call broadcast(glacier%ngdiag, rtest) - ! Define a mask whose value is 1 at vertices along the boundary between two glaciers. - ! At runtime, Cp is set to a large value at masked vertices to reduce flow between glaciers. + ! Define a mask whose value is 1 at vertices that border two different glaciers. + ! 
At runtime, Cp is set to a large value at these vertices to reduce mass exchange between glaciers. + !TODO: Consider removing the mask. This would allow CISM to reduce basal friction to thin the ice if needed. glacier%boundary_mask(:,:) = 0 - ! Loop over locally owned cells - do j = nhalo, nsn-nhalo - do i = nhalo, ewn-nhalo - ng = glacier%cism_glacier_id_init(i,j) - if (ng > 0) then - ng_west = glacier%cism_glacier_id_init(i-1,j) - ng_east = glacier%cism_glacier_id_init(i+1,j) - ng_south = glacier%cism_glacier_id_init(i,j-1) - ng_north = glacier%cism_glacier_id_init(i,j+1) - if (ng_west > 0 .and. ng_west /= ng) then - glacier%boundary_mask(i-1,j-1) = 1 - glacier%boundary_mask(i-1,j) = 1 - endif - if (ng_east > 0 .and. ng_east /= ng) then - glacier%boundary_mask(i,j-1) = 1 - glacier%boundary_mask(i,j) = 1 - endif - if (ng_south > 0 .and. ng_south /= ng) then - glacier%boundary_mask(i-1,j-1) = 1 - glacier%boundary_mask(i,j-1) = 1 - endif - if (ng_north > 0 .and. ng_north /= ng) then - glacier%boundary_mask(i-1,j) = 1 - glacier%boundary_mask(i,j) = 1 - endif + ! Loop over locally owned vertices + do j = nhalo+1, nsn-nhalo + do i = nhalo+1, ewn-nhalo + ng_ne = glacier%cism_glacier_id_init(i+1,j+1) + ng_nw = glacier%cism_glacier_id_init(i,j+1) + ng_se = glacier%cism_glacier_id_init(i+1,j) + ng_sw = glacier%cism_glacier_id_init(i,j) + if ( (ng_ne > 0 .and. ng_nw > 0 .and. ng_ne /= ng_nw) .or. & + (ng_ne > 0 .and. ng_se > 0 .and. ng_ne /= ng_se) .or. & + (ng_ne > 0 .and. ng_sw > 0 .and. ng_ne /= ng_sw) .or. & + (ng_nw > 0 .and. ng_se > 0 .and. ng_nw /= ng_se) .or. & + (ng_nw > 0 .and. ng_sw > 0 .and. ng_nw /= ng_sw) .or. & + (ng_se > 0 .and. ng_sw > 0 .and. 
ng_se /= ng_sw) ) then + glacier%boundary_mask(i,j) = 1 endif enddo enddo @@ -697,6 +693,7 @@ subroutine glissade_glacier_init(model, glacier) call staggered_parallel_halo(glacier%boundary_mask, parallel) if (verbose_glacier) then + call point_diag(glacier%cism_glacier_id_init, 'cism_glacier_id_init', itest, jtest, rtest, 7, 7) call point_diag(glacier%boundary_mask, 'Glacier boundary mask', itest, jtest, rtest, 7, 7) endif @@ -1431,8 +1428,7 @@ subroutine glissade_glacier_update(model, glacier) !------------------------------------------------------------------------- if (verbose_glacier) then - call point_diag(model%geometry%topg, 'topg', itest, jtest, rtest, 7, 7) - call point_diag(thck, 'Before advance_retreat, thck', itest, jtest, rtest, 7, 7) + call point_diag(model%geometry%thck, 'Before advance_retreat, thck', itest, jtest, rtest, 7, 7) endif ! Assign nonzero IDs in grid cells where ice has reached the minimum glacier thickness. @@ -2495,6 +2491,7 @@ subroutine glacier_invert_powerlaw_c(& if (verbose_glacier) then call point_diag(stag_thck, 'stag_thck (m)', itest, jtest, rtest, 7, 7) + call point_diag(stag_thck_target, 'stag_thck_target (m)', itest, jtest, rtest, 7, 7) call point_diag(stag_dthck, 'stag_thck - stag_thck_target (m)', itest, jtest, rtest, 7, 7) call point_diag(stag_dthck_dt, 'stag_dthck_dt (m/yr)', itest, jtest, rtest, 7, 7) call point_diag(powerlaw_c, 'new powerlaw_c', itest, jtest, rtest, 7, 7) @@ -2850,7 +2847,7 @@ subroutine glacier_advance_retreat(& call parallel_halo(cism_glacier_id, parallel) ! Check advanced cells (beyond the initial extent) for problematic glacier IDs. - ! This code protects against glacier 'pirating', which ccan occur when an advanced cell + ! This code protects against glacier 'pirating', which can occur when an advanced cell ! is adjacent to two different glaciers, call them A and B. ! Suppose the cell is fed primarily by glacier A but has the same ID as glacier B, ! 
and has a more positive SMB as a result of belonging to B rather than A. diff --git a/libglissade/glissade_inversion.F90 b/libglissade/glissade_inversion.F90 index c8d38bff..d12f25ca 100644 --- a/libglissade/glissade_inversion.F90 +++ b/libglissade/glissade_inversion.F90 @@ -47,7 +47,7 @@ module glissade_inversion ! a target ice thickness field. !----------------------------------------------------------------------------- - logical, parameter :: verbose_inversion = .false. + logical, parameter :: verbose_inversion = .false. !*********************************************************************** diff --git a/libglissade/glissade_utils.F90 b/libglissade/glissade_utils.F90 index 0a959f1e..4f1de07f 100644 --- a/libglissade/glissade_utils.F90 +++ b/libglissade/glissade_utils.F90 @@ -687,7 +687,7 @@ subroutine glissade_usrf_to_thck(usrf, topg, eus, thck) ! That is, if topg - eus < 0 (marine-based ice), and if the upper surface is too close ! to sea level to ground the ice, then the ice thickness is chosen to satisfy ! rhoi*H = -rhoo*(topg-eus). - ! Note: usrf, topg, eus and thck must all have the same units (often but not necessarily meters). + ! Note: usrf, topg, eus and thck must all have the same units (usually but not necessarily meters). use glimmer_physcon, only : rhoo, rhoi @@ -952,7 +952,7 @@ end subroutine glissade_input_fluxes ! subroutines belonging to the write_array_to_file interface - subroutine write_array_to_file_real8_2d(arr, fileunit, filename, parallel) + subroutine write_array_to_file_real8_2d(arr, fileunit, filename, parallel, write_binary) ! Copy the input array into a global array and write all values to an output file. ! 
This can be useful for debugging, if we want to find differences between two fields @@ -970,20 +970,38 @@ subroutine write_array_to_file_real8_2d(arr, fileunit, filename, parallel) integer, intent(in) :: fileunit character(len=*), intent(in) :: filename type(parallel_type), intent(in) :: parallel + logical, intent(in), optional :: write_binary integer :: i, j character(len=64) :: binary_str real(dp), dimension(:,:), allocatable :: arr_global + logical :: binary_output + + if (present(write_binary)) then + binary_output = write_binary + else + binary_output = .false. + endif call gather_var(arr, arr_global, parallel) if (main_task) then open(unit=fileunit, file=trim(filename), status='replace', position='append') - do j = 1, parallel%global_nsn - do i = 1, parallel%global_ewn - call double_to_binary(arr_global(i,j), binary_str) - write (fileunit, '(2i6,a4,a64)') i, j, ' ', binary_str + + if (binary_output) then + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + call double_to_binary(arr_global(i,j), binary_str) + write (fileunit, '(2i6,a4,a64)') i, j, ' ', binary_str + enddo enddo - enddo + else + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + write (fileunit, '(2i6,a4,f24.16)') i, j, ' ', arr_global(i,j) + enddo + enddo + endif + close(unit=fileunit) deallocate(arr_global) endif @@ -991,7 +1009,7 @@ subroutine write_array_to_file_real8_2d(arr, fileunit, filename, parallel) end subroutine write_array_to_file_real8_2d - subroutine write_array_to_file_real8_3d(arr, fileunit, filename, parallel, cycle_indices) + subroutine write_array_to_file_real8_3d(arr, fileunit, filename, parallel, write_binary, cycle_indices) ! Copy the input array into a global array and write all values to an output file. ! 
This can be useful for debugging, if we want to find differences between two fields @@ -1009,14 +1027,22 @@ subroutine write_array_to_file_real8_3d(arr, fileunit, filename, parallel, cycle integer, intent(in) :: fileunit character(len=*), intent(in) :: filename type(parallel_type), intent(in) :: parallel + logical, intent(in), optional :: write_binary logical, intent(in), optional :: cycle_indices ! if true, then index 3->1, 1->2, 2->3 integer :: i, j, k, kmax character(len=64) :: binary_str real(dp), dimension(:,:,:), allocatable :: arr_global real(dp), dimension(:,:,:), allocatable :: arr_cycle + logical :: binary_output logical :: cycle_ind + if (present(write_binary)) then + binary_output = write_binary + else + binary_output = .false. + endif + if (present(cycle_indices)) then cycle_ind = cycle_indices else @@ -1042,14 +1068,26 @@ subroutine write_array_to_file_real8_3d(arr, fileunit, filename, parallel, cycle if (main_task) then open(unit=fileunit, file=trim(filename), status='unknown') - do j = 1, parallel%global_nsn - do i = 1, parallel%global_ewn - do k = 1, kmax - call double_to_binary(arr_global(k,i,j), binary_str) - write (fileunit, '(3i6,a4,a64)') i, j, k, ' ', binary_str + + if (binary_output) then + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + do k = 1, kmax + call double_to_binary(arr_global(k,i,j), binary_str) + write (fileunit, '(3i6,a4,a64)') i, j, k, ' ', binary_str + enddo enddo enddo - enddo + else + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + do k = 1, kmax + write (fileunit, '(3i6,a4,f24.16)') i, j, k, ' ', arr_global(k,i,j) + enddo + enddo + enddo + endif + close(unit=fileunit) deallocate(arr_global) endif diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 3b3ec31b..fb6796f7 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -2408,15 +2408,12 @@ subroutine glissade_velo_higher_solve(model, & efvs_qp_3d) !WHL - 
debug - BFB check - if (0 == 1) then -!! if (verbose_reprosum .and. counter == 1) then + if (verbose_reprosum .and. counter == 1) then if (main_task) write(iulog,*) 'Write out matrices after assemble_stiffness_matrix' - call write_array_to_file(Auu_2d, 21, 'global_Auu1', parallel, cycle_indices = .true.) - call write_array_to_file(Auv_2d, 22, 'global_Auv1', parallel, cycle_indices = .true.) - call write_array_to_file(Avu_2d, 23, 'global_Avu1', parallel, cycle_indices = .true.) - call write_array_to_file(Avv_2d, 24, 'global_Avv1', parallel, cycle_indices = .true.) - call write_array_to_file(bu_2d, 25, 'global_bu1', parallel) - call write_array_to_file(bv_2d, 26, 'global_bv1', parallel) + call write_array_to_file(Auu_2d, 21, 'global_Auu1', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(Auv_2d, 22, 'global_Auv1', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(Avu_2d, 23, 'global_Avu1', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(Avv_2d, 24, 'global_Avv1', parallel, write_binary = .true., cycle_indices = .true.) endif !WHL - debug - BFB check @@ -2700,15 +2697,15 @@ subroutine glissade_velo_higher_solve(model, & call t_stopf('glissade_halo_bxxs') !WHL - debug - Write all the matrix elements and rhs elements (in binary form) to files -!! if (verbose_reprosum .and. counter == 1) then - if (0 == 1) then + if (verbose_reprosum .and. counter == 1) then if (main_task) write(iulog,*) 'Write out matrices after adding BC' - call write_array_to_file(Auu_2d, 21, 'global_Auu3', parallel, cycle_indices = .true.) - call write_array_to_file(Auv_2d, 22, 'global_Auv3', parallel, cycle_indices = .true.) - call write_array_to_file(Avu_2d, 23, 'global_Avu3', parallel, cycle_indices = .true.) - call write_array_to_file(Avv_2d, 24, 'global_Avv3', parallel, cycle_indices = .true.) 
- call write_array_to_file(bu_2d, 25, 'global_bu3', parallel) - call write_array_to_file(bv_2d, 26, 'global_bv3', parallel) +!! call write_array_to_file(Auu_2d(:,:,5), 30, 'global_Auu2', parallel) ! diagonal terms only + call write_array_to_file(Auu_2d, 31, 'global_Auu2', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(Auv_2d, 32, 'global_Auv2', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(Avu_2d, 33, 'global_Avu2', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(Avv_2d, 34, 'global_Avv2', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(bu_2d, 35, 'global_bu2', parallel, write_binary = .true.) + call write_array_to_file(bv_2d, 36, 'global_bv2', parallel, write_binary = .true.) endif !--------------------------------------------------------------------------- @@ -3192,9 +3189,9 @@ subroutine glissade_velo_higher_solve(model, & ! Optional diagnostics if (verbose_beta .and. counter > 1 .and. mod(counter-1,12)==0) then -!! if (verbose_beta) then - call point_diag(log10(max(beta_internal,1.d-99)), 'log_beta', itest, jtest, rtest, 7, 7, '(f10.5)') + if (this_rank == rtest) write(iulog,*) 'Counter =', counter + call point_diag(log10(max(beta_internal,1.d-99)), 'log_beta', itest, jtest, rtest, 7, 7) if (solve_2d) then call point_diag(uvel_2d, 'Mean uvel (m/yr)', itest, jtest, rtest, 7, 7) call point_diag(vvel_2d, 'Mean vvel (m/yr)', itest, jtest, rtest, 7, 7) @@ -4157,7 +4154,7 @@ subroutine get_vertex_geometry(nx, ny, & staggered_jhi = parallel%staggered_jhi !---------------------------------------------------------------- - ! Compute the x and y coordinates of each vertex. + ! Copy the x and y coordinates of each vertex from x0 and y0. ! By convention, vertex (i,j) lies at the NE corner of cell(i,j). 
!---------------------------------------------------------------- From 1a8f477a6585137f1f5a15685be6f15a0999aa5e Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Mon, 12 Jan 2026 18:21:26 -0700 Subject: [PATCH 08/21] Subglacial hydrology changes to support reprosums This commit modifies the routing algorithm of the subglacial flow-routing scheme to support reproducible sums. The challenge is that as water fluxes are routed downstream, they often pass from one processor to another. This means that fluxes must be stored temporarily in halo cells, then routed further downstream on a subsequent call to the routing subroutine. Depending on the number of tasks, fluxes can enter a given cell in a different order as they are accumulated. This means the final fluxes can differ at roundoff level. The fix here is to convert the fluxes and related floating-point arrays (e.g., the weights that compute how much water is refrozen) from dp to i8 variables before passing them to the routing subroutine. Since the real variables are O(1) or less, they are multiplied by a factor of 1.e16 to conserve water and make the results as close as possible to those without reprosums. I was not able to make this work for (1) flux-routing schemes other than D8 or (2) the option with refreezing enabled. If either of these options is turned on, the fluxes are multiplied by floating-point numbers during each step of the routing, breaking reproducibility. I added some code to check these options at startup and to switch, if needed, to options consistent with reproducible sums. I encapsulated most of the reprosum-related code in a subroutine called route_flux_to_margin_or_halo. There are now two versions of this subroutine: one that works with reprosum-safe i8 arrays and one that uses dp arrays as in the original code. 
To support the new integer8 variables and subroutine, I added
subroutines parallel_halo_integer8_4d, parallel_global_sum_integer8_2d,
and parallel_reduce_sum_integer8 in the cism_parallel_module.

For an AIS test with subglacial hydrology turned on (with D8 routing and
without refreezing), I verified that the results are now BFB with
different numbers of tasks. The results agree at roundoff level with
runs without reprosums.
---
 libglide/glide_setup.F90 | 16 +-
 libglimmer/parallel_mpi.F90 | 185 ++++++-
 libglissade/glissade.F90 | 19 +-
 libglissade/glissade_basal_water.F90 | 766 ++++++++++++++++++++-------
 libglissade/glissade_velo_higher.F90 | 96 +---
 5 files changed, 786 insertions(+), 296 deletions(-)

diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90
index d28088be..682ff56d 100644
--- a/libglide/glide_setup.F90
+++ b/libglide/glide_setup.F90
@@ -1971,9 +1971,6 @@ subroutine print_options(model)
 model%basal_hydro%ho_flux_routing_scheme >= size(ho_flux_routing_scheme)) then
 call write_log('Error, HO flux routing scheme out of range', GM_FATAL)
 end if
- write(message,*) 'ho_flux_routing_scheme : ',model%basal_hydro%ho_flux_routing_scheme, &
- ho_flux_routing_scheme(model%basal_hydro%ho_flux_routing_scheme)
- call write_log(message)
 endif

 write(message,*) 'ho_whicheffecpress : ',model%options%which_ho_effecpress, &
@@ -3388,6 +3385,19 @@ subroutine print_basal_hydro(model)
 model%basal_hydro%ho_flux_routing_scheme >= size(ho_flux_routing_scheme)) then
 call write_log('Error, HO flux routing scheme out of range', GM_FATAL)
 end if
+ if (model%options%reproducible_sums) then
+ if (model%basal_hydro%ho_flux_routing_scheme /= HO_FLUX_ROUTING_D8) then
+ write(message,*) 'With reproducible sums, only D8 flux-routing is supported; switching to D8'
+ model%basal_hydro%ho_flux_routing_scheme = HO_FLUX_ROUTING_D8
+ call write_log(message)
+ endif
+ if (model%basal_hydro%btemp_scale > 0.0d0) then
+ write(message,*) 'With reproducible sums, the flux-routing does not 
support refreezing;' // &
 ' setting btemp_scale = 0'
 call write_log(message)
 model%basal_hydro%btemp_scale = 0.0d0
 endif
 endif ! reproducible sums
 write(message,*) 'ho_flux_routing_scheme : ',model%basal_hydro%ho_flux_routing_scheme, &
 ho_flux_routing_scheme(model%basal_hydro%ho_flux_routing_scheme)
 call write_log(message)
diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90
index 57c2b20e..6a941283 100644
--- a/libglimmer/parallel_mpi.F90
+++ b/libglimmer/parallel_mpi.F90
@@ -27,7 +27,7 @@ module cism_parallel
 use netcdf
- use glimmer_global, only : dp, sp
+ use glimmer_global, only : dp, sp, i8
 use glimmer_paramets, only: iulog
 use cism_reprosum_mod, only: cism_reprosum_calc, verbose_reprosum
@@ -297,6 +297,7 @@ module cism_parallel
 interface parallel_global_sum
 module procedure parallel_global_sum_integer_2d
 module procedure parallel_global_sum_integer_3d
+ module procedure parallel_global_sum_integer8_2d
 module procedure parallel_global_sum_real8_2d
 module procedure parallel_global_sum_real8_3d
 end interface
@@ -321,6 +322,7 @@ module cism_parallel
 module procedure parallel_halo_real8_2d
 module procedure parallel_halo_real8_3d
 module procedure parallel_halo_real8_4d
+ module procedure parallel_halo_integer8_4d
 end interface
 interface parallel_halo_extrapolate
@@ -401,6 +403,7 @@ module cism_parallel
 interface parallel_reduce_sum
 module procedure parallel_reduce_sum_integer
+ module procedure parallel_reduce_sum_integer8
 module procedure parallel_reduce_sum_real4
 module procedure parallel_reduce_sum_real8
 module procedure parallel_reduce_sum_integer_nvar
@@ -6117,6 +6120,45 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d)
 end function parallel_global_sum_integer_3d
+!=======================================================================
+
+ function parallel_global_sum_integer8_2d(a, parallel, mask_2d)
+
+ ! 
Calculates the global sum of a 2D integer(i8) field + + integer(i8), dimension(:,:),intent(in) :: a + type(parallel_type) :: parallel + integer, dimension(:,:), intent(in), optional :: mask_2d + + integer :: i, j + integer, dimension(parallel%local_ewn,parallel%local_nsn) :: mask + integer(i8) :: local_sum + integer(i8) :: parallel_global_sum_integer8_2d + + associate( & + local_ewn => parallel%local_ewn, & + local_nsn => parallel%local_nsn) + + if (present(mask_2d)) then + mask = mask_2d + else + mask = 1 + endif + + local_sum = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + local_sum = local_sum + a(i,j) + endif + enddo + enddo + parallel_global_sum_integer8_2d = parallel_reduce_sum(local_sum) + + end associate + + end function parallel_global_sum_integer8_2d + !======================================================================= function parallel_global_sum_real8_2d(a, parallel, mask_2d) @@ -8034,6 +8076,130 @@ subroutine parallel_halo_real8_4d(a, parallel) end subroutine parallel_halo_real8_4d + + subroutine parallel_halo_integer8_4d(a, parallel) + + use mpi_mod + implicit none + integer(i8),dimension(:,:,:,:) :: a + type(parallel_type) :: parallel + + integer :: erequest,ierror,one,nrequest,srequest,wrequest + integer(i8),dimension(size(a,1), size(a,2), lhalo, parallel%local_nsn-lhalo-uhalo) :: esend,wrecv + integer(i8),dimension(size(a,1), size(a,2), uhalo, parallel%local_nsn-lhalo-uhalo) :: erecv,wsend + integer(i8),dimension(size(a,1), size(a,2), parallel%local_ewn, lhalo) :: nsend,srecv + integer(i8),dimension(size(a,1), size(a,2), parallel%local_ewn, uhalo) :: nrecv,ssend + + ! 
begin + associate( & + outflow_bc => parallel%outflow_bc, & + no_ice_bc => parallel%no_ice_bc, & + local_ewn => parallel%local_ewn, & + local_nsn => parallel%local_nsn, & + east => parallel%east, & + west => parallel%west, & + north => parallel%north, & + south => parallel%south, & + southwest_corner => parallel%southwest_corner, & + southeast_corner => parallel%southeast_corner, & + northeast_corner => parallel%northeast_corner, & + northwest_corner => parallel%northwest_corner & + ) + + ! staggered grid + if (size(a,3)==local_ewn-1.and.size(a,4)==local_nsn-1) return + + ! unknown grid + if (size(a,3)/=local_ewn.or.size(a,4)/=local_nsn) then + write(iulog,*) "Unknown Grid: Size a=(", size(a,1), ",", size(a,2), ",", size(a,3), ",", size(a,4), ") & + &and local_ewn and local_nsn = ", local_ewn, ",", local_nsn + call parallel_stop(__FILE__,__LINE__) + endif + + ! unstaggered grid + call mpi_irecv(wrecv,size(wrecv),mpi_real8,west,west,& + comm,wrequest,ierror) + call mpi_irecv(erecv,size(erecv),mpi_real8,east,east,& + comm,erequest,ierror) + call mpi_irecv(srecv,size(srecv),mpi_real8,south,south,& + comm,srequest,ierror) + call mpi_irecv(nrecv,size(nrecv),mpi_real8,north,north,& + comm,nrequest,ierror) + + esend(:,:,:,:) = & + a(:,:,local_ewn-uhalo-lhalo+1:local_ewn-uhalo,1+lhalo:local_nsn-uhalo) + call mpi_send(esend,size(esend),mpi_real8,east,this_rank,comm,ierror) + wsend(:,:,:,:) = a(:,:,1+lhalo:1+lhalo+uhalo-1,1+lhalo:local_nsn-uhalo) + call mpi_send(wsend,size(wsend),mpi_real8,west,this_rank,comm,ierror) + + call mpi_wait(wrequest,mpi_status_ignore,ierror) + a(:,:,:lhalo,1+lhalo:local_nsn-uhalo) = wrecv(:,:,:,:) + call mpi_wait(erequest,mpi_status_ignore,ierror) + a(:,:,local_ewn-uhalo+1:,1+lhalo:local_nsn-uhalo) = erecv(:,:,:,:) + + nsend(:,:,:,:) = a(:,:,:,local_nsn-uhalo-lhalo+1:local_nsn-uhalo) + call mpi_send(nsend,size(nsend),mpi_real8,north,this_rank,comm,ierror) + ssend(:,:,:,:) = a(:,:,:,1+lhalo:1+lhalo+uhalo-1) + call 
mpi_send(ssend,size(ssend),mpi_real8,south,this_rank,comm,ierror) + + call mpi_wait(srequest,mpi_status_ignore,ierror) + a(:,:,:,:lhalo) = srecv(:,:,:,:) + call mpi_wait(nrequest,mpi_status_ignore,ierror) + a(:,:,:,local_nsn-uhalo+1:) = nrecv(:,:,:,:) + + if (outflow_bc) then ! set values in global halo to zero + ! interior halo cells should not be affected + + if (this_rank >= east) then ! at east edge of global domain + a(:,:,local_ewn-uhalo+1:,:) = 0 + endif + + if (this_rank <= west) then ! at west edge of global domain + a(:,:,:lhalo,:) = 0 + endif + + if (this_rank >= north) then ! at north edge of global domain + a(:,:,:,local_nsn-uhalo+1:) = 0 + endif + + if (this_rank <= south) then ! at south edge of global domain + a(:,:,:,:lhalo) = 0 + endif + + elseif (no_ice_bc) then + + ! Set values to zero in cells adjacent to the global boundary; + ! includes halo cells and one row of locally owned cells + + if (this_rank >= east) then ! at east edge of global domain + a(:,:,local_ewn-uhalo:,:) = 0 + endif + + if (this_rank <= west) then ! at west edge of global domain + a(:,:,:lhalo+1,:) = 0 + endif + + if (this_rank >= north) then ! at north edge of global domain + a(:,:,:,local_nsn-uhalo:) = 0 + endif + + if (this_rank <= south) then ! at south edge of global domain + a(:,:,:,:lhalo+1) = 0 + endif + + ! Some interior blocks have a single cell at a corner of the global boundary. + ! Set values in corner cells to zero, along with adjacent halo cells. + if (southwest_corner) a(:,:,:lhalo+1,:lhalo+1) = 0 + if (southeast_corner) a(:,:,local_ewn-lhalo:,:lhalo+1) = 0 + if (northeast_corner) a(:,:,local_ewn-lhalo:,local_nsn-lhalo:) = 0 + if (northwest_corner) a(:,:,:lhalo+1,local_nsn-lhalo:) = 0 + + endif ! outflow or no_ice bc + + end associate + + end subroutine parallel_halo_integer8_4d + !======================================================================= ! 
subroutines for 1D halo updates @@ -9276,6 +9442,23 @@ function parallel_reduce_sum_integer(x) end function parallel_reduce_sum_integer + function parallel_reduce_sum_integer8(x) + + use mpi_mod + implicit none + integer(i8) :: x + + integer :: ierror + integer(i8) :: recvbuf,sendbuf, parallel_reduce_sum_integer8 + + ! begin + sendbuf = x + call mpi_allreduce(sendbuf,recvbuf,1,mpi_integer8,mpi_sum,comm,ierror) + parallel_reduce_sum_integer8 = recvbuf + + end function parallel_reduce_sum_integer8 + + function parallel_reduce_sum_real4(x) use mpi_mod diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index cfcfec39..ceae1895 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -1773,6 +1773,7 @@ subroutine glissade_thermal_solve(model, dt) use glissade_masks, only: glissade_get_masks !WHL - debug use cism_parallel, only: parallel_reduce_max + use glissade_utils, only: write_array_to_file implicit none @@ -1911,12 +1912,6 @@ subroutine glissade_thermal_solve(model, dt) endwhere endif - !WHL - debug - Set mask = 0 where thck = 0 for dome test - ! An alternative would be to identify cells that have a path through land to the domain edge - where (model%geometry%thck == 0) - bwat_mask = 0 - endwhere - call parallel_halo(bwat_mask, parallel) ! Set the meltwater source for the basal hydrology scheme. @@ -1931,15 +1926,6 @@ subroutine glissade_thermal_solve(model, dt) model%basal_hydro%bmlt_hydro = 0.0d0 endwhere - call glissade_calcbwat(& - model%options%which_ho_bwat, & - model%basal_hydro, & - dt, & ! s - model%geometry%thck, & ! m - model%numerics%thklim_temp, & ! m - model%basal_melt%bmlt_ground, & ! m/s - model%basal_hydro%bwat) ! m - ! Compute the steady-state basal water flux based on a flux-routing scheme call glissade_bwat_flux_routing(& @@ -1960,7 +1946,8 @@ subroutine glissade_thermal_solve(model, dt) model%basal_hydro%bwat_diag, & ! m model%temper%bhydroflx, & ! W/m2 model%basal_hydro%head, & ! m - model%basal_hydro%grad_head) ! 
m/m + model%basal_hydro%grad_head, & ! m/m + reprosum_in = model%options%reproducible_sums) ! halo updates (not sure if all are needed) call parallel_halo(model%basal_hydro%bwatflx, parallel) diff --git a/libglissade/glissade_basal_water.F90 b/libglissade/glissade_basal_water.F90 index 593b33b2..8f9e03ef 100644 --- a/libglissade/glissade_basal_water.F90 +++ b/libglissade/glissade_basal_water.F90 @@ -26,13 +26,14 @@ module glissade_basal_water - use glimmer_global, only: dp + use glimmer_global, only: dp, i8 use glimmer_paramets, only: iulog, eps11, eps08 use glimmer_physcon, only: rhoi, rhow, lhci, grav, scyr use glimmer_log use glimmer_utils, only: point_diag use glide_types - use cism_parallel, only: main_task, this_rank, nhalo, parallel_type, parallel_halo + use cism_parallel, only: main_task, this_rank, nhalo, parallel_type, & + parallel_halo, parallel_global_sum !WHL - debug use glimmer_utils, only: double_to_binary @@ -42,8 +43,16 @@ module glissade_basal_water private public :: glissade_basal_water_init, glissade_calcbwat, glissade_bwat_flux_routing - logical, parameter :: verbose_bwat = .false. -!! logical, parameter :: verbose_bwat = .true. +!! logical, parameter :: verbose_bwat = .false. + logical, parameter :: verbose_bwat = .true. + + character(len=64) :: binary_str + + ! two versions of this subroutine; the second supports reproducible sums + interface route_flux_to_margin_or_halo + module procedure route_flux_to_margin_or_halo_real8 + module procedure route_flux_to_margin_or_halo_integer8 + end interface contains @@ -179,7 +188,8 @@ subroutine glissade_bwat_flux_routing(& delta_Tb, btemp_scale, & bwatflx, bwat_diag, & bhydroflx, & - head, grad_head) + head, grad_head, & + reprosum_in) ! Compute the subglacial water flux and water depth using a steady-state flux routing scheme. ! Water is routed down the hydropotential. For routing purposes, assume p_w = p_i (i.e., N = 0). 
@@ -218,7 +228,7 @@ subroutine glissade_bwat_flux_routing(& integer, dimension(nx,ny), intent(in) :: & bwat_mask, & ! mask to identify cells through which basal water is routed; - ! = 0 for floating and ocean cells; cells at global domain edge; + ! = 0 for floating and ocean cells, cells at global domain edge, ! and cells with thck = 0 and forced negative SMB floating_mask ! = 1 if ice is present (thck > thklim) and floating, else = 0 @@ -237,6 +247,11 @@ subroutine glissade_bwat_flux_routing(& head, & ! hydraulic head (m) grad_head ! gradient of hydraulic head (m/m), averaged to cell centers + ! Note: The reprosum option requires (1) D8 routing (each cell routes its flux to one downstream neighbor only) + ! and (2) no temperature-weighted refreezing. + logical, intent(in), optional :: & + reprosum_in ! if true, then do a computation independent of the number of tasks + ! Local variables integer :: i, j, p @@ -277,6 +292,8 @@ subroutine glissade_bwat_flux_routing(& real(dp) :: c_flux_to_depth ! proportionality coefficient in Sommers et al., Eq. 6 real(dp) :: Reynolds ! Reynolds number (unitless), = 0 for pure laminar flow + logical :: reprosum ! local version of reprosum_in + integer :: nx_test, ny_test real(dp), dimension(:,:), allocatable :: phi_test integer, dimension(:,:), allocatable :: mask_test @@ -285,12 +302,16 @@ subroutine glissade_bwat_flux_routing(& write(iulog,*) 'In glissade_bwat_flux_routing: rtest, itest, jtest =', rtest, itest, jtest endif + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + ! Uncomment if the following fields are not already up to date in halo cells ! call parallel_halo(thk, parallel) ! call parallel_halo(topg, parallel) call parallel_halo(bmlt_hydro, parallel) - !TODO - Add bfrz? - ! Compute the hydraulic head ! For purposes of flux routing, assume N = 0. @@ -313,7 +334,6 @@ subroutine glissade_bwat_flux_routing(& endif ! 
Route basal water down the gradient of hydraulic head, giving a water flux - ! TODO - Pass in bfrz_pot, return bfrz? call route_basal_water(& nx, ny, & @@ -328,7 +348,8 @@ subroutine glissade_bwat_flux_routing(& bwat_mask, & bwatflx, & bwatflx_refreeze, & - lakes) + lakes, & + reprosum) call parallel_halo(bwatflx, parallel) @@ -366,7 +387,11 @@ subroutine glissade_bwat_flux_routing(& bwatflx(:,:) = bwatflx(:,:) / (dx*dy) ! Given bwatflx_refreeze in m^3/s, compute bhydroflx in W/m2. - ! This is the heat flux needed to refreeze the meltwater held in each cell + ! This is the heat flux needed to refreeze the meltwater held in each cell. + ! This heat flux is supplied at the bed during the next thermal solve. + ! If there is more than enough heat to thaw the bed, some meltwater will be returned later + ! instead of refrozen. + bhydroflx(:,:) = bwatflx_refreeze(:,:) * rhoi * lhci / (dx*dy) if (verbose_bwat) then @@ -447,7 +472,8 @@ subroutine route_basal_water(& bwat_mask, & bwatflx, & bwatflx_refreeze, & - lakes) + lakes, & + reprosum_in) ! Route water from the basal melt field to its destination, recording the water flux along the way. ! Water flow direction is determined according to the gradient of the hydraulic head. @@ -456,11 +482,7 @@ subroutine route_basal_water(& ! This results in the lakes field, which is the difference between the filled head and the original head. ! The method used is by Quinn et. al. (1991). ! - ! Based on code by Jesse Johnson (2005), adapted from the glimmer_routing file by Ian Rutt. - - ! TODO - Pass in bfrz_pot, return bfrz. - - use cism_parallel, only: parallel_global_sum + ! Originally based on code by Jesse Johnson and Ian Rutt in the Glimmer model !WHL - debug use cism_parallel, only: parallel_globalindex, parallel_reduce_max @@ -494,19 +516,22 @@ subroutine route_basal_water(& integer, dimension(nx,ny), intent(in) :: & bwat_mask ! mask to identify cells through which basal water is routed; - ! 
= 1 where ice is present and not floating + ! excludes floating and ocean cells real(dp), dimension(nx,ny), intent(out) :: & bwatflx, & ! water flux through a grid cell (m^3/s) bwatflx_refreeze, & ! water flux held for refreezing (m^3/s) lakes ! lakes field, difference between filled and original head + logical, intent(in), optional :: & + reprosum_in ! if true, then do a computation independent of the number of tasks + ! Local variables integer :: nlocal ! number of locally owned cells integer :: count, count_max ! iteration counters - integer :: i, j, k, ii, jj, ip, jp, p - integer :: i1, j1, i2, j2, itmp, jtmp, iglobal, jglobal + integer :: i, j, k, iglobal, jglobal + integer :: ii, jj, imax, jmax logical :: finished ! true when an iterative loop has finished @@ -514,15 +539,13 @@ subroutine route_basal_water(& sorted_ij ! i and j indices of all cells, sorted from low to high values of head real(dp), dimension(-1:1,-1:1,nx,ny) :: & - flux_fraction, & ! fraction of flux from each cell that flows downhill to each of 8 neighbors - bwatflx_halo ! water flux (m^3/s) routed to a neighboring halo cell; routed further in next iteration + flux_fraction ! fraction of flux from each cell that flows downhill to each of 8 neighbors real(dp), dimension(nx,ny) :: & head_filled, & ! head after depressions are filled (m) + btemp_weight, & ! temperature-dependent weighting factor, favoring flow where the bed is thawed bwatflx_accum, & ! water flux through the cell (m^3/s) accumulated over multiple iterations - bwatflx_refreeze_accum,& ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations - sum_bwatflx_halo, & ! bwatflx summed over the first 2 dimensions in each grid cell - btemp_weight ! temperature-dependent weighting factor, favoring flow where the bed is thawed + bwatflx_refreeze_accum ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations integer, dimension(nx,ny) :: & local_mask, & ! 
= 1 for cells owned by the local processor, else = 0 @@ -537,11 +560,33 @@ subroutine route_basal_water(& err, & ! water conservation error global_flux_sum ! flux sum over all cells in global domain + ! The following i8 variables are for computing reproducible sums + integer(i8), dimension(nx,ny) :: & + bwatflx_int, & ! water flux through a grid cell (m^3/s) + btemp_weight_int, & ! temperature-dependent weighting factor, favoring flow where the bed is thawed + bwatflx_accum_int, & ! water flux through the cell (m^3/s) accumulated over multiple iterations + bwatflx_refreeze_accum_int ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations + + integer(i8), dimension(-1:1,-1:1,nx,ny) :: & + flux_fraction_int ! fraction of flux from each cell that flows downhill to each of 8 neighbors + + real(dp), parameter :: & + factor_bwatflx = 1.d16 ! factor for converting between bwatflx and bwatflx_int; + ! large value desired for water mass conservation + + logical :: reprosum ! local version of reprosum_in + character(len=100) :: message !WHL - debug character(len=64) :: binary_str + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + ! Allocate the sorted_ij array nlocal = parallel%own_ewn * parallel%own_nsn @@ -622,21 +667,25 @@ subroutine route_basal_water(& ! (2) When water enters a frozen cell (delta_Tb > 0), btemp_weight is used to determine ! how much of the flux is refrozen in place rather than passing through. ! A low value of btemp_weight means that less water passes through. + ! Note: For reproducible sums, refreezing is not supported; must have btemp_weight = 1 everywhere. btemp_weight = 1.0d0 if (btemp_scale > 0.0d0) then - where (bwat_mask == 1) - where (delta_Tb > 0.0d0) - btemp_weight = exp(-delta_Tb/btemp_scale) + if (.not. 
reprosum) then + where (bwat_mask == 1) + where (delta_Tb > 0.0d0) + btemp_weight = exp(-delta_Tb/btemp_scale) + endwhere endwhere - endwhere - if (verbose_bwat) then - call point_diag(delta_Tb, 'Tpmp - Tb', itest, jtest, rtest, 7, 7) - call point_diag(btemp_weight, 'btemp_weight', itest, jtest, rtest, 7, 7) endif endif + if (verbose_bwat) then + call point_diag(delta_Tb, 'Tpmp - Tb', itest, jtest, rtest, 7, 7) + call point_diag(btemp_weight, 'btemp_weight', itest, jtest, rtest, 7, 7) + endif + ! Compute the fraction of the incoming flux sent to each downstream neighbor. call get_flux_fraction(& @@ -650,14 +699,8 @@ subroutine route_basal_water(& bwat_mask, & flux_fraction) - ! Initialize bwatflx in locally owned cells with the basal melt, which will be routed downslope. - ! Multiply by area, so units are m^3/s. - ! The halo water flux, bwatflx_halo, holds water routed to halo cells; - ! it will be routed downhill during the next iteration. - ! The accumulated flux, bwatflx_accum, holds the total flux over multiple iterations. - ! Some or all of the water entering a frozen cell can be refrozen in place. - ! The heat flux associated with refreezing is passed to the next thermal solve. - ! If this heat flux is enough to thaw the cell, some of the meltwater is returned later. + ! Initialize bwatflx in locally owned cells. + ! Set to the local melt rate, multiplied by area (so the units are m^3/s). bwatflx = 0.0d0 do j = nhalo+1, ny-nhalo @@ -666,174 +709,132 @@ subroutine route_basal_water(& enddo enddo - ! Initialize other fluxes - bwatflx_halo = 0.0d0 - bwatflx_refreeze = 0.0d0 - bwatflx_accum = 0.0d0 - bwatflx_refreeze_accum = 0.0d0 - ! Compute total input of meltwater (m^3/s) total_flux_in = parallel_global_sum(bwatflx, parallel) - if (verbose_bwat .and. this_rank == rtest) then - write(iulog,*) ' ' write(iulog,*) 'Total input basal melt flux (m^3/s):', total_flux_in +!! call double_to_binary(total_flux_in, binary_str) +!! 
write(iulog,*) ' Binary string', binary_str endif - ! Loop over locally owned cells, from highest to lowest. - ! During each iteration, there are two possible outcomes for routing: - ! (1) Routed to the ice sheet margin, to a cell with bwat_mask = 0. - ! In this case, the routing of that flux is done. - ! (2) Routed to a halo cell, i.e. a downslope cell on a neighboring processor. - ! In this case, the flux will be routed further downhill on the next iteration. - ! When all the water has been routed to the margin, we are done. + ! Route the water downstream, keeping track of the steady-state flux through each cell. + ! The loop goes from highest to lowest values of head on the local processor. + ! At the end of the loop, all the incoming flux has either been + ! (1) routed to the ice sheet margin, + ! (2) set aside for later refreezing, or + ! (3) routed to a halo cell, from which it will continue downstream on the next iteration. + ! When all the water has been routed to the margin or set aside for refreezing, we are done. - count = 0 ! Note: It is hard to predict how many iterations will be sufficient. - ! With Dinf or FD8, we can have flow back and forth across processor boundaries, + ! With Dinf or FD8 we can have flow back and forth across processor boundaries, ! requiring many iterations to reach the margin. ! For Greenland 4 km, Dinf requires ~20 iterations on 4 cores, and FD8 can require > 40. ! For Antarctica 8 km, FD8 can require > 50. - count_max = 100 - finished = .false. - do while (.not.finished) - - count = count + 1 + ! Initialize the cumulative fluxes + bwatflx_accum = 0.0d0 + bwatflx_refreeze_accum = 0.0d0 - if (verbose_bwat .and. this_rank == rtest) then - write(iulog,*) 'flux routing, count =', count - endif + if (reprosum) then - do k = nlocal, 1, -1 + ! Convert bwatflx to a scaled i8 array + bwatflx_int(:,:) = nint(bwatflx(:,:)*factor_bwatflx, i8) - ! Get i and j indices of current cell - i = sorted_ij(k,1) - j = sorted_ij(k,2) + ! 
Convert flux_fraction to i8 + ! Note: This will work only for the D8 scheme, where all the flux goes downstream + ! to a single cell. + flux_fraction_int(:,:,:,:) = nint(flux_fraction(:,:,:,:), i8) - ! Route the flux - if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then + ! Convert btemp_weight to i8 + btemp_weight_int(:,:) = 1 + ! Note: Can round up to 1 and down to 0 by uncommenting the following line. + ! However, a mix of 1's and 0's leads to oscillations in basal temperature, + ! so it is safer to turn off refreezing by setting btemp_weight = 1 everywhere. +! btemp_weight_int(:,:) = nint(btemp_weight(:,:), i8) - ! Distribute the flux to downslope neighbors. - ! Where the bed is frozen, all or part of the flux is refrozen in place instead of being routed downstream. - do jj = -1,1 - do ii = -1,1 - ip = i + ii - jp = j + jj - if (flux_fraction(ii,jj,i,j) > 0.0d0) then - if (halo_mask(ip,jp) == 1) then - bwatflx_halo(ii,jj,i,j) = bwatflx(i,j)*flux_fraction(ii,jj,i,j)*btemp_weight(i,j) - bwatflx_refreeze(i,j) = bwatflx_refreeze(i,j) & - + bwatflx(i,j)*flux_fraction(ii,jj,i,j)*(1.0d0 - btemp_weight(i,j)) - if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then - write(iulog,*) 'Flux to halo, i, j, ii, jj, flux:', & - i, j, ii, jj, bwatflx(i,j)*flux_fraction(ii,jj,i,j) - endif - elseif (local_mask(ip,jp) == 1) then - bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx(i,j)*flux_fraction(ii,jj,i,j)*btemp_weight(i,j) - bwatflx_refreeze(i,j) = bwatflx_refreeze(i,j) & - + bwatflx(i,j)*flux_fraction(ii,jj,i,j)*(1.0d0 - btemp_weight(i,j)) - endif - endif ! flux_fraction > 0 - enddo - enddo - endif ! bwat_mask = 1, bwatflx > 0 - enddo ! loop from high to low + ! Initialize other arrays + bwatflx_accum_int = 0 + bwatflx_refreeze_accum_int = 0 - ! Accumulate bwatflx from the latest iteration, then reset to zero for the next iteration. 
- bwatflx_accum = bwatflx_accum + bwatflx - bwatflx_refreeze_accum = bwatflx_refreeze_accum + bwatflx_refreeze - bwatflx = 0.0d0 - bwatflx_refreeze = 0.0d0 + endif ! reprosum - if (verbose_bwat .and. this_rank == rtest .and. count <= 2) then - i = itest - j = jtest - write(iulog,*) 'i, j, bwatflx_accum:', i, j, bwatflx_accum(i,j) - endif + count = 0 + count_max = 100 + finished = .false. - ! If bwatflx_halo = 0 everywhere, then we are done. - ! (If the remaining flux is very small (< eps11), discard it to avoid - ! unnecessary extra iterations.) - ! If bwatflx_halo remains, then communicate it to neighboring tasks and - ! continue routing on the next iteration. - - do j = 1, ny - do i = 1, nx - sum_bwatflx_halo(i,j) = sum(bwatflx_halo(:,:,i,j)) -! if (verbose_bwat .and. sum_bwatflx_halo(i,j) > eps11 .and. count > 50) then -! write(iulog,*) 'Nonzero bwatflx_halo, count, rank, i, j, sum_bwatflx_halo:', & -! count, this_rank, i, j, sum_bwatflx_halo(i,j) -! call parallel_globalindex(i, j, iglobal, jglobal, parallel) -! write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -! endif - enddo - enddo - global_flux_sum = parallel_global_sum(sum_bwatflx_halo, parallel) + do while (.not.finished) - if (verbose_bwat .and. count <= 2) then - if (this_rank == rtest) then - write(iulog,*) 'Before halo update, sum of bwatflx_halo:', global_flux_sum - endif - call point_diag(sum_bwatflx_halo, 'sum_bwatflx_halo', itest, jtest, rtest, 7, 7) + count = count + 1 + if (verbose_bwat .and. this_rank == rtest) write(iulog,*) 'flux routing, count =', count + if (count > count_max) then + call write_log('Hydrology error: too many iterations in route_basal_water', GM_FATAL) endif - if (global_flux_sum > eps11) then - - finished = .false. + if (reprosum) then + + ! route downstream + ! 
Note: The fluxes are scaled by factor_bwatflx + + call route_flux_to_margin_or_halo(& + nx, ny, nlocal, & + itest, jtest, rtest, count, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction_int, & + btemp_weight_int, & + bwatflx_int, & + bwatflx_accum_int, & + bwatflx_refreeze_accum_int, & + finished) - ! Communicate bmltflx_halo to the halo cells of neighboring processors - call parallel_halo(bwatflx_halo(:,:,:,:), parallel) - - ! bmltflx_halo is now available in the halo cells of the local processor. - ! Route downslope to the adjacent locally owned cells. - ! These fluxes will be routed further downslope during the next iteration. - - do j = 2, ny-1 - do i = 2, nx-1 - if (halo_mask(i,j) == 1 .and. sum(bwatflx_halo(:,:,i,j)) > 0.0d0) then - do jj = -1,1 - do ii = -1,1 - if (bwatflx_halo(ii,jj,i,j) > 0.0d0) then - ip = i + ii - jp = j + jj - if (local_mask(ip,jp) == 1) then - bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx_halo(ii,jj,i,j) - if (verbose_bwat .and. ip==itest .and. jp==jtest .and. this_rank==rtest & - .and. count <= 2) then - write(iulog,*) 'Nonzero bwatflx from halo, rank, i, j:', & - this_rank, ip, jp, bwatflx_halo(ii,jj,i,j) - endif - endif - endif ! bwatflx_halo > 0 to a local cell - enddo ! ii - enddo ! jj - endif ! bwatflx_halo > 0 from this halo cell - enddo ! i - enddo ! j + if (verbose_bwat .and. this_rank == rtest .and. count <= 2) then + i = itest; j = jtest + write(iulog,*) 'count, rank i, j, bwatflx_accum (m/yr), bwatflx_refreeze_accum:', & + count, rtest, i, j, real(bwatflx_accum_int(i,j),dp)/factor_bwatflx, & + real(bwatflx_refreeze_accum_int(i,j),dp)/factor_bwatflx + endif - ! Reset bwatflx_halo for the next iteration - bwatflx_halo = 0.0d0 + else ! 
non-reproducible sums + + call route_flux_to_margin_or_halo(& + nx, ny, nlocal, & + itest, jtest, rtest, count, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction, & + btemp_weight, & + bwatflx, & + bwatflx_accum, & + bwatflx_refreeze_accum, & + finished) - global_flux_sum = parallel_global_sum(bwatflx, parallel) if (verbose_bwat .and. this_rank == rtest .and. count <= 2) then - ! Should be equal to the global sum of bwatflx_halo computed above - write(iulog,*) 'After halo update, sum(bwatflx from halo) =', global_flux_sum - write(iulog,*) ' ' + i = itest; j = jtest + write(iulog,*) 'count, rank i, j, bwatflx_accum(m/yr), bwatflx_refreeze_accum:', & + count, rtest, i, j, bwatflx_accum(i,j) * scyr/(dx*dy), & + bwatflx_refreeze_accum(i,j) * scyr/(dx*dy) endif - else ! bwatflx_halo = 0 everywhere; no fluxes to route to adjacent processors - if (verbose_bwat .and. this_rank == rtest) write(iulog,*) 'Done routing fluxes' - finished = .true. - bwatflx = bwatflx_accum - bwatflx_refreeze = bwatflx_refreeze_accum - endif + endif ! reprosum - if (count > count_max) then - call write_log('Hydrology error: too many iterations in route_basal_water', GM_FATAL) - endif + enddo ! finished - enddo ! finished routing + if (reprosum) then + ! Convert fluxes back to real(dp) + bwatflx_accum = real(bwatflx_accum_int, dp) / factor_bwatflx + bwatflx_refreeze_accum = real(bwatflx_refreeze_accum_int, dp) / factor_bwatflx + endif + + ! Copy the accumulated values to the output arrays bwatflx and bwatflx_refreeze + bwatflx = bwatflx_accum + bwatflx_refreeze = bwatflx_refreeze_accum + if (verbose_bwat .and. this_rank == rtest) write(iulog,*) 'Done routing fluxes' ! Identify cells just beyond the ice sheet margin, which can receive from upstream but not send downstream where (bwat_mask == 0 .and. bwatflx > 0.0d0) @@ -849,18 +850,11 @@ subroutine route_basal_water(& if (verbose_bwat .and. 
this_rank == rtest) then write(iulog,*) 'Total bwatflx at margin (m^3/s):', total_flux_margin - call double_to_binary(total_flux_margin, binary_str) - write(iulog,*) ' ', binary_str write(iulog,*) 'Total bwatflx_refreeze (m^3/s)=', total_flux_refreeze - call double_to_binary(total_flux_refreeze, binary_str) - write(iulog,*) ' ', binary_str write(iulog,*) 'Total bwatflx (m^3/s)=', total_flux_out - call double_to_binary(total_flux_out, binary_str) - write(iulog,*) ' ', binary_str write(iulog,*) 'Difference between output and input =', total_flux_out - total_flux_in endif - ! Not sure if a threshold of eps11 is large enough. Increase if needed. if (total_flux_in > 0.0d0) then err = abs(total_flux_in - total_flux_out) @@ -1066,7 +1060,7 @@ subroutine fill_depressions(& real(dp), dimension(nx,ny), intent(out) :: & phi ! output field with depressions filled - ! Local variables -------------------------------------- + ! Local variables logical, dimension(nx,ny) :: & known ! = true for cells where the final phi(i,j) is known @@ -1276,7 +1270,7 @@ subroutine sort_heights(& integer, intent(in) :: & nx, ny, & ! number of grid cells in each direction nlocal, & ! number of locally owned cells - itest, jtest, rtest ! coordinates of diagnostic point + itest, jtest, rtest ! coordinates of diagnostic point !! not currently used real(dp), dimension(nx,ny), intent(in) :: & phi ! input field, to be sorted from low to high @@ -1319,17 +1313,6 @@ subroutine sort_heights(& call indexx(vect, ind) - if (verbose_bwat .and. this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'Sort from low to high, nlocal =', nlocal - write(iulog,*) 'k, local i and j, ind(k), phi:' - do k = nlocal, nlocal-10, -1 - i = floor(real(ind(k)-1)/real(ny_local)) + 1 + nhalo - j = mod(ind(k)-1,ny_local) + 1 + nhalo - write(iulog,*) k, i, j, ind(k), phi(i,j) - enddo - endif - ! Fill the sorted_ij array with the i and j values of each cell. ! 
Note: These are the i and j values we would have if there were no halo cells.
 do k = 1, nlocal
@@ -1600,6 +1583,391 @@ subroutine get_flux_fraction(&
 
 end subroutine get_flux_fraction
 
+!==============================================================
+
+ subroutine route_flux_to_margin_or_halo_real8(&
+ nx, ny, nlocal, &
+ itest, jtest, rtest, count, &
+ parallel, &
+ sorted_ij, &
+ local_mask, &
+ halo_mask, &
+ bwat_mask, &
+ flux_fraction, &
+ btemp_weight, &
+ bwatflx, &
+ bwatflx_accum, &
+ bwatflx_refreeze_accum, &
+ finished)
+
+ ! Given the input bwatflx, route the water downstream, keeping track of fluxes along the way.
+ ! The loop goes from highest to lowest values of 'head' on the local processor.
+ ! At the end of the loop, all the incoming flux has either been
+ ! (1) routed to the ice sheet margin;
+ ! (2) set aside for later refreezing; or
+ ! (3) routed to a halo cell, from which it continues downstream the next time the subroutine is called.
+ ! The subroutine is called iteratively until no water remains in halo cells.
+
+ implicit none
+
+ ! Input/output variables
+
+ integer, intent(in) :: &
+ nx, ny, & ! number of cells in each direction
+ nlocal, & ! number of locally owned grid cells on the processor
+ itest, jtest, rtest, & ! coordinates of diagnostic point
+ count ! iteration count (diagnostic only)
+
+ type(parallel_type), intent(in) :: &
+ parallel ! info for parallel communication
+
+ integer, dimension(nlocal,2), intent(in) :: &
+ sorted_ij ! i and j indices of each local cell, sorted low to high
+
+ integer, dimension(nx,ny), intent(in) :: &
+ local_mask, & ! = 1 for cells owned by the local processor, else = 0
+ halo_mask, & ! = 1 for the layer of halo cells adjacent to locally owned cells, else = 0
+ bwat_mask ! = 1 for cells through which basal water is routed; excludes floating and ocean cells
+
+ real(dp), dimension(-1:1,-1:1,nx,ny), intent(in) :: &
+ flux_fraction ! 
fraction of flux from a cell that flows downhill to each of 8 neighbors
+                            ! last two indices identify the source cell;
+                            ! 1st two indices give relative location of receiving cell
+
+    real(dp), dimension(nx,ny), intent(in) :: &
+         btemp_weight            ! temperature-dependent weighting factor, favoring flow where the bed is thawed
+
+    real(dp), dimension(nx,ny), intent(inout) :: &
+         bwatflx,                & ! on input: water flux (m^3/s) to be routed to the margin or halo
+                                   ! on output: flux routed to halo, to be routed further next time
+         bwatflx_accum,          & ! cumulative bwatflx (m^3/s) over multiple iterations
+         bwatflx_refreeze_accum    ! cumulative bwatflx_refreeze (m^3/s) over multiple iterations
+
+    logical, intent(inout) :: &
+         finished                ! initially F; set to T when all water has been routed as far as it can go
+
+    ! Local variables
+
+    integer :: i, j, k
+    integer :: ii, jj, ip, jp
+
+    real(dp), dimension(-1:1,-1:1,nx,ny):: &
+         bwatflx_halo            ! flux routed to halo cells
+                                 ! last two indices identify the source cell;
+                                 ! 1st two indices give relative location of receiving cell
+
+    real(dp), dimension(nx,ny) :: &
+         bwatflx_refreeze,     & ! flux (m^3/s) saved for later refreezing; not routed further downstream
+         sum_bwatflx_halo        ! bwatflx_halo summed over the first 2 indices
+
+    real(dp) :: &
+         flx_thru,             & ! flux (m^3/s) that continues downstream
+         global_halo_sum         ! global sum of water flux in halo cells
+
+    ! Initialize fluxes
+    bwatflx_refreeze = 0.0d0
+    bwatflx_halo = 0.0d0
+
+    ! loop from high to low values on the local processor
+    do k = nlocal, 1, -1
+
+       ! Get i and j indices of current cell
+       i = sorted_ij(k,1)
+       j = sorted_ij(k,2)
+
+       if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then
+
+          ! Distribute the flux to downstream neighbors.
+          ! Based on the temperature-dependent weighting factor btemp_weight, all or part of the flux
+          ! is refrozen in place instead of being routed downstream.
+ flx_thru = bwatflx(i,j) * btemp_weight(i,j) + bwatflx_refreeze(i,j) = bwatflx(i,j) * (1.0d0 - btemp_weight(i,j)) + do jj = -1,1 + do ii = -1,1 + ip = i + ii + jp = j + jj + if (flux_fraction(ii,jj,i,j) > 0.0d0) then + if (halo_mask(ip,jp) == 1) then + bwatflx_halo(ii,jj,i,j) = flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to halo, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + elseif (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to neighbor, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + endif + endif ! flux_fraction > 0 + enddo ! ii + enddo ! jj + endif ! bwat_mask = 1, bwatflx > 0 + enddo ! loop from high to low + + ! Accumulate the fluxes in the output arrays + bwatflx_accum = bwatflx_accum + bwatflx + bwatflx_refreeze_accum = bwatflx_refreeze_accum + bwatflx_refreeze + + ! Compute the total bwatflx in halo cells + do j = 1, ny + do i = 1, nx + sum_bwatflx_halo(i,j) = sum(bwatflx_halo(:,:,i,j)) + enddo + enddo + global_halo_sum = parallel_global_sum(sum_bwatflx_halo, parallel) + + ! If bwatflx_halo = 0 everywhere, then we are done. + ! Where bwatflx_halo is nonzero, communicate it to the neighboring task. + ! It will be routed further downstream the next time this subroutine is called. + + if (global_halo_sum > 0.0d0) then + + if (verbose_bwat .and. count <= 2) then + if (this_rank == rtest) write(iulog,*) 'Before halo update, global_halo_sum:', global_halo_sum + call point_diag(sum_bwatflx_halo, 'sum_bwatflx_halo', itest, jtest, rtest, 7, 7) + endif + + ! Reset bwatflx to zero for the halo transfer + bwatflx = 0.0d0 + + ! 
Communicate bwatflx_halo to the halo cells of neighboring processors
+       call parallel_halo(bwatflx_halo(:,:,:,:), parallel)
+
+       ! bwatflx_halo is now available in the halo cells of the local processor.
+       ! Route downslope to the adjacent locally owned cells.
+       ! These fluxes will be routed further downstream during the next iteration.
+
+       do j = 2, ny-1
+          do i = 2, nx-1
+             if (halo_mask(i,j) == 1 .and. sum(bwatflx_halo(:,:,i,j)) > 0.0d0) then
+                do jj = -1,1
+                   do ii = -1,1
+                      if (bwatflx_halo(ii,jj,i,j) > 0.0d0) then
+                         ip = i + ii
+                         jp = j + jj
+                         if (local_mask(ip,jp) == 1) then
+                            bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx_halo(ii,jj,i,j)
+                            if (verbose_bwat .and. ip==itest .and. jp==jtest .and. this_rank==rtest .and. count <= 2) then
+                               write(iulog,*) 'Nonzero bwatflx from halo, rank, i, j:', &
+                                    this_rank, ip, jp, bwatflx_halo(ii,jj,i,j)
+                            endif
+                         endif
+                      endif   ! bwatflx_halo > 0 to a local cell
+                   enddo   ! ii
+                enddo   ! jj
+             endif   ! bwatflx_halo > 0 from this halo cell
+          enddo   ! i
+       enddo   ! j
+
+    else
+
+       finished = .true.   ! no water in halo cells to route further
+
+    endif   ! global_halo_sum > 0
+
+  end subroutine route_flux_to_margin_or_halo_real8
+
+!==============================================================
+
+  subroutine route_flux_to_margin_or_halo_integer8(&
+       nx, ny, nlocal, &
+       itest, jtest, rtest, count, &
+       parallel, &
+       sorted_ij, &
+       local_mask, &
+       halo_mask, &
+       bwat_mask, &
+       flux_fraction, &
+       btemp_weight, &
+       bwatflx, &
+       bwatflx_accum, &
+       bwatflx_refreeze_accum, &
+       finished)
+
+    ! Given the input bwatflx, route the water downstream, keeping track of fluxes along the way.
+    ! The loop goes from highest to lowest values of 'head' on the local processor.
+    ! At the end of the loop, all the incoming flux has either been
+    ! (1) routed to the ice sheet margin;
+    ! (2) set aside for later refreezing; or
+    ! (3) routed to a halo cell, from which it continues downstream the next time the subroutine is called.
+    ! 
The subroutine is called iteratively until no water remains in halo cells.
+
+    implicit none
+
+    ! Input/output variables
+
+    integer, intent(in) :: &
+         nx, ny,              & ! number of cells in each direction
+         nlocal,              & ! number of locally owned grid cells on the processor
+         itest, jtest, rtest, & ! coordinates of diagnostic point
+         count                  ! iteration count (diagnostic only)
+
+    type(parallel_type), intent(in) :: &
+         parallel               ! info for parallel communication
+
+    integer, dimension(nlocal,2), intent(in) :: &
+         sorted_ij              ! i and j indices of each local cell, sorted low to high
+
+    integer, dimension(nx,ny), intent(in) :: &
+         local_mask,          & ! = 1 for cells owned by the local processor, else = 0
+         halo_mask,           & ! = 1 for the layer of halo cells adjacent to locally owned cells, else = 0
+         bwat_mask              ! = 1 for cells through which basal water is routed; excludes floating and ocean cells
+
+    ! Note: Both flux_fraction and btemp_weight are constrained to be 0 or 1.
+    !       This means that the routing is limited to D8 (all the flux goes to one downstream cell),
+    !       and partial refreezing is not allowed (i.e., btemp_weight = 1 everywhere).
+    !       Thus, btemp_weight is not needed, but I kept it to keep the code similar to the subroutine above.
+    !       We could make refreezing all-or-nothing (i.e., weights of either 0 or 1), but this leads to
+    !       oscillations in bed temperature.
+    !       I thought of rescaling flux_fraction and btemp_weight to largish i8 integers (e.g., 1000)
+    !       to keep everything BFB, and then scaling back at the end. The problem is that this subroutine
+    !       may need to be called repeatedly, and each scaling would lead to larger and larger integers
+    !       that eventually exceed the i8 limit on integer size, ~10^(19).
+
+    integer(i8), dimension(-1:1,-1:1,nx,ny), intent(in) :: &
+         flux_fraction          ! fraction of flux from a cell that flows downhill to each of 8 neighbors
+                                ! last two indices identify the source cell;
+                                ! 
1st two indices give relative location of receiving cell + + integer(i8), dimension(nx,ny), intent(in) :: & + btemp_weight ! temperature-dependent weighting factor, favoring flow where the bed is thawed + + integer(i8), dimension(nx,ny), intent(inout) :: & + bwatflx, & ! on input: water flux (m^3/s * factor_bwatflx) to be routed to the margin or halo + ! on output: flux routed to halo, to be routed further next time + bwatflx_accum, & ! cumulative bwatflx (m^3/s * factor_bwatflx) over multiple iterations + bwatflx_refreeze_accum ! cumulative bwatflx_refreeze (m^3/s * factor_bwatflx) over multiple iterations + + logical, intent(inout) :: & + finished ! initially F; set to T when all water has been routed as far as it can go + + ! Local variables + + integer :: i, j, k + integer :: ii, jj, ip, jp + + ! Note: Some of the local variables are scaled by products of all three scale factors above. + integer(i8), dimension(-1:1,-1:1,nx,ny):: & + bwatflx_halo ! flux routed to halo cells + ! last two indices identify the source cell; + ! 1st two indices give relative location of receiving cell + + integer(i8), dimension(nx,ny) :: & + bwatflx_refreeze, & ! flux saved for later refreezing; not routed further downstream + sum_bwatflx_halo ! bwatflx_halo summed over the first 2 indices + + integer(i8) :: & + flx_thru, & ! flux (m^3/s) that continues downstream + global_halo_sum ! global sum of water flux in halo cells + + real(dp), dimension(nx,ny):: & + bwatflx_dp, bwatflx_halo_dp, bwatflx_refreeze_dp ! temporary dp versions of i8 arrays + + ! Initialize fluxes + bwatflx_halo = 0 + bwatflx_refreeze = 0 + + ! loop from high to low values on the local processor + do k = nlocal, 1, -1 + + ! Get i and j indices of current cell + i = sorted_ij(k,1) + j = sorted_ij(k,2) + + if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then + + ! Distribute the flux to downstream neighbors. + ! Note: If btemp_weight = 1 everwhere, there is no refreezing. 
+ flx_thru = bwatflx(i,j) * btemp_weight(i,j) + bwatflx_refreeze(i,j) = bwatflx(i,j) * (1 - btemp_weight(i,j)) + do jj = -1,1 + do ii = -1,1 + ip = i + ii + jp = j + jj + if (flux_fraction(ii,jj,i,j) > 0) then + if (halo_mask(ip,jp) == 1) then + bwatflx_halo(ii,jj,i,j) = flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to halo, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + elseif (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to neighbor, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + endif + endif ! flux_fraction > 0 + enddo ! ii + enddo ! jj + endif ! bwat_mask = 1, bwatflx > 0 + enddo ! loop from high to low + + ! Accumulate the fluxes in the output arrays + bwatflx_accum = bwatflx_accum + bwatflx + bwatflx_refreeze_accum = bwatflx_refreeze_accum + bwatflx_refreeze + + ! Compute the total bwatflx in halo cells + do j = 1, ny + do i = 1, nx + sum_bwatflx_halo(i,j) = sum(bwatflx_halo(:,:,i,j)) + enddo + enddo + global_halo_sum = parallel_global_sum(sum_bwatflx_halo, parallel) + + ! If bwatflx_halo = 0 everywhere, then we are done. + ! Where bwatflx_halo is nonzero, communicate it to the neighboring task. + ! It will be routed further downstream the next time this subroutine is called. + + if (global_halo_sum > 0) then + + if (verbose_bwat .and. count <= 2) then + if (this_rank == rtest) write(iulog,*) 'Before halo update, global_halo_sum (m^3/s):', global_halo_sum + endif + + ! Reset bwatflx to zero for the halo transfer + bwatflx = 0 + + ! Communicate bmltflx_halo to the halo cells of neighboring processors + call parallel_halo(bwatflx_halo(:,:,:,:), parallel) + + ! 
bmltflx_halo is now available in the halo cells of the local processor. + ! Route downslope to the adjacent locally owned cells. + ! These fluxes will be routed further downstream during the next iteration. + ! Note: This calculation does not use flux_fraction or btemp_weight, so no rescaling is needed at the end. + do j = 2, ny-1 + do i = 2, nx-1 + if (halo_mask(i,j) == 1 .and. sum(bwatflx_halo(:,:,i,j)) > 0) then + do jj = -1,1 + do ii = -1,1 + if (bwatflx_halo(ii,jj,i,j) > 0) then + ip = i + ii + jp = j + jj + if (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx_halo(ii,jj,i,j) + if (verbose_bwat .and. ip==itest .and. jp==jtest .and. this_rank==rtest .and. count <= 2) then + write(iulog,*) 'Nonzero bwatflx from halo, rank, i, j:', & + this_rank, ip, jp, bwatflx_halo(ii,jj,i,j) + endif + endif + endif ! bwatflx_halo > 0 to a local cell + enddo ! ii + enddo ! jj + endif ! bwatflx_halo > 0 from this halo cell + enddo ! i + enddo ! j + + else + + finished = .true. ! no water in halo cells to route further + + endif ! global_halo_sum > 0 + + end subroutine route_flux_to_margin_or_halo_integer8 + !============================================================== !+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1608,8 +1976,8 @@ end subroutine get_flux_fraction ! They are a GPL-licenced replacement for the Numerical Recipes routine indexx. ! They are not derived from any NR code, but are based on a quicksort routine by ! Michael Lamont (http://linux.wku.edu/~lamonml/kb.html), originally written - ! in C, and issued under the GNU General Public License. The conversion to - ! Fortran 90, and modification to do an index sort was done by Ian Rutt. + ! in C, and issued under the GNU General Public License. Ian Rutt did the conversion + ! to Fortran 90 and modified the algorithm to do an index sort. ! 
!+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index fb6796f7..099f4fd7 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -2285,7 +2285,6 @@ subroutine glissade_velo_higher_solve(model, & vbas(:,:) = vvel(nz,:,:) endif -!! if (verbose_beta) then if (verbose_beta .and. counter==1) then if (this_rank == rtest) write(iulog,*) 'Before calcbeta, counter =', counter call point_diag(usrf, 'usrf (m)', itest, jtest, rtest, 7, 7) @@ -2322,10 +2321,6 @@ subroutine glissade_velo_higher_solve(model, & whichbeta_limit, & itest = itest, jtest = jtest, rtest = rtest) - if (verbose_basal) then - call point_diag(beta_internal, 'After calcbeta, beta', itest, jtest, rtest, 7, 7, '(f12.0)') - endif - ! if (verbose_beta) then ! maxbeta = maxval(beta_internal(:,:)) ! maxbeta = parallel_reduce_max(maxbeta) @@ -2409,16 +2404,11 @@ subroutine glissade_velo_higher_solve(model, & !WHL - debug - BFB check if (verbose_reprosum .and. counter == 1) then - if (main_task) write(iulog,*) 'Write out matrices after assemble_stiffness_matrix' - call write_array_to_file(Auu_2d, 21, 'global_Auu1', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(Auv_2d, 22, 'global_Auv1', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(Avu_2d, 23, 'global_Avu1', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(Avv_2d, 24, 'global_Avv1', parallel, write_binary = .true., cycle_indices = .true.) - endif - - !WHL - debug - BFB check -!! if (0 == 1) then - if (verbose_reprosum) then +!! if (main_task) write(iulog,*) 'Write out matrices after assemble_stiffness_matrix' +!! call write_array_to_file(Auu_2d, 21, 'global_Auu1', parallel, write_binary = .true., cycle_indices = .true.) +!! 
call write_array_to_file(Auv_2d, 22, 'global_Auv1', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avu_2d, 23, 'global_Avu1', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avv_2d, 24, 'global_Avv1', parallel, write_binary = .true., cycle_indices = .true.) sum_Auu(:) = parallel_global_sum_stagger(Auu_2d, nNodeNeighbors_2d, parallel) sum_Auv(:) = parallel_global_sum_stagger(Auv_2d, nNodeNeighbors_2d, parallel) sum_Avu(:) = parallel_global_sum_stagger(Avu_2d, nNodeNeighbors_2d, parallel) @@ -2552,36 +2542,6 @@ subroutine glissade_velo_higher_solve(model, & call point_diag(beta_eff_y, 'beta_eff_y', itest, jtest, rtest, 7, 7, '(e10.3)') endif - if (verbose_reprosum) then - sum_omega = parallel_global_sum_stagger(omega, parallel) - sum_stag_omega = parallel_global_sum_stagger(stag_omega, parallel) - sum_betax = parallel_global_sum_stagger(beta_eff_x, parallel) - sum_betay = parallel_global_sum_stagger(beta_eff_y, parallel) - if (main_task) then - call double_to_binary(sum_omega, binary_str) - write(iulog,*) 'Before bc_2d: sum_omega, binary_str:', sum_omega, binary_str - call double_to_binary(sum_stag_omega, binary_str) - write(iulog,*) ' sum_stag_omega, binary_str:', sum_stag_omega, binary_str - call double_to_binary(sum_betax, binary_str) - write(iulog,*) ' sum_betax, binary_str:', sum_betax, binary_str - call double_to_binary(sum_betay, binary_str) - write(iulog,*) ' sum_betay, binary_str:', sum_betay, binary_str - endif - if (this_rank == rtest) then - do j = jtest-2, jtest+2 - do i = itest-2, itest+2 - if (i >= staggered_ilo .and. i <= staggered_ihi .and. & - j >= staggered_jlo .and. j <= staggered_jhi) then - call double_to_binary(stag_omega(i,j), binary_str) - write(iulog,*) 'i, j, stag_omega:', i, j, binary_str - call double_to_binary(beta_eff_x(i,j), binary_str) - write(iulog,*) ' beta_eff_x:', i, j, binary_str - endif - enddo - enddo - endif - endif ! 
verbose_reprosum - if (diva_slope_correction) then ! Incorporate basal sliding boundary conditions with basal curvature, @@ -2698,14 +2658,14 @@ subroutine glissade_velo_higher_solve(model, & !WHL - debug - Write all the matrix elements and rhs elements (in binary form) to files if (verbose_reprosum .and. counter == 1) then - if (main_task) write(iulog,*) 'Write out matrices after adding BC' +!! if (main_task) write(iulog,*) 'Write out matrices after adding BC' !! call write_array_to_file(Auu_2d(:,:,5), 30, 'global_Auu2', parallel) ! diagonal terms only - call write_array_to_file(Auu_2d, 31, 'global_Auu2', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(Auv_2d, 32, 'global_Auv2', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(Avu_2d, 33, 'global_Avu2', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(Avv_2d, 34, 'global_Avv2', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(bu_2d, 35, 'global_bu2', parallel, write_binary = .true.) - call write_array_to_file(bv_2d, 36, 'global_bv2', parallel, write_binary = .true.) +!! call write_array_to_file(Auu_2d, 31, 'global_Auu2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Auv_2d, 32, 'global_Auv2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avu_2d, 33, 'global_Avu2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avv_2d, 34, 'global_Avv2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(bu_2d, 35, 'global_bu2', parallel, write_binary = .true.) +!! call write_array_to_file(bv_2d, 36, 'global_bv2', parallel, write_binary = .true.) 
endif !--------------------------------------------------------------------------- @@ -8630,11 +8590,7 @@ subroutine basal_sliding_bc_2d_diva(& if (whichassemble_beta == HO_ASSEMBLE_BETA_LOCAL) then - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(0,0,0) - else ! 2D problem - m = indxA_2d(0,0) - endif + m = indxA_2d(0,0) ! Average the lower ice surface elevation to vertices call glissade_stagger(& @@ -8656,16 +8612,16 @@ subroutine basal_sliding_bc_2d_diva(& call parallel_halo(theta_basal_slope_y, parallel) if (verbose_basal) then - call point_diag(theta_basal_slope_x*180.d0/pi, 'theta_basal_slope_x (deg)', itest, jtest, rtest, 7, 7, '(f10.0)') - call point_diag(theta_basal_slope_y*180.d0/pi, 'theta_basal_slope_y (deg)', itest, jtest, rtest, 7, 7, '(f10.0)') + call point_diag(theta_basal_slope_x*180.d0/pi, 'theta_basal_slope_x (deg)', itest, jtest, rtest, 7, 7) + call point_diag(theta_basal_slope_y*180.d0/pi, 'theta_basal_slope_y (deg)', itest, jtest, rtest, 7, 7) endif ! Sum over active vertices do j = 1, ny-1 do i = 1, nx-1 if (active_vertex(i,j)) then - Auu(i,j,m) = Auu(i,j,m) + dx*dy/vol0 * beta_eff_x(i,j) / cos(theta_basal_slope_x(i,j)) - Avv(i,j,m) = Avv(i,j,m) + dx*dy/vol0 * beta_eff_y(i,j) / cos(theta_basal_slope_y(i,j)) + Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta_eff_x(i,j) / cos(theta_basal_slope_x(i,j)) + Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta_eff_y(i,j) / cos(theta_basal_slope_y(i,j)) endif ! active_vertex enddo ! i enddo ! j @@ -8802,20 +8758,6 @@ subroutine basal_sliding_bc_2d_diva(& endif ! whichassemble_beta - if (verbose_basal .and. this_rank==rtest) then - i = itest - j = jtest - if (nNeighbors == nNodeNeighbors_3d) then ! 
3D problem - m = indxA_3d(0,0,0) - else - m = indxA_2d(0,0) - endif - write(iulog,*) ' ' - write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m - write(iulog,*) 'New Auu diagonal:', Auu(i,j,m) - write(iulog,*) 'New Avv diagonal:', Avv(i,j,m) - endif - end subroutine basal_sliding_bc_2d_diva !**************************************************************************** @@ -10871,7 +10813,7 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (verbose_matrix) then global_maxdiff = parallel_reduce_max(maxdiff) - if (maxdiff == global_maxdiff) then + if (global_maxdiff > 0.0d0 .and. maxdiff == global_maxdiff) then ! maxdiff is on this processor; compute and broadcast the global index call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) write(iulog,*) 'Max asymmetry =', global_maxdiff @@ -11077,7 +11019,7 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & if (verbose_matrix) then global_maxdiff = parallel_reduce_max(maxdiff) - if (maxdiff == global_maxdiff) then + if (global_maxdiff > 0.0d0 .and. maxdiff == global_maxdiff) then ! maxdiff is on this processor; compute and broadcast the global index call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) write(iulog,*) 'Max asymmetry =', global_maxdiff From 329fc87d051ccb301bddf6a73c3e3cf5f8f93ccc Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Wed, 14 Jan 2026 16:48:41 -0700 Subject: [PATCH 09/21] Define two distinct temperature scales for basal hydrology This commit replaces btemp_scale, a temperature scale in the flux-routing hydrology, with two distinct scales that have different functions. The two new scales are (1) btemp_flow_scale and (2) btemp_freeze_scale. btemp_flow_scale is used to weigh the likelihood of alternative downstream paths, as computed in subroutine get_flux_fraction. The scale determines a weighting factor: btemp_weight_flow = exp(-delta_Tb/btemp_flow_scale), where delta_Tb is the difference between PMP temperature and bed temperature. 
A low value of btemp_weight_flow means that water is more likely to be routed to a neighboring grid cell instead of passing through a frozen one. btemp_freeze_scale is used to compute how much of the incoming water flux refreezes locally rather than passing through. The weight is computed as: btemp_weight_freeze = exp(-delta_Tb/btemp_freeze_scale). A low value of btemp_weight_freeze means that more water refreezes in place. For both scales, a larger value makes the hydrology less sensitive to bed temperature. A zero value (the default) means that the scaling is ignored, and the flow is independent of bed temperature. Since the refreezing option is not supported for reproducible sums, I will try running the hydrology with the flow scale alone. If this works well, the refreezing option might be removed later. --- libglide/glide_setup.F90 | 19 ++-- libglide/glide_types.F90 | 6 +- libglissade/glissade.F90 | 3 +- libglissade/glissade_basal_water.F90 | 156 +++++++++++++++------------ 4 files changed, 110 insertions(+), 74 deletions(-) diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90 index 682ff56d..af481071 100644 --- a/libglide/glide_setup.F90 +++ b/libglide/glide_setup.F90 @@ -3310,7 +3310,8 @@ subroutine handle_basal_hydro(section, model) ! flux routing call GetValue(section, 'ho_flux_routing_scheme', model%basal_hydro%ho_flux_routing_scheme) call GetValue(section, 'const_source', model%basal_hydro%const_source) - call GetValue(section, 'btemp_scale', model%basal_hydro%btemp_scale) + call GetValue(section, 'btemp_flow_scale', model%basal_hydro%btemp_flow_scale) + call GetValue(section, 'btemp_freeze_scale', model%basal_hydro%btemp_freeze_scale) ! 
effective pressure options and parameters call GetValue(section, 'effecpress_delta', model%basal_hydro%effecpress_delta) @@ -3391,11 +3392,11 @@ subroutine print_basal_hydro(model) model%basal_hydro%ho_flux_routing_scheme = HO_FLUX_ROUTING_D8 call write_log(message) endif - if (model%basal_hydro%btemp_scale > 0.0d0) then + if (model%basal_hydro%btemp_freeze_scale > 0.0d0) then write(message,*) 'With reproducible sums, the flux-routing does not support refreezing;' // & - 'setting btemp_scale = 0' + ' setting btemp_freeze_scale = 0' call write_log(message) - model%basal_hydro%btemp_scale = 0.0d0 + model%basal_hydro%btemp_freeze_scale = 0.0d0 endif endif ! reproducible sums write(message,*) 'ho_flux_routing_scheme : ',model%basal_hydro%ho_flux_routing_scheme, & @@ -3405,8 +3406,14 @@ subroutine print_basal_hydro(model) write(message,*) 'constant melt source at the bed (m/yr): ', model%basal_hydro%const_source call write_log(message) endif - if (model%basal_hydro%btemp_scale > 0.0d0) then - write(message,*) 'temp scale (deg C) for frz/thaw transition: ', model%basal_hydro%btemp_scale + if (model%basal_hydro%btemp_flow_scale > 0.0d0) then + write(message,*) 'temp scale (deg C) for flow around frozen bed: ', & + model%basal_hydro%btemp_flow_scale + call write_log(message) + endif + if (model%basal_hydro%btemp_freeze_scale > 0.0d0) then + write(message,*) 'temp scale (deg C) for refreezing at the bed: ', & + model%basal_hydro%btemp_freeze_scale call write_log(message) endif if (model%options%which_ho_effecpress == HO_EFFECPRESS_BWAT) then diff --git a/libglide/glide_types.F90 b/libglide/glide_types.F90 index 2d001f06..0731616b 100644 --- a/libglide/glide_types.F90 +++ b/libglide/glide_types.F90 @@ -2158,7 +2158,11 @@ module glide_types !TODO - Add visc_water and omega_hydro? 
Currently set in glissade_basal_water module real(dp) :: const_source = 0.0d0 !> constant melt source at the bed (m/yr) !> could be used to represent an englacial or surface source - real(dp) :: btemp_scale = 0.0d0 !> temperature scale (degC) for transition between thawed and frozen bed + real(dp) :: btemp_flow_scale = 0.0d0 !> temperature scale (degC) for transition between thawed and frozen bed; + !> used to route flow away from cells with a frozen bed; + !> btemp_scale = 0 => temperature-independent flow + real(dp) :: btemp_freeze_scale = 0.0d0 !> temperature scale (degC) for transition between thawed and frozen bed; + !> used to refreeze water beneath cells with a frozen bed; !> btemp_scale = 0 => temperature-independent flow ! parameters for macroporous sheet real(dp) :: bwat_threshold = 1.0d-3 !> scale over which N ramps down from overburden to a small value (m) diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index ceae1895..be44b235 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -1941,7 +1941,8 @@ subroutine glissade_thermal_solve(model, dt) floating_mask, & model%basal_hydro%bmlt_hydro, & ! m/s model%temper%bpmp - model%temper%btemp_ground, & ! degC - model%basal_hydro%btemp_scale, & ! degC + model%basal_hydro%btemp_flow_scale, & ! degC + model%basal_hydro%btemp_freeze_scale, & ! degC model%basal_hydro%bwatflx, & ! m/s model%basal_hydro%bwat_diag, & ! m model%temper%bhydroflx, & ! W/m2 diff --git a/libglissade/glissade_basal_water.F90 b/libglissade/glissade_basal_water.F90 index 8f9e03ef..68ef2d3f 100644 --- a/libglissade/glissade_basal_water.F90 +++ b/libglissade/glissade_basal_water.F90 @@ -185,7 +185,9 @@ subroutine glissade_bwat_flux_routing(& thklim, & bwat_mask, floating_mask, & bmlt_hydro, & - delta_Tb, btemp_scale, & + delta_Tb, & + btemp_flow_scale, & + btemp_freeze_scale, & bwatflx, bwat_diag, & bhydroflx, & head, grad_head, & @@ -222,9 +224,16 @@ subroutine glissade_bwat_flux_routing(& delta_Tb ! 
difference T_pmp - T_bed (degC) real(dp), intent(in) :: & - thklim, & ! minimum ice thickness for basal melt and hydropotential calculations (m) + thklim ! minimum ice thickness for basal melt and hydropotential calculations (m) ! Note: This is typically model%geometry%thklim_temp - btemp_scale ! temperature scale for transition from frozen to thawed bed (degC) + + ! Note: These scales ensure a smooth transition in behavior between frozen and thawed beds. + ! Both scales are computed in a similar way, but they apply to different parts of the algorithm. + ! TODO: Decide whether to keep both scales. Only the flow scale works for reprosums, so we might want + ! to remove the freeze scale. + real(dp), intent(in) :: & + btemp_flow_scale, & ! temperature scale for routing water flow around cells with a frozen bed (deg C) + btemp_freeze_scale ! temperature scale for refreezing water beneath cells with a frozen bed (degC) integer, dimension(nx,ny), intent(in) :: & bwat_mask, & ! mask to identify cells through which basal water is routed; @@ -344,7 +353,8 @@ subroutine glissade_bwat_flux_routing(& head, & bmlt_hydro, & delta_Tb, & - btemp_scale, & + btemp_flow_scale, & + btemp_freeze_scale, & bwat_mask, & bwatflx, & bwatflx_refreeze, & @@ -468,7 +478,8 @@ subroutine route_basal_water(& head, & bmlt_hydro, & delta_Tb, & - btemp_scale, & + btemp_flow_scale, & + btemp_freeze_scale, & bwat_mask, & bwatflx, & bwatflx_refreeze, & @@ -507,8 +518,9 @@ subroutine route_basal_water(& delta_Tb ! difference T_pmp - T_bed (degC) real(dp), intent(in) :: & - btemp_scale ! temperature scale for transition from frozen to thawed bed (degC) - ! If btemp_scale = 0, assume no temperature dependence + btemp_flow_scale, & ! temperature scale for routing water flow around cells with a frozen bed (deg C) + btemp_freeze_scale ! temperature scale for refreezing water beneath cells with a frozen bed (degC) + ! 
If scale = 0, assume no temperature dependence real(dp), dimension(nx,ny), intent(inout) :: & head ! hydraulic head (m) @@ -543,7 +555,8 @@ subroutine route_basal_water(& real(dp), dimension(nx,ny) :: & head_filled, & ! head after depressions are filled (m) - btemp_weight, & ! temperature-dependent weighting factor, favoring flow where the bed is thawed + btemp_weight_flow, & ! temp-dependent weighting factor, forcing flow around cells with frozen beds + btemp_weight_freeze, & ! temp-dependent weighting factor, favoring refreezing in cells with frozen beds bwatflx_accum, & ! water flux through the cell (m^3/s) accumulated over multiple iterations bwatflx_refreeze_accum ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations @@ -563,7 +576,7 @@ subroutine route_basal_water(& ! The following i8 variables are for computing reproducible sums integer(i8), dimension(nx,ny) :: & bwatflx_int, & ! water flux through a grid cell (m^3/s) - btemp_weight_int, & ! temperature-dependent weighting factor, favoring flow where the bed is thawed + btemp_weight_freeze_int, & ! temp-dependent weighting factor, favoring refreezing in cells with frozen beds bwatflx_accum_int, & ! water flux through the cell (m^3/s) accumulated over multiple iterations bwatflx_refreeze_accum_int ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations @@ -660,22 +673,33 @@ subroutine route_basal_water(& enddo enddo - ! Compute a temperature-dependent weighting factor for flux routing. - ! This is used in two parts of the code: - ! (1) In subroutine get_flux_fraction, btemp_weight is used to weight potential downstream paths. - ! A small value of btemp_weight means that a cell is less likely to receive water from upstream. - ! (2) When water enters a frozen cell (delta_Tb > 0), btemp_weight is used to determine - ! how much of the flux is refrozen in place rather than passing through. - ! A low value of btemp_weight means that less water passes through. - ! 
Note: For reproducible sums, refreezing is not supported; must have btemp_weight = 1 everywhere. - - btemp_weight = 1.0d0 + ! Compute temperature-dependent weighting factors for flux routing. + ! There are two scales with related but distinct functions: + ! (1) In subroutine get_flux_fraction, btemp_flow_scale is used to weigh potential downstream paths. + ! A low value of btemp_weight_flow means that water is less likely to pass through. + ! (2) When water enters a frozen cell (delta_Tb > 0), btemp_freeze_scale determines + ! how much of the flux refreezes in place rather than passing through. + ! A small value of btemp_weight_freeze means that more water refreezes, and less passes through. + ! Note: For reproducible sums, refreezing is not supported; must have btemp_weight_freeze = 1 everywhere. + ! TODO: Possibly remove btemp_freeze_scale and just keep btemp_flow_scale. + + btemp_weight_flow = 1.0d0 + if (btemp_flow_scale > 0.0d0) then + if (.not. reprosum) then + where (bwat_mask == 1) + where (delta_Tb > 0.0d0) + btemp_weight_flow = exp(-delta_Tb/btemp_flow_scale) + endwhere + endwhere + endif + endif - if (btemp_scale > 0.0d0) then + btemp_weight_freeze = 1.0d0 + if (btemp_freeze_scale > 0.0d0) then if (.not. reprosum) then where (bwat_mask == 1) where (delta_Tb > 0.0d0) - btemp_weight = exp(-delta_Tb/btemp_scale) + btemp_weight_freeze = exp(-delta_Tb/btemp_freeze_scale) endwhere endwhere endif @@ -683,7 +707,8 @@ subroutine route_basal_water(& if (verbose_bwat) then call point_diag(delta_Tb, 'Tpmp - Tb', itest, jtest, rtest, 7, 7) - call point_diag(btemp_weight, 'btemp_weight', itest, jtest, rtest, 7, 7) + call point_diag(btemp_weight_flow, 'btemp_weight_flow', itest, jtest, rtest, 7, 7) + call point_diag(btemp_weight_freeze, 'btemp_weight_freeze', itest, jtest, rtest, 7, 7) endif ! Compute the fraction of the incoming flux sent to each downstream neighbor. 
@@ -695,7 +720,7 @@ subroutine route_basal_water(& flux_routing_scheme, & sorted_ij, & head, & - btemp_weight, & + btemp_weight_flow, & bwat_mask, & flux_fraction) @@ -745,12 +770,12 @@ subroutine route_basal_water(& ! to a single cell. flux_fraction_int(:,:,:,:) = nint(flux_fraction(:,:,:,:), i8) - ! Convert btemp_weight to i8 - btemp_weight_int(:,:) = 1 + ! Convert btemp_weight_freeze to i8 + btemp_weight_freeze_int(:,:) = 1 ! Note: Can round up to 1 and down to 0 by uncommenting the following line. ! However, a mix of 1's and 0's leads to oscillations in basal temperature, - ! so it is safer to turn off refreezing by setting btemp_weight = 1 everywhere. -! btemp_weight_int(:,:) = nint(btemp_weight(:,:), i8) + ! so it is safer to turn off refreezing by setting btemp_weight_freeze = 1 everywhere. +! btemp_weight_freeze_int(:,:) = nint(btemp_weight_freeze(:,:), i8) ! Initialize other arrays bwatflx_accum_int = 0 @@ -776,17 +801,17 @@ subroutine route_basal_water(& ! Note: The fluxes are scaled by factor_bwatflx call route_flux_to_margin_or_halo(& - nx, ny, nlocal, & + nx, ny, nlocal, & itest, jtest, rtest, count, & - parallel, & - sorted_ij, & - local_mask, & - halo_mask, & - bwat_mask, & - flux_fraction_int, & - btemp_weight_int, & - bwatflx_int, & - bwatflx_accum_int, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction_int, & + btemp_weight_freeze_int, & + bwatflx_int, & + bwatflx_accum_int, & bwatflx_refreeze_accum_int, & finished) @@ -808,7 +833,7 @@ subroutine route_basal_water(& halo_mask, & bwat_mask, & flux_fraction, & - btemp_weight, & + btemp_weight_freeze, & bwatflx, & bwatflx_accum, & bwatflx_refreeze_accum, & @@ -1334,7 +1359,7 @@ subroutine get_flux_fraction(& flux_routing_scheme, & sorted_ij, & head, & - btemp_weight, & + btemp_weight_flow, & bwat_mask, & flux_fraction) @@ -1364,7 +1389,7 @@ subroutine get_flux_fraction(& real(dp), dimension(nx,ny), intent(in) :: & head, & ! hydraulic head (m) - btemp_weight ! 
temperature-dependent weighting factor, favoring flow where the bed is thawed + btemp_weight_flow ! temperature-dependent weighting factor, forcing flow around cells with frozen beds integer, dimension(nx,ny), intent(in) :: & bwat_mask ! = 1 for cells in the region where basal water fluxes can be nonzero @@ -1423,7 +1448,7 @@ subroutine get_flux_fraction(& jp = j + jj if (ip >= 1 .and. ip <= nx .and. jp > 1 .and. jp <= ny) then if (head(ip,jp) < head(i,j)) then - slope(ii,jj) = btemp_weight(ip,jp) * (head(i,j) - head(ip,jp)) / dists(ii,jj) + slope(ii,jj) = btemp_weight_flow(ip,jp) * (head(i,j) - head(ip,jp)) / dists(ii,jj) endif endif endif @@ -1594,7 +1619,7 @@ subroutine route_flux_to_margin_or_halo_real8(& halo_mask, & bwat_mask, & flux_fraction, & - btemp_weight, & + btemp_weight_freeze, & bwatflx, & bwatflx_accum, & bwatflx_refreeze_accum, & @@ -1635,7 +1660,7 @@ subroutine route_flux_to_margin_or_halo_real8(& ! 1st two indices give relative location of receiving cell real(dp), dimension(nx,ny), intent(in) :: & - btemp_weight ! temperature-dependent weighting factor, favoring flow where the bed is thawed + btemp_weight_freeze ! temperature-dependent weighting factor, favoring refreezing at frozen beds real(dp), dimension(nx,ny), intent(inout) :: & bwatflx, & ! on input: water flux (m^3/s) to be routed to the margin or halo @@ -1678,10 +1703,10 @@ subroutine route_flux_to_margin_or_halo_real8(& if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then ! Distribute the flux to downstream neighbors. - ! Based on the temperature-dependent weighting factor btemp_weight, all or part of the flux + ! Based on the temperature-dependent weighting factor btemp_weight_freeze, all or part of the flux ! is refrozen in place instead of being routed downstream. 
- flx_thru = bwatflx(i,j) * btemp_weight(i,j) - bwatflx_refreeze(i,j) = bwatflx(i,j) * (1.0d0 - btemp_weight(i,j)) + flx_thru = bwatflx(i,j) * btemp_weight_freeze(i,j) + bwatflx_refreeze(i,j) = bwatflx(i,j) * (1.0d0 - btemp_weight_freeze(i,j)) do jj = -1,1 do ii = -1,1 ip = i + ii @@ -1772,18 +1797,18 @@ end subroutine route_flux_to_margin_or_halo_real8 !============================================================== subroutine route_flux_to_margin_or_halo_integer8(& - nx, ny, nlocal, & + nx, ny, nlocal, & itest, jtest, rtest, count, & - parallel, & - sorted_ij, & - local_mask, & - halo_mask, & - bwat_mask, & - flux_fraction, & - btemp_weight, & - bwatflx, & - bwatflx_accum, & - bwatflx_refreeze_accum, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction, & + btemp_weight_freeze, & + bwatflx, & + bwatflx_accum, & + bwatflx_refreeze_accum, & finished) ! Given the input bwatflx, route the water downstream, keeping track of fluxes along the way. @@ -1815,13 +1840,13 @@ subroutine route_flux_to_margin_or_halo_integer8(& halo_mask, & ! = 1 for the layer of halo cells adjacent to locally owned cells, else = 0 bwat_mask ! = 1 for cells through which basal water is routed; excludes floating and ocean cells - ! Note: Both flux_fraction and btemp_weight are constrained to be 0 or 1. + ! Note: Both flux_fraction and btemp_weight_freeze are constrained to be 0 or 1. ! This means that the routing is limited to D8 (all the flux goes to one downstream cell), - ! and partial refreezing is not allowed (i.e., btemp_weight = 1 everywhere). - ! Thus, btemp_weight is not needed, but I kept it to keep the code similar to the subroutine above. + ! and partial refreezing is not allowed (i.e., btemp_weight_freeze = 1 everywhere). + ! Thus, btemp_weight_freeze is not needed, but I kept it to keep the code similar to the subroutine above. ! We could make refreezing all-or-nothing (i.e., weights of either 0 or 1), but this leads to ! 
oscillations in bed temperature. - ! I thought of rescaling flux_fraction and btemp_weight to largish i8 integers (e.g., 1000) + ! I thought of rescaling flux_fraction and btemp_weight_freeze to largish i8 integers (e.g., 1000) ! to keep everything BFB, and then scaling back at the end. The problem is that this subroutine ! may need to be called repeatedly, and each scaling would lead to larger and larger integers ! that eventually exceed the i8 limit on integer size, ~10^(19). @@ -1832,7 +1857,7 @@ subroutine route_flux_to_margin_or_halo_integer8(& ! 1st two indices give relative location of receiving cell integer(i8), dimension(nx,ny), intent(in) :: & - btemp_weight ! temperature-dependent weighting factor, favoring flow where the bed is thawed + btemp_weight_freeze ! temperature-dependent weighting factor, favoring refreezing at frozen beds integer(i8), dimension(nx,ny), intent(inout) :: & bwatflx, & ! on input: water flux (m^3/s * factor_bwatflx) to be routed to the margin or halo @@ -1879,9 +1904,9 @@ subroutine route_flux_to_margin_or_halo_integer8(& if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then ! Distribute the flux to downstream neighbors. - ! Note: If btemp_weight = 1 everwhere, there is no refreezing. - flx_thru = bwatflx(i,j) * btemp_weight(i,j) - bwatflx_refreeze(i,j) = bwatflx(i,j) * (1 - btemp_weight(i,j)) + ! Note: If btemp_weight_freeze = 1 everwhere, there is no refreezing. + flx_thru = bwatflx(i,j) * btemp_weight_freeze(i,j) + bwatflx_refreeze(i,j) = bwatflx(i,j) * (1 - btemp_weight_freeze(i,j)) do jj = -1,1 do ii = -1,1 ip = i + ii @@ -1937,7 +1962,6 @@ subroutine route_flux_to_margin_or_halo_integer8(& ! bmltflx_halo is now available in the halo cells of the local processor. ! Route downslope to the adjacent locally owned cells. ! These fluxes will be routed further downstream during the next iteration. - ! Note: This calculation does not use flux_fraction or btemp_weight, so no rescaling is needed at the end. 
do j = 2, ny-1 do i = 2, nx-1 if (halo_mask(i,j) == 1 .and. sum(bwatflx_halo(:,:,i,j)) > 0) then From b1bf020e2158cc8a01f40dd5c9bf17ee59db6adc Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Fri, 16 Jan 2026 18:51:54 -0700 Subject: [PATCH 10/21] Changed the initialization logic for C_c and C_p This commit makes a small change in the way coulomb_c and powerlaw_c are initialized. The old logic, called from glissade_initialise after input files are read in, was to set C_c to a constant on initialization based on the following logic: if (model%options%which_ho_coulomb_c == HO_COULOMB_C_CONSTANT .or. & model%options%is_restart == NO_RESTART) then if (model%options%elevation_based_coulomb_c) then [stuff] else model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const endif endif And similarly for powerlaw_c. This is problematic for CESM coupled runs, which may read C_c from an input file created by a previous CISM standalone spinup. When the CESM run is not a restart run, the logic above sets C_c to a constant instead of using the values from the input file. I changed the logic so that (1) For option HO_COULOMB_C_CONSTANT, coulomb_c is set to coulomb_c_const. (2) For elevation-based coulomb_c, check whether coulomb_c_hi or coulomb_c_lo = 0 everywhere. If so, these fields are set to prescribed constants, and then elevation-based coulomb_c is computed. (3) For coulomb_c not based on elevation, check whether coulomb_c = 0 everywhere. If so, coulomb_c is set to coulomb_c_const. In this way, we avoid overwriting coulomb_c (or coulomb_c_hi and coulomb_c_lo) if these fields have already been read in. 
--- libglissade/glissade.F90 | 53 +++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index be44b235..2f3acb4f 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -943,7 +943,7 @@ subroutine glissade_initialise(model, evolve_ice) !TODO: Have a single option that is applied with or without glaciers enabled? if (model%options%which_ho_powerlaw_c == HO_POWERLAW_C_CONSTANT .or. & - model%options%is_restart == NO_RESTART) then + parallel_is_zero(model%basal_physics%powerlaw_c)) then if (model%options%enable_glaciers .and. & model%glacier%set_powerlaw_c /= GLACIER_POWERLAW_C_CONSTANT) then ! do nothing; see note above @@ -953,15 +953,29 @@ subroutine glissade_initialise(model, evolve_ice) endif ! Initialize coulomb_c - ! If inverting for coulomb_c, we read in the saved coulomb_c field on restart. - if (model%options%which_ho_coulomb_c == HO_COULOMB_C_CONSTANT .or. & - model%options%is_restart == NO_RESTART) then + ! Note: If inverting for coulomb_c, then coulomb_c is initialized here. + ! On restart, however, the saved coulomb_c (or alternatively, + ! coulomb_c_hi and coulomb_c_lo, for the elevation-based option) + ! should have been read from the restart file and is not reset here. - !TODO - Make sure the initialization is correct when reading from an external file in a restart. - if (model%options%elevation_based_coulomb_c) then + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_CONSTANT) then - model%basal_physics%coulomb_c_hi = model%basal_physics%coulomb_c_const_hi - model%basal_physics%coulomb_c_lo = model%basal_physics%coulomb_c_const_lo + model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const + + else ! either inverting for coulomb_c or reading values from an input file + + if (model%options%elevation_based_coulomb_c) then ! need coulomb_c_hi and coulomb_c_lo + + if (parallel_is_zero(model%basal_physics%coulomb_c_hi) .or. 
& + parallel_is_zero(model%basal_physics%coulomb_c_lo)) then + + ! initialize to constants + model%basal_physics%coulomb_c_hi = model%basal_physics%coulomb_c_const_hi + model%basal_physics%coulomb_c_lo = model%basal_physics%coulomb_c_const_lo + + endif + + ! Given coulomb_c_hi and coulomb_c_lo, compute coulomb_c based on elevation call glissade_elevation_based_coulomb_c(& model%general%ewn, model%general%nsn, & @@ -973,17 +987,24 @@ subroutine glissade_initialise(model, evolve_ice) model%basal_physics%coulomb_c_bed_hi, & model%basal_physics%coulomb_c) - call parallel_halo(model%basal_physics%coulomb_c, parallel) + else ! coulomb_c not elevation-based + + if (parallel_is_zero(model%basal_physics%coulomb_c)) then + + ! initialize to constant + model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const - if (verbose_inversion) then - call point_diag(model%basal_physics%coulomb_c, 'Initial coulomb_c', itest, jtest, rtest, 7, 7) endif - else - model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const + endif ! elevation-based + + call parallel_halo(model%basal_physics%coulomb_c, parallel) + + if (verbose_inversion) then + call point_diag(model%basal_physics%coulomb_c, 'Initial coulomb_c', itest, jtest, rtest, 7, 7) endif - endif + endif ! coulomb_c options ! Optionally, do initial calculations for inversion ! 
At the start of the run (but not on restart), this might lead to further thickness adjustments, @@ -1771,9 +1792,6 @@ subroutine glissade_thermal_solve(model, dt) use glissade_therm, only: glissade_therm_driver use glissade_basal_water, only: glissade_calcbwat, glissade_bwat_flux_routing use glissade_masks, only: glissade_get_masks - !WHL - debug - use cism_parallel, only: parallel_reduce_max - use glissade_utils, only: write_array_to_file implicit none @@ -2446,6 +2464,7 @@ subroutine glissade_calving_solve(model, init_calving) use glissade_masks, only: glissade_get_masks, glissade_ocean_connection_mask, & glissade_calving_front_mask use glissade_grounding_line, only: glissade_grounded_fraction + implicit none type(glide_global_type), intent(inout) :: model ! model instance From 656981788a06c86e8ffcb9d19b5584d54f890575 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Sat, 17 Jan 2026 10:35:15 -0700 Subject: [PATCH 11/21] Reprosum abort logic for tridiagonal preconditioners This commit adds logic to abort the run if the user requests local tridiagonal or global tridiagonal preconditioning along with reproducible sums. I looked at both preconditioners and concluded that it's not feasible to make either of them independent of the number of processors. For the global tridiag preconditioner, the issue is that the algorithm computes coefficients that connect adjacent processors on a given row or column. If the block layout is different, then these coefficients will be different and answers will differ at roundoff level, even though we are solving the same global equations. For local tridiag preconditioner, a different block layout means that we are solving an entirely different set of tridiagonal systems, ensuring that answers will differ. With reprosums enabled, the user will therefore need to choose the simpler diagonal preconditioner (or the SIA preconditioner for the 3D solvers). 
This could result in slower throughput, but not enough slower to be a big problem, especially in CESM coupled runs where the ice sheet model is much cheaper than other components. I also fixed a minor bug in the subroutine that computes glacier AARs. I introduced the bug when I inserted calls to the parallel_global_sum_patch subroutine. The AARs are now correct again. This fix doesn't change prognostic results, since the AAR is just a diagnostic. --- libglide/glide_setup.F90 | 9 ++++++++- libglissade/glissade_glacier.F90 | 8 ++++---- libglissade/glissade_velo_higher_pcg.F90 | 3 ++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90 index af481071..28f73706 100644 --- a/libglide/glide_setup.F90 +++ b/libglide/glide_setup.F90 @@ -2197,8 +2197,15 @@ subroutine print_options(model) if (model%options%whichdycore == DYCORE_GLISSADE .and. & (model%options%which_ho_sparse == HO_SPARSE_PCG_STANDARD .or. & model%options%which_ho_sparse == HO_SPARSE_PCG_CHRONGEAR) ) then + if (model%options%reproducible_sums) then + if (model%options%which_ho_precond == HO_PRECOND_TRIDIAG_LOCAL .or. & + model%options%which_ho_precond == HO_PRECOND_TRIDIAG_GLOBAL) then + call write_log ('Tridiagonal preconditioners are not supported with reproducible sums.') + call write_log ('Please choose a different preconditioner (e.g., diagonal)', GM_FATAL) + endif + endif write(message,*) 'ho_whichprecond : ',model%options%which_ho_precond, & - ho_whichprecond(model%options%which_ho_precond) + ho_whichprecond(model%options%which_ho_precond) call write_log(message) if (model%options%which_ho_precond < 0 .or. 
model%options%which_ho_precond >= size(ho_whichprecond)) then call write_log('Error, glissade preconditioner out of range', GM_FATAL) diff --git a/libglissade/glissade_glacier.F90 b/libglissade/glissade_glacier.F90 index 3d6d2641..ee9e1657 100644 --- a/libglissade/glissade_glacier.F90 +++ b/libglissade/glissade_glacier.F90 @@ -3362,20 +3362,20 @@ subroutine glacier_accumulation_area_ratio(& ! Note: Grid cells with SMB = 0 are not counted in either zone. where (cism_glacier_id > 0 .and. smb > 0.0d0) - glacier_id = 1 + glacier_id = cism_glacier_id elsewhere glacier_id = 0 endwhere - accum_area = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id, parallel) + accum_area = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) where (cism_glacier_id > 0 .and. smb < 0.0d0) - glacier_id = 1 + glacier_id = cism_glacier_id elsewhere glacier_id = 0 endwhere - ablat_area = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id, parallel) + ablat_area = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) ! Compute the AAR for each glacier diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 6d162af7..2c2cc482 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -2341,7 +2341,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & zu(i,j) = 0.d0 endif if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) + zv(i,j) = rv(i,j) / Adiagv(i,j) else zv(i,j) = 0.d0 endif @@ -3821,6 +3821,7 @@ subroutine setup_preconditioner_tridiag_global_2d(ilocal, jlocal, & !WHL - debug if (verbose_tridiag .and. 
this_rank == rtest) then + write(iulog,*) write(iulog,*) 'In setup_preconditioner_tridiag_global_2d: itest, jtest, rtest =', itest, jtest, rtest endif From d4758c71f8a0322e8453c82d153983fbca2de441 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Sat, 17 Jan 2026 11:04:40 -0700 Subject: [PATCH 12/21] Removed infnan logic from the reprosum module Pat Worley's original code uses the shared module shr_infnan_mod. This module supports logic in the reprosum algorithm to ignore infinities and NaNs in the input arrays. When implementing reprosums in CISM, I carried along a CISM version of this module, cism_infnan_mod. However, CISM arrays passed to the reprosum module should not contain infinities or NaNs, and I would prefer not to carry along a module that is never used. This commit removes cism_infnan_mod.F90, along with the infnan-related logic and use statements in cism_reprosum_mod. Tony Craig did the same thing for the CICE reprosum module: https://github.com/ESCOMP/CICE/blob/main/cicecore/cicedyn/infrastructure/ comm/mpi/ice_reprosum.F90 The CISM version of the reprosum module still contains some subroutines that aren't now called from CISM. I am leaving these subroutines in place in case they turn out to be useful later. --- libglimmer/cism_infnan_mod.F90 | 423 ------------------------------- libglimmer/cism_reprosum_mod.F90 | 188 ++------------ libglimmer/parallel_mpi.F90 | 4 - 3 files changed, 21 insertions(+), 594 deletions(-) delete mode 100644 libglimmer/cism_infnan_mod.F90 diff --git a/libglimmer/cism_infnan_mod.F90 b/libglimmer/cism_infnan_mod.F90 deleted file mode 100644 index 124b051f..00000000 --- a/libglimmer/cism_infnan_mod.F90 +++ /dev/null @@ -1,423 +0,0 @@ -! Flag representing compiler support of Fortran 2003's -! ieee_arithmetic intrinsic module. -#if defined CPRIBM || defined CPRPGI || defined CPRINTEL || defined CPRCRAY || defined CPRNAG -#define HAVE_IEEE_ARITHMETIC -#endif - -!WHL, Nov. 
2025: Adapted from shr_infnan_mod.F90, part of CESM shared code -! Changed 'shr' to 'cism' to avoid name conflicts with shared code -! I kept only the r8 interfaces (not r4, i8 or i4). -! Also, I assumed that all input arrays are 1d. - -module cism_infnan_mod -!--------------------------------------------------------------------- -! Module to test for IEEE Inf and NaN values, which also provides a -! method of setting +/-Inf and signaling or quiet NaN. -! -! All functions are elemental, and thus work on arrays. -!--------------------------------------------------------------------- -! To test for these values, just call the corresponding function, e.g: -! -! var_is_nan = cism_infnan_isnan(x) -! -! You can also use it on arrays: -! -! array_contains_nan = any(cism_infnan_isnan(my_array)) -! -!--------------------------------------------------------------------- -! To generate these values, assign one of the provided derived-type -! variables to a real: -! -! use cism_infnan_mod, only: nan => cism_infnan_nan, & -! inf => cism_infnan_inf, & -! assignment(=) -! real(r4) :: my_nan -! real(r8) :: my_inf_array(2,2) -! my_nan = nan -! my_inf_array = inf -! -! Keep in mind that "cism_infnan_nan" and "cism_infnan_inf" cannot be -! passed to functions that expect real arguments. To pass a real -! NaN, you will have to use cism_infnan_nan to set a local real of -! the correct kind. -!--------------------------------------------------------------------- - - use glimmer_global, only: r4 => sp, r8 => dp - use glimmer_global, only: i4, i8 -!!use shr_kind_mod, only: & -!! r4 => SHR_KIND_R4, & -!! r8 => SHR_KIND_R8 - -#ifdef HAVE_IEEE_ARITHMETIC - -! If we have IEEE_ARITHMETIC, the NaN test is provided for us. -use, intrinsic :: ieee_arithmetic, only: & - cism_infnan_isnan => ieee_is_nan - -#else - -! Integers of correct size for bit patterns below. -!!use shr_kind_mod, only: i4 => shr_kind_i4, i8 => shr_kind_i8 - -#endif - -implicit none -private -save - -! 
Test functions for NaN/Inf values. -public :: cism_infnan_isnan -public :: cism_infnan_isinf -public :: cism_infnan_isposinf -public :: cism_infnan_isneginf - -! Locally defined isnan. -#ifndef HAVE_IEEE_ARITHMETIC -interface cism_infnan_isnan - ! TYPE double,real - module procedure cism_infnan_isnan_r8 -end interface -#endif - -interface cism_infnan_isinf - ! TYPE double,real - module procedure cism_infnan_isinf_r8 -end interface - -interface cism_infnan_isposinf - ! TYPE double,real - module procedure cism_infnan_isposinf_r8 -end interface - -interface cism_infnan_isneginf - ! TYPE double,real - module procedure cism_infnan_isneginf_r8 -end interface - -! Derived types for generation of NaN/Inf -! Even though there's no reason to "use" the types directly, some compilers -! might have trouble with an object being used without its type. -public :: cism_infnan_nan_type -public :: cism_infnan_inf_type -public :: assignment(=) -public :: cism_infnan_to_r4 -public :: cism_infnan_to_r8 - -! Type representing Not A Number. -type :: cism_infnan_nan_type - logical :: quiet = .false. -end type cism_infnan_nan_type - -! Type representing +/-Infinity. -type :: cism_infnan_inf_type - logical :: positive = .true. -end type cism_infnan_inf_type - -! Allow assigning reals to NaN or Inf. -interface assignment(=) - ! TYPE double,real - ! DIMS 0,1,2,3,4,5,6,7 - module procedure set_nan_1d_r8 - ! TYPE double,real - ! DIMS 0,1,2,3,4,5,6,7 - module procedure set_inf_1d_r8 -end interface - -! Conversion functions. -interface cism_infnan_to_r8 - module procedure nan_r8 - module procedure inf_r8 -end interface - -interface cism_infnan_to_r4 - module procedure nan_r4 - module procedure inf_r4 -end interface - -! Initialize objects of NaN/Inf type for other modules to use. - -! Default NaN is signaling, but also provide snan and qnan to choose -! explicitly. -type(cism_infnan_nan_type), public, parameter :: cism_infnan_nan = & - cism_infnan_nan_type(.false.) 
-type(cism_infnan_nan_type), public, parameter :: cism_infnan_snan = & - cism_infnan_nan_type(.false.) -type(cism_infnan_nan_type), public, parameter :: cism_infnan_qnan = & - cism_infnan_nan_type(.true.) - -! Default Inf is positive, but provide posinf to go with neginf. -type(cism_infnan_inf_type), public, parameter :: cism_infnan_inf = & - cism_infnan_inf_type(.true.) -type(cism_infnan_inf_type), public, parameter :: cism_infnan_posinf = & - cism_infnan_inf_type(.true.) -type(cism_infnan_inf_type), public, parameter :: cism_infnan_neginf = & - cism_infnan_inf_type(.false.) - -! Bit patterns for implementation without ieee_arithmetic. -! Note that in order to satisfy gfortran's range check, we have to use -! ibset to set the sign bit from a BOZ pattern. -#ifndef HAVE_IEEE_ARITHMETIC -! Single precision. -integer(i4), parameter :: ssnan_pat = int(Z'7FA00000',i4) -integer(i4), parameter :: sqnan_pat = int(Z'7FC00000',i4) -integer(i4), parameter :: sposinf_pat = int(Z'7F800000',i4) -integer(i4), parameter :: sneginf_pat = ibset(sposinf_pat,bit_size(1_i4)-1) -! Double precision. -integer(i8), parameter :: dsnan_pat = int(Z'7FF4000000000000',i8) -integer(i8), parameter :: dqnan_pat = int(Z'7FF8000000000000',i8) -integer(i8), parameter :: dposinf_pat = int(Z'7FF0000000000000',i8) -integer(i8), parameter :: dneginf_pat = ibset(dposinf_pat,bit_size(1_i8)-1) -#endif - -contains - -!--------------------------------------------------------------------- -! TEST FUNCTIONS -!--------------------------------------------------------------------- -! The "isinf" function simply calls "isposinf" and "isneginf". -!--------------------------------------------------------------------- - -! TYPE double,real -elemental function cism_infnan_isinf_r8(x) result(isinf) - real(r8), intent(in) :: x - logical :: isinf - - isinf = cism_infnan_isposinf(x) .or. 
cism_infnan_isneginf(x) - -end function cism_infnan_isinf_r8 - -#ifdef HAVE_IEEE_ARITHMETIC - -!--------------------------------------------------------------------- -! The "isposinf" and "isneginf" functions get the IEEE class of a -! real, and test to see if the class is equal to ieee_positive_inf -! or ieee_negative_inf. -!--------------------------------------------------------------------- - -! TYPE double,real -elemental function cism_infnan_isposinf_r8(x) result(isposinf) - use, intrinsic :: ieee_arithmetic, only: & - ieee_class, & - ieee_positive_inf, & - operator(==) - real(r8), intent(in) :: x - logical :: isposinf - - isposinf = (ieee_positive_inf == ieee_class(x)) - -end function cism_infnan_isposinf_r8 - -! TYPE double,real -elemental function cism_infnan_isneginf_r8(x) result(isneginf) - use, intrinsic :: ieee_arithmetic, only: & - ieee_class, & - ieee_negative_inf, & - operator(==) - real(r8), intent(in) :: x - logical :: isneginf - - isneginf = (ieee_negative_inf == ieee_class(x)) - -end function cism_infnan_isneginf_r8 - -#else -! Don't have ieee_arithmetic. - -!!#ifdef CPRGNU !WHL - Assume this is true -! NaN testing on gfortran. -! TYPE double,real -elemental function cism_infnan_isnan_r8(x) result(is_nan) - real(r8), intent(in) :: x - logical :: is_nan - - is_nan = isnan(x) - -end function cism_infnan_isnan_r8 -! End GNU section. -!!#endif - -!--------------------------------------------------------------------- -! The "isposinf" and "isneginf" functions just test against a known -! bit pattern if we don't have ieee_arithmetic. -!--------------------------------------------------------------------- - -! TYPE double,real -elemental function cism_infnan_isposinf_r8(x) result(isposinf) - real(r8), intent(in) :: x - logical :: isposinf -!!#if ({ITYPE} == TYPEREAL) -!! 
integer(i4), parameter :: posinf_pat = sposinf_pat -!!#else - integer(i8), parameter :: posinf_pat = dposinf_pat -!!#endif - - isposinf = (x == transfer(posinf_pat,x)) - -end function cism_infnan_isposinf_r8 - -! TYPE double,real -elemental function cism_infnan_isneginf_r8(x) result(isneginf) - real(r8), intent(in) :: x - logical :: isneginf -!!#if ({ITYPE} == TYPEREAL) -!! integer(i4), parameter :: neginf_pat = sneginf_pat -!!#else - integer(i8), parameter :: neginf_pat = dneginf_pat -!!#endif - - isneginf = (x == transfer(neginf_pat,x)) - -end function cism_infnan_isneginf_r8 - -! End ieee_arithmetic conditional. -#endif - -!--------------------------------------------------------------------- -! GENERATION FUNCTIONS -!--------------------------------------------------------------------- -! Two approaches for generation of NaN and Inf values: -! 1. With Fortran 2003, use the ieee_value intrinsic to get a value -! from the corresponding class. These are: -! - ieee_signaling_nan -! - ieee_quiet_nan -! - ieee_positive_inf -! - ieee_negative_inf -! 2. Without Fortran 2003, set the IEEE bit patterns directly. -! Use BOZ literals to get an integer with the correct bit -! pattern, then use "transfer" to transfer those bits into a -! real. -!--------------------------------------------------------------------- - -! TYPE double,real -! DIMS 0,1,2,3,4,5,6,7 -pure subroutine set_nan_1d_r8(output, nan) -#ifdef HAVE_IEEE_ARITHMETIC - use, intrinsic :: ieee_arithmetic, only: & - ieee_signaling_nan, & - ieee_quiet_nan, & - ieee_value -#else -!!#if ({ITYPE} == TYPEREAL) -!! integer(i4), parameter :: snan_pat = ssnan_pat -!! integer(i4), parameter :: qnan_pat = sqnan_pat -!!#else - integer(i8), parameter :: snan_pat = dsnan_pat - integer(i8), parameter :: qnan_pat = dqnan_pat -!!#endif -#endif -!! real(r8), intent(out) :: output{DIMSTR} - real(r8), intent(out) :: output - type(cism_infnan_nan_type), intent(in) :: nan - - ! 
Use scalar temporary for performance reasons, to reduce the cost of - ! the ieee_value call. - real(r8) :: tmp - -#ifdef HAVE_IEEE_ARITHMETIC - if (nan%quiet) then - tmp = ieee_value(tmp, ieee_quiet_nan) - else - tmp = ieee_value(tmp, ieee_signaling_nan) - end if -#else - if (nan%quiet) then - tmp = transfer(qnan_pat, tmp) - else - tmp = transfer(snan_pat, tmp) - end if -#endif - - output = tmp - -end subroutine set_nan_1d_r8 - -! TYPE double,real -! DIMS 0,1,2,3,4,5,6,7 -pure subroutine set_inf_1d_r8(output, inf) -#ifdef HAVE_IEEE_ARITHMETIC - use, intrinsic :: ieee_arithmetic, only: & - ieee_positive_inf, & - ieee_negative_inf, & - ieee_value -#else -!!#if ({ITYPE} == TYPEREAL) -!! integer(i4), parameter :: posinf_pat = sposinf_pat -!! integer(i4), parameter :: neginf_pat = sneginf_pat -!!#else - integer(i8), parameter :: posinf_pat = dposinf_pat - integer(i8), parameter :: neginf_pat = dneginf_pat -!!#endif -#endif -!! real(r8), intent(out) :: output{DIMSTR} - real(r8), intent(out) :: output - type(cism_infnan_inf_type), intent(in) :: inf - - ! Use scalar temporary for performance reasons, to reduce the cost of - ! the ieee_value call. - real(r8) :: tmp - -#ifdef HAVE_IEEE_ARITHMETIC - if (inf%positive) then - tmp = ieee_value(tmp,ieee_positive_inf) - else - tmp = ieee_value(tmp,ieee_negative_inf) - end if -#else - if (inf%positive) then - tmp = transfer(posinf_pat, tmp) - else - tmp = transfer(neginf_pat, tmp) - end if -#endif - - output = tmp - -end subroutine set_inf_1d_r8 - -!--------------------------------------------------------------------- -! CONVERSION INTERFACES. -!--------------------------------------------------------------------- -! Function methods to get reals from nan/inf types. -!--------------------------------------------------------------------- - -pure function nan_r8(nan) result(output) - class(cism_infnan_nan_type), intent(in) :: nan - real(r8) :: output - -!! 
output = nan - !WHL kluge - output = 0._r8 - -end function nan_r8 - -pure function nan_r4(nan) result(output) - class(cism_infnan_nan_type), intent(in) :: nan - real(r4) :: output - -!! output = nan - !WHL kluge - output = 0._r8 - -end function nan_r4 - -pure function inf_r8(inf) result(output) - class(cism_infnan_inf_type), intent(in) :: inf - real(r8) :: output - -!! output = inf - !WHL kluge - output = 0._r8 - -end function inf_r8 - -pure function inf_r4(inf) result(output) - class(cism_infnan_inf_type), intent(in) :: inf - real(r4) :: output - -!! output = inf - !WHL kluge - output = 0._r8 - -end function inf_r4 - -end module cism_infnan_mod diff --git a/libglimmer/cism_reprosum_mod.F90 b/libglimmer/cism_reprosum_mod.F90 index ec202273..1b4161a8 100644 --- a/libglimmer/cism_reprosum_mod.F90 +++ b/libglimmer/cism_reprosum_mod.F90 @@ -36,6 +36,10 @@ module cism_reprosum_mod ! I started from a version that includes some logic fixes and code cleanup ! done by Pat Worley in 2023. Pat's revised version differs from the code ! in the CESM repo as of Jan. 2026. +! See here for information on Pat's mods: +! * https://github.com/E3SM-Project/E3SM/pull/5534 +! * https://github.com/E3SM-Project/E3SM/pull/5549 +! * https://github.com/E3SM-Project/E3SM/pull/5560 !------------------------------------------------------------------------ !------------------------------------------------------------------------ @@ -62,11 +66,6 @@ module cism_reprosum_mod use glimmer_global, only: r8 => dp use glimmer_global, only: i8 use glimmer_paramets, only: iulog - use cism_infnan_mod,only: cism_infnan_inf_type, assignment(=), & - cism_infnan_posinf, cism_infnan_neginf, & - cism_infnan_nan, & - cism_infnan_isnan, cism_infnan_isinf, & - cism_infnan_isposinf, cism_infnan_isneginf use profile, only: t_startf, t_stopf !WHL - replace with perf_mod? ! 
end WHL mods @@ -116,8 +115,6 @@ module cism_reprosum_mod !--------------------------------------------------------------------- logical :: repro_sum_use_ddpdd = .false. - logical :: repro_sum_allow_infnan = .false. - !WHL - Should this code be declared? ! Not sure what EAMXX_STANDALONE means #ifdef EAMXX_STANDALONE @@ -143,7 +140,6 @@ end subroutine cism_reprosumx86_fix_end !======================================================================== ! subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & - repro_sum_allow_infnan_in, & repro_sum_rel_diff_max_in, & repro_sum_recompute_in, & repro_sum_master, & @@ -156,10 +152,6 @@ subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & !------------------------------Arguments--------------------------------- ! Use DDPDD algorithm instead of integer vector algorithm logical, intent(in), optional :: repro_sum_use_ddpdd_in - ! Allow INF or NaN in summands - logical, intent(in), optional :: repro_sum_allow_infnan_in - ! maximum permissible difference between reproducible and - ! nonreproducible sums real(r8), intent(in), optional :: repro_sum_rel_diff_max_in ! recompute using different algorithm when difference between ! 
reproducible and nonreproducible sums is too great @@ -202,9 +194,6 @@ subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & if ( present(repro_sum_use_ddpdd_in) ) then repro_sum_use_ddpdd = repro_sum_use_ddpdd_in endif - if ( present(repro_sum_allow_infnan_in) ) then - repro_sum_allow_infnan = repro_sum_allow_infnan_in - endif if ( present(repro_sum_rel_diff_max_in) ) then cism_reprosum_reldiffmax = repro_sum_rel_diff_max_in endif @@ -222,14 +211,6 @@ subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & 'distributed sum algorithm' endif - if ( repro_sum_allow_infnan ) then - write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& - 'Will calculate sum when INF or NaN are included in summands' - else - write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& - 'Will abort if INF or NaN are included in summands' - endif - if (cism_reprosum_reldiffmax >= 0.0_r8) then write(logunit,*) ' ',& 'with a maximum relative error tolerance of ', & @@ -256,7 +237,7 @@ end subroutine cism_reprosum_setopts ! subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & - nflds, allow_infnan, ddpdd_sum, & + nflds, ddpdd_sum, & arr_gbl_max, arr_gbl_max_out, & arr_max_levels, arr_max_levels_out, & gbl_max_nsummands, gbl_max_nsummands_out,& @@ -505,10 +486,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & ! use ddpdd algorithm instead ! of integer vector algorithm - logical, intent(in), optional :: allow_infnan - ! if .true., allow INF or NaN input values. - ! if .false. (the default), then abort. - real(r8), intent(in), optional :: arr_gbl_max(nflds) ! upper bound on max(abs(arr)) @@ -561,8 +538,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & ! ! Local workspace ! - logical :: abort_inf_nan ! flag indicating whether to - ! abort if INF or NaN found in input logical :: use_ddpdd_sum ! flag indicating whether to ! use cism_reprosum_ddpdd or not logical :: recompute ! 
flag indicating need to @@ -571,17 +546,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & logical :: validate ! flag indicating need to ! verify gmax and max_levels ! are accurate/sufficient - logical :: nan_check, inf_check ! flag on whether there are - ! NaNs and INFs in input array - logical :: inf_nan_lchecks(3,nflds)! flags on whether there are - ! NaNs, positive INFs, or negative INFs - ! for each input field locally - logical :: inf_nan_gchecks(3,nflds)! flags on whether there are - ! NaNs, positive INFs, or negative INFs - ! for each input field - logical :: arr_gsum_infnan(nflds) ! flag on whether field sum is a - ! NaN or INF - integer :: gbl_lor_red ! global lor reduction? (0/1) integer :: gbl_max_red ! global max reduction? (0/1) integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1) @@ -589,8 +553,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & integer :: repro_sum_both ! both fast and slow? (0/1) integer :: nonrepro_sum ! nonrepro_sum? (0/1) - integer :: nan_count, inf_count ! local count of NaNs and INFs in - ! input array integer :: omp_nthreads ! number of OpenMP threads integer :: mpi_comm ! MPI subcommunicator integer :: mypid ! MPI task ID (COMM_WORLD) @@ -664,85 +626,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & ! call t_barrierf('sync_repro_sum',mpi_comm) #endif -! Check whether should abort if input contains NaNs or INFs - abort_inf_nan = .not. repro_sum_allow_infnan - if ( present(allow_infnan) ) then - abort_inf_nan = .not. allow_infnan - endif - -! With Fujitsu always abort on NaNs or INFs in input -#ifdef CPRFJ - abort_inf_nan = .true. -#endif - -#ifndef EAMXX_STANDALONE - call t_startf('cism_reprosum_INF_NaN_Chk') -#endif - -! Initialize flags to indicate that no NaNs or INFs are present in the input data - inf_nan_gchecks = .false. - arr_gsum_infnan = .false. 
- - !TODO - Remove the inf_nan option; assume abort_inf_nan = T, as in CICE - if (abort_inf_nan) then - -! Check whether input contains NaNs or INFs, and abort if so - nan_check = any(cism_infnan_isnan(arr)) - inf_check = any(cism_infnan_isinf(arr)) - - if (nan_check .or. inf_check) then - - nan_count = count(cism_infnan_isnan(arr)) - inf_count = count(cism_infnan_isinf(arr)) - - if ((nan_count > 0) .or. (inf_count > 0)) then - call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) -! write(s_logunit,37) real(nan_count,r8), real(inf_count,r8), mypid - write(iulog,37) real(nan_count,r8), real(inf_count,r8), mypid -37 format("cism_REPROSUM_CALC: Input contains ",e12.5, & - " NaNs and ", e12.5, " INFs on MPI task ", i7) - !WHL mod -! call shr_sys_abort("shr_reprosum_calc ERROR: NaNs or INFs in input") - write(iulog,*) 'cism_reprosum_calc ERROR: NaNs or INFs in input' - call mpi_abort(MPI_COMM_WORLD, 1001, ierr) - !end WHL mod - endif - - endif - -#ifndef CPRFJ - else - -! Determine whether any fields contain NaNs or INFs, and avoid processing them -! via integer expansions - inf_nan_lchecks = .false. - - do ifld=1,nflds - inf_nan_lchecks(1,ifld) = any(cism_infnan_isnan(arr(:,ifld))) - inf_nan_lchecks(2,ifld) = any(cism_infnan_isposinf(arr(:,ifld))) - inf_nan_lchecks(3,ifld) = any(cism_infnan_isneginf(arr(:,ifld))) - end do -#ifndef EAMXX_STANDALONE - call t_startf("repro_sum_allr_lor") -#endif - call mpi_allreduce (inf_nan_lchecks, inf_nan_gchecks, 3*nflds, & - MPI_LOGICAL, MPI_LOR, mpi_comm, ierr) - gbl_lor_red = 1 -#ifndef EAMXX_STANDALONE - call t_stopf("repro_sum_allr_lor") -#endif - - do ifld=1,nflds - arr_gsum_infnan(ifld) = any(inf_nan_gchecks(:,ifld)) - enddo -#endif - - endif - -#ifndef EAMXX_STANDALONE - call t_stopf('cism_reprosum_INF_NaN_Chk') -#endif - ! 
Check whether should use cism_reprosum_ddpdd algorithm use_ddpdd_sum = repro_sum_use_ddpdd if ( present(ddpdd_sum) ) then @@ -891,7 +774,7 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & nflds, arr_max_shift, arr_gmax_exp, & arr_max_levels, max_level, extra_levels, & - arr_gsum_infnan, validate, recompute, & + validate, recompute, & omp_nthreads, mpi_comm) ! Record statistics, etc. @@ -943,15 +826,13 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & do ifld=1,nflds arr_exp_tlmin = MAXEXPONENT(1.0_r8) arr_exp_tlmax = MINEXPONENT(1.0_r8) - if (.not. arr_gsum_infnan(ifld)) then - do isum=isum_beg(ithread),isum_end(ithread) - if (arr(isum,ifld) /= 0.0_r8) then - arr_exp = exponent(arr(isum,ifld)) - arr_exp_tlmin = min(arr_exp,arr_exp_tlmin) - arr_exp_tlmax = max(arr_exp,arr_exp_tlmax) - endif - end do - endif + do isum=isum_beg(ithread),isum_end(ithread) + if (arr(isum,ifld) /= 0.0_r8) then + arr_exp = exponent(arr(isum,ifld)) + arr_exp_tlmin = min(arr_exp,arr_exp_tlmin) + arr_exp_tlmax = max(arr_exp,arr_exp_tlmax) + endif + end do arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax end do @@ -981,8 +862,8 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & arr_gmax_exp(:) = -arr_gextremes(1:nflds,1) arr_gmin_exp(:) = arr_gextremes(1:nflds,2) -! If a field is identically zero or contains INFs or NaNs, arr_gmin_exp -! still equals MAXEXPONENT and arr_gmax_exp still equals MINEXPONENT. +! If a field is identically zero, arr_gmin_exp still equals MAXEXPONENT +! and arr_gmax_exp still equals MINEXPONENT. ! 
In this case, set arr_gmin_exp = arr_gmax_exp = MINEXPONENT do ifld=1,nflds arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld)) @@ -1093,7 +974,7 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & nflds, arr_max_shift, arr_gmax_exp, & max_levels, max_level, extra_levels, & - arr_gsum_infnan, validate, recompute, & + validate, recompute, & omp_nthreads, mpi_comm) endif @@ -1122,11 +1003,9 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & !$omp default(shared) & !$omp private(ifld, isum) do ifld=1,nflds - if (.not. arr_gsum_infnan(ifld)) then - do isum=1,nsummands - arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld) - end do - endif + do isum=1,nsummands + arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld) + end do end do #ifndef EAMXX_STANDALONE @@ -1160,25 +1039,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & endif endif -! Set field sums to NaN and INF, as needed - do ifld=1,nflds - if (arr_gsum_infnan(ifld)) then - if (inf_nan_gchecks(1,ifld)) then - ! NaN => NaN - arr_gsum(ifld) = cism_infnan_nan - else if (inf_nan_gchecks(2,ifld) .and. inf_nan_gchecks(3,ifld)) then - ! posINF and negINF => NaN - arr_gsum(ifld) = cism_infnan_nan - else if (inf_nan_gchecks(2,ifld)) then - ! posINF only => posINF - arr_gsum(ifld) = cism_infnan_posinf - else if (inf_nan_gchecks(3,ifld)) then - ! negINF only => negINF - arr_gsum(ifld) = cism_infnan_neginf - endif - endif - end do - ! Return statistics if ( present(repro_sum_stats) ) then repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast @@ -1204,7 +1064,8 @@ end subroutine cism_reprosum_calc subroutine cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, & arr_max_shift, arr_gmax_exp, max_levels, & - max_level, extra_levels, skip_field, & +! 
max_level, extra_levels, skip_field, & + max_level, extra_levels, & validate, recompute, omp_nthreads, mpi_comm ) !------------------------------------------------------------------------ ! @@ -1245,11 +1106,6 @@ subroutine cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, & real(r8), intent(in) :: arr(dsummands,nflds) ! input array - logical, intent(in) :: skip_field(nflds) - ! flag indicating whether the sum for this field should be - ! computed or not (used to skip over fields containing - ! NaN or INF summands) - logical, intent(in) :: validate ! flag indicating that accuracy of solution generated from ! arr_gmax_exp and max_levels should be tested @@ -1399,8 +1255,6 @@ subroutine cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, & not_exact(ifld,ithread) = 0 i8_arr_tlsum_level(:,ifld,ithread) = 0_i8 - if (skip_field(ifld)) cycle - do isum=isum_beg(ithread),isum_end(ithread) arr_remainder = 0.0_r8 diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index 6a941283..0d248e0e 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -9601,8 +9601,6 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) logical :: & ddpdd_sum, & ! use ddpdd algorithm instead of fixed-precision algorithm - allow_infnan, & ! if .true., allow INF or NaN input values; - ! if .false. (the default), then abort if INF or NaNs are present repro_sum_validate ! flag enabling/disabling testing that gmax and max_levels ! are accurate/sufficient. Default is enabled. @@ -9617,7 +9615,6 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) allocate (arr_max_levels_out(nflds)) allocate (rel_diff(2,nflds)) - allow_infnan = .false. ddpdd_sum = .false. repro_sum_validate = .true. @@ -9626,7 +9623,6 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) call cism_reprosum_calc(& arr, arr_gsum, & nsummands, dsummands, nflds, & - allow_infnan = allow_infnan, & ddpdd_sum = ddpdd_sum, & ! 
arr_gbl_max = arr_gbl_max, & arr_gbl_max_out = arr_gbl_max_out, & From a8b597f4e6c3192d11b6b9b39f28b4d543026e43 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Wed, 21 Jan 2026 20:52:32 -0700 Subject: [PATCH 13/21] Fixed a logic bug in a parallel_sum subroutine This commit fixes a bug in function parallel_global_sum_stagger_real8_3d_nflds. The function now works correctly. I found the bug while running Greenland tests with the BP solver, using various combinations of preconditioners and sparse PCG solvers (e.g., both diagonal and SIA preconditioners, with the standard and Chronopoulos-Gear PCG algorithms). All the BP configurations I tested are now reproducible. I also cleaned up some diagnostics, making the output consistent across related subroutines. --- libglimmer/parallel_mpi.F90 | 31 +--- libglissade/glissade_velo_higher_pcg.F90 | 216 ++++++++++++++++++----- 2 files changed, 177 insertions(+), 70 deletions(-) diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index 0d248e0e..d043cde2 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -6186,12 +6186,6 @@ function parallel_global_sum_real8_2d(a, parallel, mask_2d) own_ewn => parallel%own_ewn, & own_nsn => parallel%own_nsn) - !WHL - debug - if (verbose_reprosum .and. main_task) then -! write(iulog,*) 'In parallel_global_sum_real8_2d, reprosum =', parallel%reprosum -! write(iulog,*) 'nhalo, local ewn, local_nsn =', nhalo, local_ewn, local_nsn - endif - if (present(mask_2d)) then mask = mask_2d else @@ -6229,9 +6223,8 @@ function parallel_global_sum_real8_2d(a, parallel, mask_2d) parallel_global_sum_real8_2d = arr_gsum(1) - !WHL - debug if (verbose_reprosum .and. main_task) then -!! write(iulog,*) 'arr_gsum =', arr_gsum +! write(iulog,*) 'arr_gsum =', arr_gsum endif deallocate(arr) @@ -6331,7 +6324,6 @@ function parallel_global_sum_real8_3d(a, parallel, mask_2d) parallel_global_sum_real8_3d = arr_gsum(1) - !WHL - debug if (verbose_reprosum .and. 
main_task) then ! write(iulog,*) 'arr_gsum =', arr_gsum endif @@ -6465,7 +6457,7 @@ function parallel_global_sum_patch_real8_2d(a, npatch, patch_id, parallel) parallel_global_sum_patch_real8_2d(:) = arr_gsum(:) if (verbose_reprosum .and. main_task) then -!! write(iulog,*) 'arr_gsum =', arr_gsum +! write(iulog,*) 'arr_gsum =', arr_gsum endif deallocate(arr) @@ -6567,7 +6559,6 @@ function parallel_global_sum_stagger_real8_2d(arr1, parallel, arr2) parallel_global_sum_stagger_real8_2d = arr_gsum(1) - !WHL - debug if (verbose_reprosum .and. main_task) then ! write(iulog,*) 'arr_gsum =', arr_gsum endif @@ -6683,7 +6674,6 @@ function parallel_global_sum_stagger_real8_3d(arr1, parallel, arr2) parallel_global_sum_stagger_real8_3d = arr_gsum(1) - !WHL - debug if (verbose_reprosum .and. main_task) then ! write(iulog,*) 'arr_gsum =', arr_gsum endif @@ -6751,7 +6741,6 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) staggered_jlo, staggered_jhi ! variables for computing reproductible sums -!! integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum integer :: nsummands ! dimensions of array passed to parallel_reduce_reprosum integer :: count real(dp), dimension(:,:), allocatable :: arr @@ -6766,7 +6755,6 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) ! Allocate and fill arrays to pass to parallel_reduce_reprosum nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) -! nflds = size(a,3) allocate(arr(nsummands,nflds)) allocate(arr_gsum(nflds)) @@ -6805,7 +6793,6 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) parallel_global_sum_stagger_real8_2d_nflds = arr_gsum(:) - !WHL - debug if (verbose_reprosum .and. main_task) then ! write(iulog,*) 'arr_gsum =', arr_gsum endif @@ -6837,7 +6824,6 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) enddo ! nflds - ! 
take the global sum parallel_global_sum_stagger_real8_2d_nflds = parallel_reduce_sum(local_sum(:)) endif ! reprosum @@ -6872,7 +6858,6 @@ function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) staggered_jlo, staggered_jhi ! variables for computing reproductible sums -!! integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum integer :: nsummands ! dimensions of array passed to parallel_reduce_reprosum integer :: count real(dp), dimension(:,:), allocatable :: arr @@ -6888,14 +6873,8 @@ function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) if (parallel%reprosum) then ! compute using cism_reprosum_calc - !WHL - debug - if (verbose_reprosum .and. main_task) then - write(iulog,*) ' In global_sum_stagger_real8_3d_nflds' - endif - ! Allocate and fill arrays to pass to parallel_reduce_reprosum nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) * nz -! nflds = size(a,3) allocate(arr(nsummands,nflds)) allocate(arr_gsum(nflds)) @@ -6938,7 +6917,6 @@ function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) parallel_global_sum_stagger_real8_3d_nflds = arr_gsum(:) - !WHL - debug if (verbose_reprosum .and. main_task) then ! write(iulog,*) 'arr_gsum =', arr_gsum endif @@ -6974,10 +6952,9 @@ function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) enddo ! nflds - endif ! reprosum + parallel_global_sum_stagger_real8_3d_nflds = parallel_reduce_sum(local_sum(:)) - ! take the global sum - parallel_global_sum_stagger_real8_3d_nflds = parallel_reduce_sum(local_sum(:)) + endif ! 
reprosum end function parallel_global_sum_stagger_real8_3d_nflds diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 2c2cc482..9f3a8261 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -215,6 +215,19 @@ subroutine pcg_solver_standard_3d(nx, ny, & real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & Muu, Mvv ! simplified SIA matrices for preconditioning + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + !WHL - debug + integer :: iu_max, ju_max, iv_max, jv_max + real(dp) :: ru_max, rv_max + + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (standard)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond @@ -238,7 +251,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & j = jtest write(iulog,*) 'i, j, r =', i, j, this_rank write(iulog,*) 'Auu diag =', Adiagu(:,i,j) - write(iulog,*) 'Avu diag =', Adiagv(:,i,j) + write(iulog,*) 'Avv diag =', Adiagv(:,i,j) endif !TODO - Create a separate setup for tridiag_local @@ -254,6 +267,18 @@ subroutine pcg_solver_standard_3d(nx, ny, & Auu, Avv, & Muu, Mvv) + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' + do i = staggered_ihi, staggered_ilo, -1 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) + enddo + enddo ! i + endif + else ! no preconditioner if (verbose_pcg .and. main_task) then @@ -461,6 +486,19 @@ subroutine pcg_solver_standard_3d(nx, ny, & rv(:,:,:) = rv(:,:,:) - alpha * qv(:,:,:) call t_stopf("pcg_vecupdate") + if (verbose_pcg .and. 
this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' + do i = itest-3, itest+3 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i + endif + ! Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -468,7 +506,11 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! For convergence check, use r = b - Ax if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then -!! if (mod(iter, linear_solve_ncheck) == 0 .or. iter == linear_solve_ncheck/2) then + + if (verbose_pcg .and. main_task) then + write(iulog,*) ' ' + write(iulog,*) 'Check convergence, iter =', iter + endif ! Halo update for x @@ -514,28 +556,56 @@ subroutine pcg_solver_standard_3d(nx, ny, & err = L2_resid/L2_rhs if (verbose_pcg .and. main_task) then -! write(iulog,*) ' ' -! write(iulog,*) 'iter, L2_resid, error =', iter, L2_resid, err + write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err + endif + + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + ru_max = 0.d0 + rv_max = 0.d0 + iu_max = 0 + ju_max = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (sum(abs(ru(:,i,j))) > ru_max) then + ru_max = sum(abs(ru(:,i,j))) + iu_max = i + ju_max = j + endif + if (sum(abs(rv(:,i,j))) > rv_max) then + rv_max = sum(abs(rv(:,i,j))) + iv_max = i + jv_max = j + endif + enddo + enddo + write(iulog,*) 'r, i, j, ru_max:', this_rank, iu_max, ju_max, ru_max + write(iulog,*) 'r, i, j, rv_max:', this_rank, iv_max, jv_max, rv_max endif + ! If converged, then exit the loop. + ! Note: Without good preconditioning, convergence can be slow, + ! but the solution after maxiters_chrongear might be good enough. 
+ if (err < tolerance) then niters = iter + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver has converged, iter =', niters + write(iulog,*) ' ' + endif exit iter_loop - endif + elseif (iter == maxiters) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance + write(iulog,*) ' ' + endif + endif endif ! linear_solve_ncheck enddo iter_loop -!WHL - Without good preconditioning, convergence can be slow, but the solution after maxiters might be good enough. - - if (niters == maxiters) then - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Glissade PCG solver not yet converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance - endif - endif - end subroutine pcg_solver_standard_3d !**************************************************************************** @@ -659,6 +729,19 @@ subroutine pcg_solver_standard_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector b ! solver converges when L2_resid/L2_rhs < tolerance + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + !WHL - debug + integer :: iu_max, ju_max, iv_max, jv_max + real(dp) :: ru_max, rv_max + + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (standard)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond @@ -875,7 +958,6 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! For convergence check, use r = b - Ax if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then -!! if (mod(iter, linear_solve_ncheck) == 0 .or. iter == linear_solve_ncheck/2) then ! Halo update for x @@ -920,24 +1002,53 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! 
compute normalized error err = L2_resid/L2_rhs + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + ru_max = 0.d0 + rv_max = 0.d0 + iu_max = 0 + ju_max = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (abs(ru(i,j)) > ru_max) then + ru_max = ru(i,j) + iu_max = i + ju_max = j + endif + if (abs(rv(i,j)) > rv_max) then + rv_max = rv(i,j) + iv_max = i + jv_max = j + endif + enddo + enddo + write(iulog,*) 'r, i, j, ru_max:', this_rank, iu_max, ju_max, ru_max + write(iulog,*) 'r, i, j, rv_max:', this_rank, iv_max, jv_max, rv_max + endif + + ! If converged, then exit the loop. + ! Note: Without good preconditioning, convergence can be slow, + ! but the solution after maxiters_chrongear might be good enough. + if (err < tolerance) then niters = iter + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver has converged, iter =', niters + write(iulog,*) ' ' + endif exit iter_loop - endif + elseif (iter == maxiters) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance + write(iulog,*) ' ' + endif + endif endif ! linear_solve_ncheck enddo iter_loop -!WHL - Without good preconditioning, convergence can be slow, but the solution after maxiters might be good enough. - - if (niters == maxiters) then - if (verbose_pcg .and. 
main_task) then - write(iulog,*) 'Glissade PCG solver not yet converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance - endif - endif - end subroutine pcg_solver_standard_2d !**************************************************************************** @@ -1256,7 +1367,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & j = jtest write(iulog,*) 'i, j, r =', i, j, this_rank write(iulog,*) 'Auu diag =', Adiagu(:,i,j) - write(iulog,*) 'Avu diag =', Adiagv(:,i,j) + write(iulog,*) 'Avv diag =', Adiagv(:,i,j) endif elseif (precond == HO_PRECOND_SIA) then @@ -1398,6 +1509,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! First pass of algorithm !--------------------------------------------------------------- + iter = 1 + ! Note: The matrix A must be complete for all rows corresponding to locally ! owned nodes, and x must have the correct values in ! halo nodes bordering the locally owned nodes. @@ -1590,16 +1703,25 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) call t_stopf("pcg_vecupdate") + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' + do i = itest-3, itest+3 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i + endif + !--------------------------------------------------------------- ! Iterate to solution !--------------------------------------------------------------- iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - if (verbose_pcg .and. main_task) then -! write(iulog,*) 'iter =', iter - endif - !---- Compute PC(r) = solution z of Mz = r !---- z is correct in halo @@ -1768,11 +1890,12 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (verbose_pcg .and. 
this_rank == rtest) then j = jtest write(iulog,*) ' ' + write(iulog,*) 'iter =', iter write(iulog,*) 'i, k, xu, xv, ru, rv:' do i = itest-3, itest+3 write(iulog,*) ' ' do k = 1, nz - write(iulog,'(i4, 4f16.10)') i, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) enddo enddo ! i endif @@ -1820,13 +1943,17 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") - L2_resid = sqrt(rr) ! L2 norm of residual - err = L2_resid/L2_rhs ! normalized error + ! take square root + L2_resid = sqrt(rr) + + ! compute normalized error + err = L2_resid/L2_rhs if (verbose_pcg .and. main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err endif + !WHL - debug if (verbose_pcg .and. this_rank == rtest) then ru_max = 0.d0 rv_max = 0.d0 @@ -1834,13 +1961,13 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs(sum(ru(:,i,j))) > ru_max) then - ru_max = sum(ru(:,i,j)) + if (sum(abs(ru(:,i,j))) > ru_max) then + ru_max = sum(abs(ru(:,i,j))) iu_max = i ju_max = j endif - if (abs(sum(rv(:,i,j))) > rv_max) then - rv_max = sum(rv(:,i,j)) + if (sum(abs(rv(:,i,j))) > rv_max) then + rv_max = sum(abs(rv(:,i,j))) iv_max = i jv_max = j endif @@ -1861,10 +1988,10 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & write(iulog,*) ' ' endif exit iter_loop - elseif (niters == maxiters_chrongear) then + elseif (iter == maxiters_chrongear) then if (verbose_pcg .and. 
main_task) then - write(iulog,*) 'Glissade PCG solver not yet converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance write(iulog,*) ' ' endif endif @@ -2916,6 +3043,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err endif + !WHL - debug if (verbose_pcg .and. this_rank == rtest) then ru_max = 0.d0 rv_max = 0.d0 @@ -2947,12 +3075,14 @@ niters = iter if (verbose_pcg .and. main_task) then write(iulog,*) 'Glissade PCG solver has converged, iter =', niters + write(iulog,*) ' ' endif exit iter_loop - elseif (niters == maxiters_chrongear) then + elseif (iter == maxiters_chrongear) then if (verbose_pcg .and. main_task) then - write(iulog,*) 'Glissade PCG solver not yet converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance + write(iulog,*) ' ' endif endif From cf3a07494da1655aa00e9a04e26fa980c39edfd8 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Thu, 22 Jan 2026 17:23:01 -0700 Subject: [PATCH 14/21] Enable tridiagonal preconditioners with all PCG solvers Several years ago, I implemented local and global tridiagonal preconditioners in module glissade_velo_higher_pcg. For some reason, I coded and tested them in the Chronopoulos-Gear PCG subroutines (which_ho_sparse = 3) but not the standard PCG subroutines (which_ho_sparse = 2). This commit adds the two tridiagonal preconditioners to both standard PCG subroutines (2d and 3d). I tested the 2d version for DIVA and the 3d version for BP, verifying that answers agree with other preconditioners, within roundoff. 
--- libglissade/glissade_velo_higher.F90 | 3 +- libglissade/glissade_velo_higher_pcg.F90 | 604 ++++++++++++++++++++--- 2 files changed, 544 insertions(+), 63 deletions(-) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 099f4fd7..d68d3253 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -3274,7 +3274,8 @@ subroutine glissade_velo_higher_solve(model, & call pcg_solver_standard_3d(nx, ny, & nz, parallel, & - indxA_3d, active_vertex, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 9f3a8261..4a303b71 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -66,7 +66,8 @@ module glissade_velo_higher_pcg subroutine pcg_solver_standard_3d(nx, ny, & nz, parallel, & - indxA_3d, active_vertex, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -137,7 +138,10 @@ subroutine pcg_solver_standard_3d(nx, ny, & nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + parallel ! info for parallel communication + + integer, dimension(-1:1,-1:1), intent(in) :: & + indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 @@ -188,10 +192,11 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, k ! grid indices - integer :: iA, jA, kA ! grid offsets ranging from -1 to 1 - integer :: m ! matrix element index - integer :: iter ! iteration counter + integer :: i, j, k ! grid indices + integer :: iA, jA, kA ! grid offsets ranging from -1 to 1 + integer :: m ! 
matrix element index + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + integer :: iter ! iteration counter real(dp) :: & eta0, eta1, eta2, &! scalar inner product results @@ -215,6 +220,36 @@ subroutine pcg_solver_standard_3d(nx, ny, & real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & Muu, Mvv ! simplified SIA matrices for preconditioning + ! arrays for tridiagonal preconditioning + ! Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above + real(dp), dimension(:,:), allocatable :: & + Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning + Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning + + real(dp), dimension(:,:), allocatable :: & + omega_u, omega_v, & ! work arrays for tridiagonal solve + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v + + ! Note: These two matrices are global in the EW and NS dimensions, respectively. + ! Each holds 8 pieces of information for each task on each row or column. + ! Since only 2 of these 8 pieces of information change from one iteration to the next, + ! it is more efficient to gather the remaining information once and pass the arrays + ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d + ! and gather all the information every time the subroutine is called. + ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? + + real(dp), dimension(:,:), allocatable :: & + gather_data_row, & ! arrays for gathering data from every task on a row or column + gather_data_col + + integer :: & + tasks_row, & ! number of tasks per row and column for tridiagonal solve + tasks_col + + logical :: first_time ! true on the first subroutine call (iter = 1), false thereafter + integer :: & staggered_ilo, staggered_ihi, & ! 
bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi @@ -228,17 +263,29 @@ subroutine pcg_solver_standard_3d(nx, ny, & staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi + tasks_row = parallel%tasks_row + tasks_col = parallel%tasks_col + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (standard)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond endif + ! Compute array sizes for locally owned vertices + ilocal = staggered_ihi - staggered_ilo + 1 + jlocal = staggered_jhi - staggered_jlo + 1 + ! Set up matrices for preconditioning - !TODO - Add tridiagonal options call t_startf("pcg_precond_init") - if (precond == HO_PRECOND_DIAG) then + if (precond == HO_PRECOND_NONE) then ! no preconditioner + + if (verbose_pcg .and. this_rank == rtest) then + write(iulog,*) 'Using no preconditioner' + endif + + elseif (precond == HO_PRECOND_DIAG) then call setup_preconditioner_diag_3d(nx, ny, & nz, indxA_3d, & @@ -254,12 +301,6 @@ subroutine pcg_solver_standard_3d(nx, ny, & write(iulog,*) 'Avv diag =', Adiagv(:,i,j) endif - !TODO - Create a separate setup for tridiag_local - ! For this setup: Pass in Auu and Avv - ! Return Adiag/subdiag/supdiag for u and v in halo - ! Return omega and denom in halo - ! Then M*z = r can compute z in halo - elseif (precond == HO_PRECOND_SIA) then call setup_preconditioner_sia_3d(nx, ny, & @@ -279,11 +320,79 @@ subroutine pcg_solver_standard_3d(nx, ny, & enddo ! i endif - else ! no preconditioner + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Using no preconditioner' - endif + ! 
Allocate tridiagonal preconditioning matrices + allocate(Adiag_u (nx-1,ny-1)) + allocate(Asubdiag_u(nx-1,ny-1)) + allocate(Asupdiag_u(nx-1,ny-1)) + allocate(omega_u (nx-1,ny-1)) + allocate(denom_u (nx-1,ny-1)) + + allocate(Adiag_v (nx-1,ny-1)) + allocate(Asubdiag_v(nx-1,ny-1)) + allocate(Asupdiag_v(nx-1,ny-1)) + allocate(omega_v (nx-1,ny-1)) + allocate(denom_v (nx-1,ny-1)) + + ! Compute arrays for tridiagonal preconditioning + + call setup_preconditioner_tridiag_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v) + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + + ! Allocate tridiagonal preconditioning matrices + ! Note: (i,j) indices are switched for the A_v matrices to reduce striding. + allocate(Adiag_u (ilocal,jlocal)) + allocate(Asubdiag_u(ilocal,jlocal)) + allocate(Asupdiag_u(ilocal,jlocal)) + allocate(omega_u(ilocal,jlocal)) + allocate(denom_u(ilocal,jlocal)) + allocate(xuh_u(ilocal,jlocal)) + allocate(xlh_u(ilocal,jlocal)) + + allocate(Adiag_v (jlocal,ilocal)) + allocate(Asubdiag_v(jlocal,ilocal)) + allocate(Asupdiag_v(jlocal,ilocal)) + allocate(omega_v(jlocal,ilocal)) + allocate(denom_v(jlocal,ilocal)) + allocate(xuh_v(jlocal,ilocal)) + allocate(xlh_v(jlocal,ilocal)) + + ! These two matrices are for gathering data from all tasks on a given row or column. 
+ allocate(gather_data_row(8*tasks_row,jlocal)) + allocate(gather_data_col(8*tasks_col,ilocal)) + gather_data_row = 0.0d0 + gather_data_col = 0.0d0 + + call setup_preconditioner_tridiag_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + ilocal, jlocal, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v) endif ! precond @@ -393,6 +502,54 @@ subroutine pcg_solver_standard_3d(nx, ny, & active_vertex, & Mvv, rv, zv) ! solve Mvv*zv = rv for zv + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + ! Use a local tridiagonal solver to find an approximate solution of A*z = r + + call tridiag_solver_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + + ! Use a global tridiagonal solver to find an approximate solution of A*z = r + + if (iter == 1) then + first_time = .true. + else + first_time = .false. + endif + + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + first_time, & + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r + endif ! 
precond call t_stopf("pcg_precond") @@ -606,6 +763,17 @@ subroutine pcg_solver_standard_3d(nx, ny, & enddo iter_loop + ! Clean up + if (allocated(Adiag_u)) deallocate(Adiag_u, Adiag_v) + if (allocated(Asubdiag_u)) deallocate(Asubdiag_u, Asubdiag_v) + if (allocated(Asupdiag_u)) deallocate(Asupdiag_u, Asupdiag_v) + if (allocated(omega_u)) deallocate(omega_u, omega_v) + if (allocated(denom_u)) deallocate(denom_u, denom_v) + if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) + if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(gather_data_row)) deallocate(gather_data_row) + if (allocated(gather_data_col)) deallocate(gather_data_col) + end subroutine pcg_solver_standard_3d !**************************************************************************** @@ -707,7 +875,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j ! grid indices + integer :: i, j, ii, jj ! grid indices integer :: iter ! iteration counter real(dp) :: & @@ -729,10 +897,41 @@ subroutine pcg_solver_standard_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector b ! solver converges when L2_resid/L2_rhs < tolerance + ! tridiagonal matrix elements + real(dp), dimension(:,:), allocatable :: & + Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning + Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning + + real(dp), dimension(:,:), allocatable :: & + omega_u, omega_v, & ! work arrays for tridiagonal solve + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v + + real(dp), dimension(:,:), allocatable :: & + b_u, b_v, x_u, x_v + + real(dp), dimension(:,:), allocatable :: & + gather_data_row, & ! arrays for gathering data from every task on a row or column + gather_data_col + + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + + integer :: & + tasks_row, & ! 
number of tasks per row and column for tridiagonal solve + tasks_col + + logical :: first_time ! true on the first subroutine call (iter = 1), false thereafter + integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + !TODO - Apply the following for tridiag PCs? +! integer, parameter :: & +! maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, +! ! which generally leads to faster convergence than diagonal preconditioning + !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max @@ -742,24 +941,160 @@ subroutine pcg_solver_standard_2d(nx, ny, & staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi + tasks_row = parallel%tasks_row + tasks_col = parallel%tasks_col + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (standard)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond endif + ! Compute array sizes for locally owned vertices + ilocal = staggered_ihi - staggered_ilo + 1 + jlocal = staggered_jhi - staggered_jlo + 1 + ! Set up matrices for preconditioning - !TODO - Add tridiagonal option + call t_startf("pcg_precond_init") - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Using diagonal matrix for preconditioning' - endif ! verbose_pcg + if (precond == HO_PRECOND_NONE) then ! no preconditioner + + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Using no preconditioner' + endif + + elseif (precond == HO_PRECOND_DIAG) then + + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Using diagonal matrix for preconditioning' + endif ! verbose_pcg + + call setup_preconditioner_diag_2d(nx, ny, & + indxA_2d, & + Auu, Avv, & + Adiagu, Adiagv) + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + !WHL - debug + if (verbose_tridiag .and. 
this_rank==rtest) then + i = itest + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'r, i, j =', this_rank, i, j + write(iulog,*) 'Auu =', Auu(i,j,:) + write(iulog,*) 'Avv =', Avv(i,j,:) + endif + + allocate(Adiag_u (nx-1,ny-1)) + allocate(Asubdiag_u(nx-1,ny-1)) + allocate(Asupdiag_u(nx-1,ny-1)) + allocate(omega_u (nx-1,ny-1)) + allocate(denom_u (nx-1,ny-1)) + + allocate(Adiag_v (nx-1,ny-1)) + allocate(Asubdiag_v(nx-1,ny-1)) + allocate(Asupdiag_v(nx-1,ny-1)) + allocate(omega_v (nx-1,ny-1)) + allocate(denom_v (nx-1,ny-1)) + + call setup_preconditioner_tridiag_local_2d(& + nx, ny, & + parallel, indxA_2d, & + itest, jtest, rtest, & + Auu, Avv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v) + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + + ! Allocate tridiagonal matrices + ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. + + allocate(Adiag_u (ilocal,jlocal)) + allocate(Asubdiag_u(ilocal,jlocal)) + allocate(Asupdiag_u(ilocal,jlocal)) + allocate(omega_u(ilocal,jlocal)) + allocate(denom_u(ilocal,jlocal)) + allocate(xuh_u(ilocal,jlocal)) + allocate(xlh_u(ilocal,jlocal)) + allocate(b_u(ilocal,jlocal)) + allocate(x_u(ilocal,jlocal)) + + allocate(Adiag_v (jlocal,ilocal)) + allocate(Asubdiag_v(jlocal,ilocal)) + allocate(Asupdiag_v(jlocal,ilocal)) + allocate(omega_v(jlocal,ilocal)) + allocate(denom_v(jlocal,ilocal)) + allocate(xuh_v(jlocal,ilocal)) + allocate(xlh_v(jlocal,ilocal)) + allocate(b_v(jlocal,ilocal)) + allocate(x_v(jlocal,ilocal)) + + ! These two matrices are for gathering data from all tasks on a given row or column. + allocate(gather_data_row(8*tasks_row,jlocal)) + allocate(gather_data_col(8*tasks_col,ilocal)) + gather_data_row = 0.0d0 + gather_data_col = 0.0d0 + + ! Compute the entries of the tridiagonal matrices + + ! 
Extract tridiagonal matrix entries from Auu + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements + Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements + Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the u solve in each matrix row + call setup_preconditioner_tridiag_global_2d(& + ilocal, jlocal, & +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u) + + ! Extract tridiagonal matrix entries from Avv + + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements + Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements + Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the v solve in each matrix column + ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + + call setup_preconditioner_tridiag_global_2d(& + jlocal, ilocal, & +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v) + + endif ! precond + + if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' - call t_startf("pcg_precond_init") - call setup_preconditioner_diag_2d(nx, ny, & - indxA_2d, & - Auu, Avv, & - Adiagu, Adiagv) call t_stopf("pcg_precond_init") ! 
Compute initial residual and initialize the direction vector d @@ -858,6 +1193,143 @@ subroutine pcg_solver_standard_2d(nx, ny, & enddo ! i enddo ! j + elseif(precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local + + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest + write(iulog,*) 'Residual:' + write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) + write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif + + if (verbose_pcg .and. main_task) then + write(iulog,*) 'call tridiag_solver_local_2d' + endif + + ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) + !TODO - Test a local solver that can compute zu and zv in the halo + ! (to avoid the halo update below) + + call tridiag_solver_local_2d(& + nx, ny, & + parallel, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + ru, rv, & ! right hand side + zu, zv) ! solution + + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + + ! Use a global tridiagonal solver to find an approximate solution of A*z = r + if (iter == 1) then + first_time = .true. + else + first_time = .false. 
+ endif + + ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo + + ! Solve M*z = r, where M is a global tridiagonal matrix + + call tridiag_solver_global_2d(& + ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + first_time, & + gather_data_row) + + ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo + + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo + + call tridiag_solver_global_2d(& + jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column + !! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + first_time, & + gather_data_col) + + ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? 
+ call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + endif ! precond call t_stopf("pcg_precond") @@ -1049,6 +1521,19 @@ subroutine pcg_solver_standard_2d(nx, ny, & enddo iter_loop + ! Clean up + if (allocated(Adiag_u)) deallocate(Adiag_u, Adiag_v) + if (allocated(Asubdiag_u)) deallocate(Asubdiag_u, Asubdiag_v) + if (allocated(Asupdiag_u)) deallocate(Asupdiag_u, Asupdiag_v) + if (allocated(omega_u)) deallocate(omega_u, omega_v) + if (allocated(denom_u)) deallocate(denom_u, denom_v) + if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) + if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(b_u)) deallocate(b_u, b_v) + if (allocated(x_u)) deallocate(x_u, x_v) + if (allocated(gather_data_row)) deallocate(gather_data_row) + if (allocated(gather_data_col)) deallocate(gather_data_col) + end subroutine pcg_solver_standard_2d !**************************************************************************** @@ -1178,7 +1663,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + parallel ! info for parallel communication integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 @@ -1285,9 +1770,6 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & xuh_u, xuh_v, & xlh_u, xlh_v - real(dp), dimension(:,:), allocatable :: & - b_u, b_v, x_u, x_v - ! Note: These two matrices are global in the EW and NS dimensions, respectively. ! Each holds 8 pieces of information for each task on each row or column. ! 
Since only 2 of these 8 pieces of information change from one iteration to the next, @@ -2015,6 +2497,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(gather_data_row)) deallocate(gather_data_row) + if (allocated(gather_data_col)) deallocate(gather_data_col) end subroutine pcg_solver_chrongear_3d @@ -2115,7 +2599,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j ! grid indices + integer :: i, j, ii, jj ! grid indices integer :: m ! matrix element index integer :: iter ! iteration counter @@ -2130,24 +2614,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & real(dp), dimension(2) :: & gsum ! result of global sum for dot products - ! diagonal matrix elements - real(dp), dimension(nx-1,ny-1) :: & - Adiagu, Adiagv ! diagonal terms of matrices Auu and Avv - - ! tridiagonal matrix elements - real(dp), dimension(:,:), allocatable :: & - Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning - Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning - - real(dp), dimension(:,:), allocatable :: & - omega_u, omega_v, & ! work arrays for tridiagonal solve - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v - - real(dp), dimension(:,:), allocatable :: & - b_u, b_v, x_u, x_v - ! vectors (each of these is split into u and v components) real(dp), dimension(nx-1,ny-1) :: & ru, rv, &! residual vector (b-Ax) @@ -2167,13 +2633,30 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector = sqrt(b,b) ! solver is converged when L2_resid/L2_rhs < tolerance + ! diagonal matrix elements + real(dp), dimension(nx-1,ny-1) :: & + Adiagu, Adiagv ! diagonal terms of matrices Auu and Avv + + ! 
tridiagonal matrix elements + real(dp), dimension(:,:), allocatable :: & + Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning + Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning + + real(dp), dimension(:,:), allocatable :: & + omega_u, omega_v, & ! work arrays for tridiagonal solve + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v + + real(dp), dimension(:,:), allocatable :: & + b_u, b_v, x_u, x_v + real(dp), dimension(:,:), allocatable :: & gather_data_row, & ! arrays for gathering data from every task on a row or column gather_data_col integer :: ilocal, jlocal ! number of locally owned vertices in each direction - integer :: ii, jj integer :: maxiters_chrongear ! max number of linear iterations before quitting integer :: & @@ -2285,9 +2768,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & denom_u, denom_v) elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - !TODO - Figure out why these calculations depend on the number of cores. Halo bug? - ! Allocate tridiagonal matrices + ! Allocate tridiagonal preconditioning matrices ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. allocate(Adiag_u (ilocal,jlocal)) @@ -2310,6 +2792,12 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(b_v(jlocal,ilocal)) allocate(x_v(jlocal,ilocal)) + ! These two matrices are for gathering data from all tasks on a given row or column. + allocate(gather_data_row(8*tasks_row,jlocal)) + allocate(gather_data_col(8*tasks_col,ilocal)) + gather_data_row = 0.0d0 + gather_data_col = 0.0d0 + ! Compute the entries of the tridiagonal matrices ! Extract tridiagonal matrix entries from Auu @@ -2548,10 +3036,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & enddo enddo - ! Initialize the array for gathering information on each row of tasks - allocate(gather_data_row(8*tasks_row,jlocal)) - gather_data_row = 0.0d0 - ! 
Solve M*z = r, where M is a global tridiagonal matrix call tridiag_solver_global_2d(ilocal, jlocal, & @@ -2589,10 +3073,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & enddo enddo - ! Initialize the array for gathering information on each column of tasks - allocate(gather_data_col(8*tasks_col,ilocal)) - gather_data_col = 0.0d0 - call tridiag_solver_global_2d(jlocal, ilocal, & parallel, tasks_col, & 'col', & ! tridiagonal solve for each column From d2c46ae9268c4332c344e198b13f63e730d1b60d Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Thu, 22 Jan 2026 20:00:54 -0700 Subject: [PATCH 15/21] Reordered the PCG subroutines This commit simply moves the 2D subroutines before the 3D subroutines in glissade_velo_higher_pcg.F90. It seems more logical to put the 2D subroutines first. Might do something similar in glissade_velo_higher.F90. --- libglissade/glissade_velo_higher_pcg.F90 | 3061 +++++++++++----------- 1 file changed, 1531 insertions(+), 1530 deletions(-) diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 4a303b71..57d1cebe 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -53,8 +53,8 @@ module glissade_velo_higher_pcg implicit none private - public :: pcg_solver_standard_3d, pcg_solver_standard_2d, & - pcg_solver_chrongear_3d, pcg_solver_chrongear_2d, & + public :: pcg_solver_standard_2d, pcg_solver_standard_3d, & + pcg_solver_chrongear_2d, pcg_solver_chrongear_3d, & matvec_multiply_structured_3d logical, parameter :: verbose_pcg = .false. 
@@ -64,10 +64,9 @@ module glissade_velo_higher_pcg !**************************************************************************** - subroutine pcg_solver_standard_3d(nx, ny, & - nz, parallel, & - indxA_2d, indxA_3d, & - active_vertex, & + subroutine pcg_solver_standard_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -82,50 +81,25 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! to solve the equation $Ax=b$. ! Convergence is checked every {\em linear_solve_ncheck} steps. ! - ! It is based on the barotropic solver in the POP ocean model - ! (author Phil Jones, LANL). Input and output arrays are located - ! on a structured (i,j,k) grid as defined in the glissade_velo_higher - ! module. The global matrix is sparse, but its nonzero elements - ! are stored in four dense matrices called Auu, Avv, Auv, and Avu. - ! Each matrix has 3x3x3 = 27 potential nonzero elements per - ! node (i,j,k). + ! It is similar to subroutine pcg_solver_standard_3d, but modified + ! to solve for x and y at a single horizontal level, as in the + ! shallow-shelf approximation. See the comments in that subroutine + ! (above) for more details on data structure and solver methods. + ! + ! Input and output arrays are located on a structured (i,j) grid + ! as defined in the glissade_velo_higher module. The global matrix + ! is sparse, but its nonzero element are stored in four dense matrices + ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential + ! nonzero elements per node (i,j). ! ! The current preconditioning options are ! (0) no preconditioning ! (1) diagonal preconditioning - ! (2) preconditioning using a physics-based SIA solver - ! - ! For the dome test case with higher-order dynamics, option (2) is best. - ! - ! Here is a schematic of the method implemented below for solving Ax = b: - ! - ! halo_update(x0) - ! r0 = b - A*x0 - ! d0 = 0 - ! eta0 = 1 - ! - ! while (not converged) - ! solve Mz = r for z - ! eta1 = (r,z) - ! 
beta = eta1/eta0 - ! d = z + beta*d - ! halo_update(d) - ! eta0 = eta1 - ! q = Ad - ! eta2 = (d,q) - ! alpha = eta1/eta2 - ! x = x + alpha*d - ! r = r - alpha*q (or occasionally, r = b - Ax) - ! Check for convergence: err = sqrt(r,r)/sqrt(b,b) < tolerance - ! end while ! - ! where x = solution (initial value = x0) - ! d = conjugate direction vector (initial value = d0) - ! r = residual vector (initial value = r0) - ! M = preconditioning matrix - ! (r,z) = dot product of vectors r and z - ! and similarly for (d,q) - ! + ! The SIA-based preconditioning optional is not available for a 2D solve. + ! + ! TODO: Add a tridiagonal preconditioning option to this subroutine, + ! as for subroutine pcg_solver_chrongear_2d. !--------------------------------------------------------------- !--------------------------------------------------------------- @@ -133,70 +107,62 @@ subroutine pcg_solver_standard_3d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + parallel ! info for parallel communication integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 - integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & - indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 - logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! T for columns (i,j) where velocity is computed, else F + active_vertex ! T for vertices (i,j) where velocity is computed, else F - real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1,9), intent(in) :: & Auu, Auv, Avu, Avv ! 
four components of assembled matrix - ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) - ! other dimensions = (z,x,y) indices + ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) + ! 1st and 2nd dimensions = (x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! = 1 for diagonal preconditioning (best option for SSA-dominated flow) - ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, k ! grid indices - integer :: iA, jA, kA ! grid offsets ranging from -1 to 1 - integer :: m ! 
matrix element index - integer :: ilocal, jlocal ! number of locally owned vertices in each direction - integer :: iter ! iteration counter + integer :: i, j, ii, jj ! grid indices + integer :: iter ! iteration counter real(dp) :: & eta0, eta1, eta2, &! scalar inner product results @@ -204,7 +170,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & beta ! eta1/eta0 = term in expression for new direction vector ! vectors (each of these is split into u and v components) - real(dp), dimension(nz,nx-1,ny-1) :: & + real(dp), dimension(nx-1,ny-1) :: & Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector @@ -217,33 +183,26 @@ subroutine pcg_solver_standard_3d(nx, ny, & L2_rhs ! L2 norm of rhs vector b ! solver converges when L2_resid/L2_rhs < tolerance - real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & - Muu, Mvv ! simplified SIA matrices for preconditioning - - ! arrays for tridiagonal preconditioning - ! Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above + ! tridiagonal matrix elements real(dp), dimension(:,:), allocatable :: & Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning - + real(dp), dimension(:,:), allocatable :: & omega_u, omega_v, & ! work arrays for tridiagonal solve - denom_u, denom_v, & + denom_u, denom_v, & xuh_u, xuh_v, & xlh_u, xlh_v - ! Note: These two matrices are global in the EW and NS dimensions, respectively. - ! Each holds 8 pieces of information for each task on each row or column. - ! Since only 2 of these 8 pieces of information change from one iteration to the next, - ! it is more efficient to gather the remaining information once and pass the arrays - ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d - ! 
and gather all the information every time the subroutine is called. - ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? + real(dp), dimension(:,:), allocatable :: & + b_u, b_v, x_u, x_v real(dp), dimension(:,:), allocatable :: & gather_data_row, & ! arrays for gathering data from every task on a row or column gather_data_col + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + integer :: & tasks_row, & ! number of tasks per row and column for tridiagonal solve tasks_col @@ -254,6 +213,11 @@ subroutine pcg_solver_standard_3d(nx, ny, & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + !TODO - Apply the following for tridiag PCs? +! integer, parameter :: & +! maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, +! ! which generally leads to faster convergence than diagonal preconditioning + !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max @@ -281,48 +245,33 @@ subroutine pcg_solver_standard_3d(nx, ny, & if (precond == HO_PRECOND_NONE) then ! no preconditioner - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using no preconditioner' endif elseif (precond == HO_PRECOND_DIAG) then - call setup_preconditioner_diag_3d(nx, ny, & - nz, indxA_3d, & - Auu, Avv, & - Adiagu, Adiagv) - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - i = itest - j = jtest - write(iulog,*) 'i, j, r =', i, j, this_rank - write(iulog,*) 'Auu diag =', Adiagu(:,i,j) - write(iulog,*) 'Avv diag =', Adiagv(:,i,j) - endif + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Using diagonal matrix for preconditioning' + endif ! 
verbose_pcg - elseif (precond == HO_PRECOND_SIA) then + call setup_preconditioner_diag_2d(nx, ny, & + indxA_2d, & + Auu, Avv, & + Adiagu, Adiagv) - call setup_preconditioner_sia_3d(nx, ny, & - nz, indxA_3d, & - Auu, Avv, & - Muu, Mvv) + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - if (verbose_pcg .and. this_rank == rtest) then + !WHL - debug + if (verbose_tridiag .and. this_rank==rtest) then + i = itest j = jtest write(iulog,*) ' ' - write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' - do i = staggered_ihi, staggered_ilo, -1 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) - enddo - enddo ! i + write(iulog,*) 'r, i, j =', this_rank, i, j + write(iulog,*) 'Auu =', Auu(i,j,:) + write(iulog,*) 'Avv =', Avv(i,j,:) endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - - ! Allocate tridiagonal preconditioning matrices allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) allocate(Asupdiag_u(nx-1,ny-1)) @@ -335,26 +284,22 @@ subroutine pcg_solver_standard_3d(nx, ny, & allocate(omega_v (nx-1,ny-1)) allocate(denom_v (nx-1,ny-1)) - ! Compute arrays for tridiagonal preconditioning - - call setup_preconditioner_tridiag_local_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - indxA_2d, indxA_3d, & - itest, jtest, rtest, & - Auu, Avv, & - Muu, Mvv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & + call setup_preconditioner_tridiag_local_2d(& + nx, ny, & + parallel, indxA_2d, & + itest, jtest, rtest, & + Auu, Avv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & denom_u, denom_v) elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - ! Allocate tridiagonal preconditioning matrices - ! Note: (i,j) indices are switched for the A_v matrices to reduce striding. + ! Allocate tridiagonal matrices + ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. 
+ allocate(Adiag_u (ilocal,jlocal)) allocate(Asubdiag_u(ilocal,jlocal)) allocate(Asupdiag_u(ilocal,jlocal)) @@ -362,6 +307,8 @@ subroutine pcg_solver_standard_3d(nx, ny, & allocate(denom_u(ilocal,jlocal)) allocate(xuh_u(ilocal,jlocal)) allocate(xlh_u(ilocal,jlocal)) + allocate(b_u(ilocal,jlocal)) + allocate(x_u(ilocal,jlocal)) allocate(Adiag_v (jlocal,ilocal)) allocate(Asubdiag_v(jlocal,ilocal)) @@ -370,6 +317,8 @@ subroutine pcg_solver_standard_3d(nx, ny, & allocate(denom_v(jlocal,ilocal)) allocate(xuh_v(jlocal,ilocal)) allocate(xlh_v(jlocal,ilocal)) + allocate(b_v(jlocal,ilocal)) + allocate(x_v(jlocal,ilocal)) ! These two matrices are for gathering data from all tasks on a given row or column. allocate(gather_data_row(8*tasks_row,jlocal)) @@ -377,36 +326,72 @@ subroutine pcg_solver_standard_3d(nx, ny, & gather_data_row = 0.0d0 gather_data_col = 0.0d0 - call setup_preconditioner_tridiag_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - indxA_2d, indxA_3d, & - ilocal, jlocal, & - itest, jtest, rtest, & - Auu, Avv, & - Muu, Mvv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v) - - endif ! precond + ! Compute the entries of the tridiagonal matrices - call t_stopf("pcg_precond_init") + ! Extract tridiagonal matrix entries from Auu + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements + Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements + Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements + enddo + enddo - ! Compute initial residual and initialize the direction vector d - ! Note: The matrix A must be complete for all rows corresponding to locally - ! owned vertices, and x must have the correct values in - ! halo vertices bordering the locally owned vertices. - ! 
Then y = Ax will be correct for locally owned vertices. + ! compute work arrays for the u solve in each matrix row + call setup_preconditioner_tridiag_global_2d(& + ilocal, jlocal, & +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u) - ! Halo update for x (initial guess for velocity solution) + ! Extract tridiagonal matrix entries from Avv - call t_startf("pcg_halo_init") + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements + Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements + Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the v solve in each matrix column + ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + + call setup_preconditioner_tridiag_global_2d(& + jlocal, ilocal, & +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v) + + endif ! precond + + if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' + + call t_stopf("pcg_precond_init") + + ! Compute initial residual and initialize the direction vector d + ! Note: The matrix A must be complete for all rows corresponding to locally + ! owned vertices, and x must have the correct values in + ! halo vertices bordering the locally owned vertices. + ! Then y = Ax will be correct for locally owned vertices. + + ! 
Halo update for x (initial guess for velocity solution) + + call t_startf("pcg_halo_init") call staggered_parallel_halo(xu, parallel) call staggered_parallel_halo(xv, parallel) call t_stopf("pcg_halo_init") @@ -414,9 +399,9 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_init") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -427,8 +412,8 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! This will be correct for locally owned vertices. call t_startf("pcg_vecupdate_init") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate_init") ! Initialize scalars and vectors @@ -436,18 +421,18 @@ subroutine pcg_solver_standard_3d(nx, ny, & niters = maxiters eta0 = 1.d0 - du(:,:,:) = 0.d0 - dv(:,:,:) = 0.d0 + du(:,:) = 0.d0 + dv(:,:) = 0.d0 - zu(:,:,:) = 0.d0 - zv(:,:,:) = 0.d0 + zu(:,:) = 0.d0 + zv(:,:) = 0.d0 ! Compute the L2 norm of the RHS vectors ! (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - work0u(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) - work0v(:,:,:) = bv(:,:,:)*bv(:,:,:) + work0u(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) + work0v(:,:) = bv(:,:)*bv(:,:) call t_stopf("pcg_dotprod") ! find global sum of the squared L2 norm @@ -464,92 +449,173 @@ subroutine pcg_solver_standard_3d(nx, ny, & iter_loop: do iter = 1, maxiters + if (verbose_pcg .and. main_task) then +! write(iulog,*) 'iter =', iter + endif + call t_startf("pcg_precond") ! Compute PC(r) = solution z of Mz = r if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:,:) = ru(:,:,:) ! PC(r) = r - zv(:,:,:) = rv(:,:,:) ! PC(r) = r + zu(:,:) = ru(:,:) ! PC(r) = r + zv(:,:) = rv(:,:) ! 
PC(r) = r - elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning do j = 1, ny-1 do i = 1, nx-1 - do k = 1, nz - if (Adiagu(k,i,j) /= 0.d0) then - zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A + if (Adiagu(i,j) /= 0.d0) then + zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(k,i,j) = 0.d0 + zu(i,j) = 0.d0 endif - if (Adiagv(k,i,j) /= 0.d0) then - zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + if (Adiagv(i,j) /= 0.d0) then + zv(i,j) = rv(i,j) / Adiagv(i,j) else - zv(k,i,j) = 0.d0 + zv(i,j) = 0.d0 endif - enddo ! k enddo ! i enddo ! j - elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning - - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Muu, ru, zu) ! solve Muu*zu = ru for zu + elseif(precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Mvv, rv, zv) ! solve Mvv*zv = rv for zv + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest + write(iulog,*) 'Residual:' + write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) + write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'call tridiag_solver_local_2d' + endif - ! Use a local tridiagonal solver to find an approximate solution of A*z = r + ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) + !TODO - Test a local solver that can compute zu and zv in the halo + ! 
(to avoid the halo update below) - call tridiag_solver_local_3d(& + call tridiag_solver_local_2d(& nx, ny, & - nz, parallel, & - active_vertex, & + parallel, & itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Adiag_u, Adiag_v, & ! entries of preconditioning matrix Asubdiag_u, Asubdiag_v, & Asupdiag_u, Asupdiag_v, & omega_u, omega_v, & denom_u, denom_v, & - Muu, Mvv, & ! entries of SIA matrix - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + ru, rv, & ! right hand side + zu, zv) ! solution + + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) - ! Use a global tridiagonal solver to find an approximate solution of A*z = r + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + ! Use a global tridiagonal solver to find an approximate solution of A*z = r if (iter == 1) then first_time = .true. else first_time = .false. endif - call tridiag_solver_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - ilocal, jlocal, & - tasks_row, tasks_col, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v, & - Muu, Mvv, & ! entries of SIA matrix - gather_data_row, gather_data_col, & - first_time, & - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + ! 
convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo + + ! Solve M*z = r, where M is a global tridiagonal matrix + + call tridiag_solver_global_2d(& + ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + first_time, & + gather_data_row) + + ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo + + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo + + call tridiag_solver_global_2d(& + jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column + !! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + first_time, & + gather_data_col) + + ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? 
+ call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + endif ! precond call t_stopf("pcg_precond") @@ -557,8 +623,8 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute the dot product eta1 = (r, PC(r)) call t_startf("pcg_dotprod") - work0u(:,:,:) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r, PC(r)) - work0v(:,:,:) = rv(:,:,:)*zv(:,:,:) + work0u(:,:) = ru(:,:)*zu(:,:) ! terms of dot product (r, PC(r)) + work0v(:,:) = rv(:,:)*zv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") @@ -569,7 +635,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! then eta1 will be NaN. if (eta1 /= eta1) then ! eta1 is NaN - call write_log('PCG solver has failed, alpha = NaN', GM_FATAL) + call write_log('PCG solver has failed, eta1 = NaN', GM_FATAL) endif ! Update the conjugate direction vector d @@ -577,16 +643,16 @@ subroutine pcg_solver_standard_3d(nx, ny, & beta = eta1/eta0 call t_startf("pcg_vecupdate") - du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - ! Initially eta0 = 1 - ! For n >=2, eta0 = old eta1 - call t_stopf("pcg_vecupdate") - - ! Halo update for d + du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:) = zv(:,:) + beta*dv(:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! where beta_(i+1) = -------------------- + ! (r_i, PC(r_i)) + ! Initially eta0 = 1 + ! For n >=2, eta0 = old eta1 + call t_stopf("pcg_vecupdate") + + ! Halo update for d call t_startf("pcg_halo_iter") call staggered_parallel_halo(du, parallel) @@ -597,9 +663,9 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! 
This is the one matvec multiply required for each iteration call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -613,8 +679,8 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute the dot product eta2 = (d, A*d) call t_startf("pcg_dotprod") - work0u(:,:,:) = du(:,:,:) * qu(:,:,:) ! terms of dot product (d, Ad) - work0v(:,:,:) = dv(:,:,:) * qv(:,:,:) + work0u(:,:) = du(:,:) * qu(:,:) ! terms of dot product (d, Ad) + work0v(:,:) = dv(:,:) * qv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") @@ -636,26 +702,13 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute the new solution and residual call t_startf("pcg_vecupdate") - xu(:,:,:) = xu(:,:,:) + alpha * du(:,:,:) ! new solution, x_(i+1) = x_i + alpha*d - xv(:,:,:) = xv(:,:,:) + alpha * dv(:,:,:) + xu(:,:) = xu(:,:) + alpha * du(:,:) ! new solution, x_(i+1) = x_i + alpha*d + xv(:,:) = xv(:,:) + alpha * dv(:,:) - ru(:,:,:) = ru(:,:,:) - alpha * qu(:,:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) - rv(:,:,:) = rv(:,:,:) - alpha * qv(:,:,:) + ru(:,:) = ru(:,:) - alpha * qu(:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) + rv(:,:) = rv(:,:) - alpha * qv(:,:) call t_stopf("pcg_vecupdate") - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'iter =', iter - write(iulog,*) 'i, k, xu, xv, ru, rv:' - do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) - enddo - enddo ! i - endif - ! Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. 
@@ -664,11 +717,6 @@ subroutine pcg_solver_standard_3d(nx, ny, & if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then - if (verbose_pcg .and. main_task) then - write(iulog,*) ' ' - write(iulog,*) 'Check convergence, iter =', iter - endif - ! Halo update for x call t_startf("pcg_halo_resid") @@ -679,9 +727,9 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -691,15 +739,15 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute residual r = b - Ax call t_startf("pcg_vecupdate") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") ! Compute squared L2 norm of (r, r) call t_startf("pcg_dotprod") - work0u(:,:,:) = ru(:,:,:)*ru(:,:,:) ! terms of dot product (r, r) - work0v(:,:,:) = rv(:,:,:)*rv(:,:,:) + work0u(:,:) = ru(:,:)*ru(:,:) ! terms of dot product (r, r) + work0v(:,:) = rv(:,:)*rv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") @@ -712,10 +760,6 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! compute normalized error err = L2_resid/L2_rhs - if (verbose_pcg .and. main_task) then - write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err - endif - !WHL - debug if (verbose_pcg .and. 
this_rank == rtest) then ru_max = 0.d0 @@ -724,13 +768,13 @@ subroutine pcg_solver_standard_3d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (sum(abs(ru(:,i,j))) > ru_max) then - ru_max = sum(abs(ru(:,i,j))) + if (abs(ru(i,j)) > ru_max) then + ru_max = ru(i,j) iu_max = i ju_max = j endif - if (sum(abs(rv(:,i,j))) > rv_max) then - rv_max = sum(abs(rv(:,i,j))) + if (abs(rv(i,j)) > rv_max) then + rv_max = rv(i,j) iv_max = i jv_max = j endif @@ -771,16 +815,19 @@ subroutine pcg_solver_standard_3d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(b_u)) deallocate(b_u, b_v) + if (allocated(x_u)) deallocate(x_u, x_v) if (allocated(gather_data_row)) deallocate(gather_data_row) if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_standard_3d + end subroutine pcg_solver_standard_2d !**************************************************************************** - subroutine pcg_solver_standard_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + subroutine pcg_solver_standard_3d(nx, ny, & + nz, parallel, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -795,25 +842,50 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! to solve the equation $Ax=b$. ! Convergence is checked every {\em linear_solve_ncheck} steps. ! - ! It is similar to subroutine pcg_solver_standard_3d, but modified - ! to solve for x and y at a single horizontal level, as in the - ! shallow-shelf approximation. See the comments in that subroutine - ! (above) for more details on data structure and solver methods. - ! - ! Input and output arrays are located on a structured (i,j) grid - ! as defined in the glissade_velo_higher module. The global matrix - ! is sparse, but its nonzero element are stored in four dense matrices - ! called Auu, Avv, Auv, and Avu. 
Each matrix has 3x3 = 9 potential - ! nonzero elements per node (i,j). + ! It is based on the barotropic solver in the POP ocean model + ! (author Phil Jones, LANL). Input and output arrays are located + ! on a structured (i,j,k) grid as defined in the glissade_velo_higher + ! module. The global matrix is sparse, but its nonzero elements + ! are stored in four dense matrices called Auu, Avv, Auv, and Avu. + ! Each matrix has 3x3x3 = 27 potential nonzero elements per + ! node (i,j,k). ! ! The current preconditioning options are ! (0) no preconditioning ! (1) diagonal preconditioning - ! - ! The SIA-based preconditioning optional is not available for a 2D solve. + ! (2) preconditioning using a physics-based SIA solver ! - ! TODO: Add a tridiagonal preconditioning option to this subroutine, - ! as for subroutine pcg_solver_chrongear_2d. + ! For the dome test case with higher-order dynamics, option (2) is best. + ! + ! Here is a schematic of the method implemented below for solving Ax = b: + ! + ! halo_update(x0) + ! r0 = b - A*x0 + ! d0 = 0 + ! eta0 = 1 + ! + ! while (not converged) + ! solve Mz = r for z + ! eta1 = (r,z) + ! beta = eta1/eta0 + ! d = z + beta*d + ! halo_update(d) + ! eta0 = eta1 + ! q = Ad + ! eta2 = (d,q) + ! alpha = eta1/eta2 + ! x = x + alpha*d + ! r = r - alpha*q (or occasionally, r = b - Ax) + ! Check for convergence: err = sqrt(r,r)/sqrt(b,b) < tolerance + ! end while + ! + ! where x = solution (initial value = x0) + ! d = conjugate direction vector (initial value = d0) + ! r = residual vector (initial value = r0) + ! M = preconditioning matrix + ! (r,z) = dot product of vectors r and z + ! and similarly for (d,q) + ! !--------------------------------------------------------------- !--------------------------------------------------------------- @@ -821,62 +893,70 @@ subroutine pcg_solver_standard_2d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny ! 
horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) + nx, ny, & ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) + nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + parallel ! info for parallel communication integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 + integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & + indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 + logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! T for vertices (i,j) where velocity is computed, else F + active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nx-1,ny-1,9), intent(in) :: & + real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) - ! 1st and 2nd dimensions = (x,y) indices + ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) + ! other dimensions = (z,x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! = 1 for diagonal preconditioning (best option for SSA-dominated flow) + ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! 
number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, ii, jj ! grid indices - integer :: iter ! iteration counter + integer :: i, j, k ! grid indices + integer :: iA, jA, kA ! grid offsets ranging from -1 to 1 + integer :: m ! matrix element index + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + integer :: iter ! iteration counter real(dp) :: & eta0, eta1, eta2, &! scalar inner product results @@ -884,7 +964,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & beta ! eta1/eta0 = term in expression for new direction vector ! vectors (each of these is split into u and v components) - real(dp), dimension(nx-1,ny-1) :: & + real(dp), dimension(nz,nx-1,ny-1) :: & Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector @@ -897,26 +977,33 @@ subroutine pcg_solver_standard_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector b ! solver converges when L2_resid/L2_rhs < tolerance - ! tridiagonal matrix elements + real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & + Muu, Mvv ! simplified SIA matrices for preconditioning + + ! arrays for tridiagonal preconditioning + ! 
Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above real(dp), dimension(:,:), allocatable :: & Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning - + real(dp), dimension(:,:), allocatable :: & omega_u, omega_v, & ! work arrays for tridiagonal solve - denom_u, denom_v, & + denom_u, denom_v, & xuh_u, xuh_v, & xlh_u, xlh_v - real(dp), dimension(:,:), allocatable :: & - b_u, b_v, x_u, x_v + ! Note: These two matrices are global in the EW and NS dimensions, respectively. + ! Each holds 8 pieces of information for each task on each row or column. + ! Since only 2 of these 8 pieces of information change from one iteration to the next, + ! it is more efficient to gather the remaining information once and pass the arrays + ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d + ! and gather all the information every time the subroutine is called. + ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? real(dp), dimension(:,:), allocatable :: & gather_data_row, & ! arrays for gathering data from every task on a row or column gather_data_col - integer :: ilocal, jlocal ! number of locally owned vertices in each direction - integer :: & tasks_row, & ! number of tasks per row and column for tridiagonal solve tasks_col @@ -927,11 +1014,6 @@ subroutine pcg_solver_standard_2d(nx, ny, & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi - !TODO - Apply the following for tridiag PCs? -! integer, parameter :: & -! maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, -! ! 
which generally leads to faster convergence than diagonal preconditioning - !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max @@ -959,33 +1041,48 @@ subroutine pcg_solver_standard_2d(nx, ny, & if (precond == HO_PRECOND_NONE) then ! no preconditioner - if (verbose_pcg .and. main_task) then + if (verbose_pcg .and. this_rank == rtest) then write(iulog,*) 'Using no preconditioner' endif elseif (precond == HO_PRECOND_DIAG) then - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Using diagonal matrix for preconditioning' - endif ! verbose_pcg - - call setup_preconditioner_diag_2d(nx, ny, & - indxA_2d, & - Auu, Avv, & - Adiagu, Adiagv) - - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + call setup_preconditioner_diag_3d(nx, ny, & + nz, indxA_3d, & + Auu, Avv, & + Adiagu, Adiagv) !WHL - debug - if (verbose_tridiag .and. this_rank==rtest) then + if (verbose_pcg .and. this_rank == rtest) then i = itest + j = jtest + write(iulog,*) 'i, j, r =', i, j, this_rank + write(iulog,*) 'Auu diag =', Adiagu(:,i,j) + write(iulog,*) 'Avv diag =', Adiagv(:,i,j) + endif + + elseif (precond == HO_PRECOND_SIA) then + + call setup_preconditioner_sia_3d(nx, ny, & + nz, indxA_3d, & + Auu, Avv, & + Muu, Mvv) + + if (verbose_pcg .and. this_rank == rtest) then j = jtest write(iulog,*) ' ' - write(iulog,*) 'r, i, j =', this_rank, i, j - write(iulog,*) 'Auu =', Auu(i,j,:) - write(iulog,*) 'Avv =', Avv(i,j,:) + write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' + do i = staggered_ihi, staggered_ilo, -1 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) + enddo + enddo ! i endif + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + ! 
Allocate tridiagonal preconditioning matrices allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) allocate(Asupdiag_u(nx-1,ny-1)) @@ -998,22 +1095,26 @@ subroutine pcg_solver_standard_2d(nx, ny, & allocate(omega_v (nx-1,ny-1)) allocate(denom_v (nx-1,ny-1)) - call setup_preconditioner_tridiag_local_2d(& - nx, ny, & - parallel, indxA_2d, & - itest, jtest, rtest, & - Auu, Avv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & + ! Compute arrays for tridiagonal preconditioning + + call setup_preconditioner_tridiag_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & denom_u, denom_v) elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - ! Allocate tridiagonal matrices - ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. - + ! Allocate tridiagonal preconditioning matrices + ! Note: (i,j) indices are switched for the A_v matrices to reduce striding. allocate(Adiag_u (ilocal,jlocal)) allocate(Asubdiag_u(ilocal,jlocal)) allocate(Asupdiag_u(ilocal,jlocal)) @@ -1021,8 +1122,6 @@ subroutine pcg_solver_standard_2d(nx, ny, & allocate(denom_u(ilocal,jlocal)) allocate(xuh_u(ilocal,jlocal)) allocate(xlh_u(ilocal,jlocal)) - allocate(b_u(ilocal,jlocal)) - allocate(x_u(ilocal,jlocal)) allocate(Adiag_v (jlocal,ilocal)) allocate(Asubdiag_v(jlocal,ilocal)) @@ -1031,8 +1130,6 @@ subroutine pcg_solver_standard_2d(nx, ny, & allocate(denom_v(jlocal,ilocal)) allocate(xuh_v(jlocal,ilocal)) allocate(xlh_v(jlocal,ilocal)) - allocate(b_v(jlocal,ilocal)) - allocate(x_v(jlocal,ilocal)) ! These two matrices are for gathering data from all tasks on a given row or column. allocate(gather_data_row(8*tasks_row,jlocal)) @@ -1040,61 +1137,25 @@ subroutine pcg_solver_standard_2d(nx, ny, & gather_data_row = 0.0d0 gather_data_col = 0.0d0 - ! 
Compute the entries of the tridiagonal matrices - - ! Extract tridiagonal matrix entries from Auu - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements - Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements - Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements - enddo - enddo - - ! compute work arrays for the u solve in each matrix row - call setup_preconditioner_tridiag_global_2d(& - ilocal, jlocal, & -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u) - - ! Extract tridiagonal matrix entries from Avv - - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements - Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements - Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements - enddo - enddo - - ! compute work arrays for the v solve in each matrix column - ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides - - call setup_preconditioner_tridiag_global_2d(& - jlocal, ilocal, & -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! 
itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v) + call setup_preconditioner_tridiag_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + ilocal, jlocal, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v) endif ! precond - if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' - call t_stopf("pcg_precond_init") ! Compute initial residual and initialize the direction vector d @@ -1113,9 +1174,9 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_init") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -1126,8 +1187,8 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! This will be correct for locally owned vertices. call t_startf("pcg_vecupdate_init") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate_init") ! Initialize scalars and vectors @@ -1135,18 +1196,18 @@ subroutine pcg_solver_standard_2d(nx, ny, & niters = maxiters eta0 = 1.d0 - du(:,:) = 0.d0 - dv(:,:) = 0.d0 + du(:,:,:) = 0.d0 + dv(:,:,:) = 0.d0 - zu(:,:) = 0.d0 - zv(:,:) = 0.d0 + zu(:,:,:) = 0.d0 + zv(:,:,:) = 0.d0 ! Compute the L2 norm of the RHS vectors ! (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - work0u(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) - work0v(:,:) = bv(:,:)*bv(:,:) + work0u(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) + work0v(:,:,:) = bv(:,:,:)*bv(:,:,:) call t_stopf("pcg_dotprod") ! 
find global sum of the squared L2 norm @@ -1163,182 +1224,101 @@ subroutine pcg_solver_standard_2d(nx, ny, & iter_loop: do iter = 1, maxiters - if (verbose_pcg .and. main_task) then -! write(iulog,*) 'iter =', iter - endif - call t_startf("pcg_precond") ! Compute PC(r) = solution z of Mz = r if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:) = ru(:,:) ! PC(r) = r - zv(:,:) = rv(:,:) ! PC(r) = r + zu(:,:,:) = ru(:,:,:) ! PC(r) = r + zv(:,:,:) = rv(:,:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning do j = 1, ny-1 do i = 1, nx-1 - if (Adiagu(i,j) /= 0.d0) then - zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A + do k = 1, nz + if (Adiagu(k,i,j) /= 0.d0) then + zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(i,j) = 0.d0 + zu(k,i,j) = 0.d0 endif - if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) + if (Adiagv(k,i,j) /= 0.d0) then + zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) else - zv(i,j) = 0.d0 + zv(k,i,j) = 0.d0 endif + enddo ! k enddo ! i enddo ! j - elseif(precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local + elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning - if (verbose_tridiag .and. this_rank == rtest) then - i = itest - j = jtest - write(iulog,*) 'Residual:' - write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) - write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) - write(iulog,*) ' ' - write(iulog,*) 'jtest =', jtest - write(iulog,*) 'i, ru, rv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) - enddo - endif + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Muu, ru, zu) ! solve Muu*zu = ru for zu - if (verbose_pcg .and. 
main_task) then - write(iulog,*) 'call tridiag_solver_local_2d' - endif + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Mvv, rv, zv) ! solve Mvv*zv = rv for zv - ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - !TODO - Test a local solver that can compute zu and zv in the halo - ! (to avoid the halo update below) + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - call tridiag_solver_local_2d(& + ! Use a local tridiagonal solver to find an approximate solution of A*z = r + + call tridiag_solver_local_3d(& nx, ny, & - parallel, & + nz, parallel, & + active_vertex, & itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix Asubdiag_u, Asubdiag_v, & Asupdiag_u, Asupdiag_v, & omega_u, omega_v, & denom_u, denom_v, & - ru, rv, & ! right hand side - zu, zv) ! solution - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) - enddo - endif - - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! Use a global tridiagonal solver to find an approximate solution of A*z = r + if (iter == 1) then first_time = .true. else first_time = .false. endif - ! 
convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - b_u(i,j) = ru(ii,jj) - enddo - enddo + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + first_time, & + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r - ! Solve M*z = r, where M is a global tridiagonal matrix + endif ! precond - call tridiag_solver_global_2d(& - ilocal, jlocal, & - parallel, tasks_row, & - 'row', & ! tridiagonal solve for each row -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u, & - b_u, x_u, & - first_time, & - gather_data_row) + call t_stopf("pcg_precond") - ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) - zu(:,:) = 0.0d0 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - zu(ii,jj) = x_u(i,j) - enddo - enddo - - ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - b_v(j,i) = rv(ii,jj) - enddo - enddo - - call tridiag_solver_global_2d(& - jlocal, ilocal, & - parallel, tasks_col, & - 'col', & ! tridiagonal solve for each column - !! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! 
itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v, & - b_v, x_v, & - first_time, & - gather_data_col) - - ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) - zv(:,:) = 0.0d0 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - zv(ii,jj) = x_v(j,i) - enddo - enddo - - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) - - endif ! precond - - call t_stopf("pcg_precond") - - ! Compute the dot product eta1 = (r, PC(r)) + ! Compute the dot product eta1 = (r, PC(r)) call t_startf("pcg_dotprod") - work0u(:,:) = ru(:,:)*zu(:,:) ! terms of dot product (r, PC(r)) - work0v(:,:) = rv(:,:)*zv(:,:) + work0u(:,:,:) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r, PC(r)) + work0v(:,:,:) = rv(:,:,:)*zv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") @@ -1349,7 +1329,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! then eta1 will be NaN. if (eta1 /= eta1) then ! eta1 is NaN - call write_log('PCG solver has failed, eta1 = NaN', GM_FATAL) + call write_log('PCG solver has failed, alpha = NaN', GM_FATAL) endif ! Update the conjugate direction vector d @@ -1357,13 +1337,13 @@ subroutine pcg_solver_standard_2d(nx, ny, & beta = eta1/eta0 call t_startf("pcg_vecupdate") - du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:) = zv(:,:) + beta*dv(:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - ! Initially eta0 = 1 - ! For n >=2, eta0 = old eta1 + du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! 
where beta_(i+1) = -------------------- + ! (r_i, PC(r_i)) + ! Initially eta0 = 1 + ! For n >=2, eta0 = old eta1 call t_stopf("pcg_vecupdate") ! Halo update for d @@ -1377,9 +1357,9 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! This is the one matvec multiply required for each iteration call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -1393,8 +1373,8 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute the dot product eta2 = (d, A*d) call t_startf("pcg_dotprod") - work0u(:,:) = du(:,:) * qu(:,:) ! terms of dot product (d, Ad) - work0v(:,:) = dv(:,:) * qv(:,:) + work0u(:,:,:) = du(:,:,:) * qu(:,:,:) ! terms of dot product (d, Ad) + work0v(:,:,:) = dv(:,:,:) * qv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") @@ -1416,13 +1396,26 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute the new solution and residual call t_startf("pcg_vecupdate") - xu(:,:) = xu(:,:) + alpha * du(:,:) ! new solution, x_(i+1) = x_i + alpha*d - xv(:,:) = xv(:,:) + alpha * dv(:,:) + xu(:,:,:) = xu(:,:,:) + alpha * du(:,:,:) ! new solution, x_(i+1) = x_i + alpha*d + xv(:,:,:) = xv(:,:,:) + alpha * dv(:,:,:) - ru(:,:) = ru(:,:) - alpha * qu(:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) - rv(:,:) = rv(:,:) - alpha * qv(:,:) + ru(:,:,:) = ru(:,:,:) - alpha * qu(:,:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) + rv(:,:,:) = rv(:,:,:) - alpha * qv(:,:,:) call t_stopf("pcg_vecupdate") + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' + do i = itest-3, itest+3 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i + endif + ! 
Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -1431,6 +1424,11 @@ subroutine pcg_solver_standard_2d(nx, ny, & if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then + if (verbose_pcg .and. main_task) then + write(iulog,*) ' ' + write(iulog,*) 'Check convergence, iter =', iter + endif + ! Halo update for x call t_startf("pcg_halo_resid") @@ -1441,9 +1439,9 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -1453,15 +1451,15 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute residual r = b - Ax call t_startf("pcg_vecupdate") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate") ! Compute squared L2 norm of (r, r) call t_startf("pcg_dotprod") - work0u(:,:) = ru(:,:)*ru(:,:) ! terms of dot product (r, r) - work0v(:,:) = rv(:,:)*rv(:,:) + work0u(:,:,:) = ru(:,:,:)*ru(:,:,:) ! terms of dot product (r, r) + work0v(:,:,:) = rv(:,:,:)*rv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") @@ -1474,6 +1472,10 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! compute normalized error err = L2_resid/L2_rhs + if (verbose_pcg .and. main_task) then + write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err + endif + !WHL - debug if (verbose_pcg .and. 
this_rank == rtest) then ru_max = 0.d0 @@ -1482,13 +1484,13 @@ subroutine pcg_solver_standard_2d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs(ru(i,j)) > ru_max) then - ru_max = ru(i,j) + if (sum(abs(ru(:,i,j))) > ru_max) then + ru_max = sum(abs(ru(:,i,j))) iu_max = i ju_max = j endif - if (abs(rv(i,j)) > rv_max) then - rv_max = rv(i,j) + if (sum(abs(rv(:,i,j))) > rv_max) then + rv_max = sum(abs(rv(:,i,j))) iv_max = i jv_max = j endif @@ -1529,19 +1531,16 @@ subroutine pcg_solver_standard_2d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) - if (allocated(b_u)) deallocate(b_u, b_v) - if (allocated(x_u)) deallocate(x_u, x_v) if (allocated(gather_data_row)) deallocate(gather_data_row) if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_standard_2d + end subroutine pcg_solver_standard_3d !**************************************************************************** - subroutine pcg_solver_chrongear_3d(nx, ny, & - nz, parallel, & - indxA_2d, indxA_3d, & - active_vertex, & + subroutine pcg_solver_chrongear_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -1553,104 +1552,26 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !--------------------------------------------------------------- ! This subroutine uses a Chronopoulos-Gear preconditioned conjugate-gradient - ! algorithm to solve the equation $Ax=b$. - ! - ! It is based on the Chronopoulos-Gear PCG solver in the POP ocean model - ! (author Frank Bryan, NCAR). It is a rearranged conjugate gradient solver - ! that reduces the number of global reductions per iteration from two to one - ! (not counting the convergence check). Convergence is checked every - ! {\em linear_solve_ncheck} steps. - ! - ! References are: - ! - ! 
Chronopoulos, A.T., A Class of Parallel Iterative Methods Implemented on Multiprocessors,
- ! Ph.D. thesis, Technical Report UIUCDCS-R-86-1267, Department of Computer Science,
- ! University of Illinois, Urbana, Illinois, pp. 1-116, 1986.
- !
- ! Chronopoulos, A.T., and C.W. Gear. s-step iterative methods
- ! for symmetric linear systems. J. Comput. Appl. Math., 25(2),
- ! 153-168, 1989.
- !
- ! Dongarra, J. and V. Eijkhout. LAPACK Working Note 159.
- ! Finite-choice algorithm optimization in conjugate gradients.
- ! Tech. Rep. ut-cs-03-502. Computer Science Department.
- ! University of Tennessee, Knoxville. 2003.
+ ! algorithm to solve the equation $Ax=b$. (See references in subroutine above.)
 !
- ! D Azevedo, E.F., V.L. Eijkhout, and C.H. Romine. LAPACK Working
- ! Note 56. Conjugate gradient algorithms with reduced
- ! synchronization overhead on distributed memory multiprocessors.
- ! Tech. Rep. CS-93-185. Computer Science Department.
- ! University of Tennessee, Knoxville. 1993.
- !---------------------------------------------------------------
+ ! It is similar to subroutine pcg_solver_chrongear_3d, but modified
+ ! to solve for x and y at a single horizontal level, as in the
+ ! shallow-shelf approximation. See the comments in that subroutine
+ ! (above) for more details on data structure and solver methods.
 !
- ! The input and output arrays are located on a structured (i,j,k) grid
- ! as defined in the glissade_velo_higher module.
- ! The global matrix is sparse, but its nonzero elements are stored in
- ! four dense matrices called Auu, Avv, Auv, and Avu.
- ! Each matrix has 3x3x3 = 27 potential nonzero elements per node (i,j,k).
+ ! Input and output arrays are located on a structured (i,j) grid
+ ! as defined in the glissade_velo_higher module. The global matrix
+ ! is sparse, but its nonzero elements are stored in four dense matrices
+ ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential
+ ! nonzero elements per node (i,j).
 !
- !
The current preconditioning options are + ! The current preconditioning options for the solver are ! (0) no preconditioning ! (1) diagonal preconditioning - ! (2) preconditioning using a physics-based SIA solver - ! - ! For the dome test case with higher-order dynamics, option (2) is best. - ! - ! Here is a schematic of the method implemented below for solving Ax = b: - ! - ! Set up preconditioner M - ! work0 = (b,b) - ! bb = global_sum(work0) - ! - ! First pass of algorithm: - ! halo_update(x) - ! r = b - A*x - ! halo_update(r) - ! solve Mz = r for z - ! work(1) = (r,z) - ! d = z - ! q = A*d - ! work(2) = (d,q) - ! halo_update(q) - ! rho_old = global_sum(work(1)) - ! sigma = global_sum(work(2)) - ! alpha = rho_old/sigma - ! x = x + alpha*d - ! r = r - alpha*q - ! - ! Iterative loop: - ! while (not converged) - ! solve Mz = r for z - ! Az = A*z - ! work(1) = (r,z) - ! work(2) = (Az,z) - ! halo_update(Az) - ! rho = global_sum(work(1)) - ! delta = global_sum(work(2)) - ! beta = rho/rho_old - ! sigma = delta - beta^2 * sigma - ! alpha = rho/sigma - ! rho_old = rho - ! d = z + beta*d - ! q = Az + beta*q - ! x = x + alpha*d - ! r = r - alpha*q - ! if (time to check convergence) then - ! r = b - A*x - ! work0 = (r,r) - ! halo_update(r) - ! rr = global_sum(work0) - ! if (sqrt(r,r)/sqrt(b,b) < tolerance) exit - ! endif - ! end while + ! (3) local tridiagonal preconditioning + ! (4) global tridiagonal preconditioning + ! The SIA-based preconditioning option is not available for a 2D solve. ! - ! where x = solution vector - ! d = conjugate direction vector - ! r = residual vector - ! M = preconditioning matrix - ! (r,z) = dot product of vectors r and z - ! and similarly for (Az,z), etc. - ! 
!--------------------------------------------------------------- !--------------------------------------------------------------- @@ -1658,9 +1579,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -1668,60 +1588,54 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 - integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & - indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 - logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & - Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) - ! other dimensions = (z,x,y) indices + real(dp), dimension(nx-1,ny-1,9), intent(in) :: & + Auu, Auv, & ! four components of assembled matrix + Avu, Avv ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) + ! 1st and 2nd dimensions = (x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! 
= 1 for diagonal preconditioning (best option for SSA-dominated flow) - ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, k, m ! grid indices - integer :: ii, jj - integer :: ilocal, jlocal ! number of locally owned vertices in each direction - integer :: iter ! iteration counter - integer :: maxiters_chrongear ! max number of linear iterations before quitting + integer :: i, j, ii, jj ! grid indices + integer :: m ! matrix element index + integer :: iter ! iteration counter real(dp) :: & alpha, &! rho/sigma = term in expression for new residual and solution @@ -1735,8 +1649,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & gsum ! result of global sum for dot products ! vectors (each of these is split into u and v components) - real(dp), dimension(nz,nx-1,ny-1) :: & - Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv + real(dp), dimension(nx-1,ny-1) :: & ru, rv, &! residual vector (b-Ax) du, dv, &! 
conjugate direction vector zu, zv, &! solution of Mz = r @@ -1744,7 +1657,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & Azu, Azv, &! result of matvec multiply A*z worku, workv ! intermediate results - real(dp), dimension(nz,nx-1,ny-1,2) :: & + real(dp), dimension(nx-1,ny-1,2) :: & work2u, work2v ! intermediate results real(dp) :: & @@ -1754,12 +1667,11 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & L2_rhs ! L2 norm of rhs vector = sqrt(b,b) ! solver is converged when L2_resid/L2_rhs < tolerance - real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & - Muu, Mvv ! simplified SIA matrices for preconditioning - - ! arrays for tridiagonal preconditioning - ! Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above + ! diagonal matrix elements + real(dp), dimension(nx-1,ny-1) :: & + Adiagu, Adiagv ! diagonal terms of matrices Auu and Avv + ! tridiagonal matrix elements real(dp), dimension(:,:), allocatable :: & Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning @@ -1770,18 +1682,17 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & xuh_u, xuh_v, & xlh_u, xlh_v - ! Note: These two matrices are global in the EW and NS dimensions, respectively. - ! Each holds 8 pieces of information for each task on each row or column. - ! Since only 2 of these 8 pieces of information change from one iteration to the next, - ! it is more efficient to gather the remaining information once and pass the arrays - ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d - ! and gather all the information every time the subroutine is called. - ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? + real(dp), dimension(:,:), allocatable :: & + b_u, b_v, x_u, x_v real(dp), dimension(:,:), allocatable :: & gather_data_row, & ! 
arrays for gathering data from every task on a row or column gather_data_col + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + + integer :: maxiters_chrongear ! max number of linear iterations before quitting + integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi @@ -1790,9 +1701,17 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & tasks_row, & ! number of tasks per row and column for tridiagonal solve tasks_col + integer, parameter :: & + maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, + ! which generally leads to faster convergence than diagonal preconditioning + + !WHL - debug + real(dp) :: usum, usum_global, vsum, vsum_global + !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max + real(dp) :: sum_temp staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi @@ -1802,22 +1721,17 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & tasks_row = parallel%tasks_row tasks_col = parallel%tasks_col - ! Note: maxiters_tridiag commented out here, because the BP tridiagonal solver - ! tends not to converge as well as the 2D version. - ! TODO: Make maxiters a config option. - ! Set the maximum number of linear iterations. ! Typically allow up to 200 iterations with diagonal preconditioning, but only 100 ! with tridiagonal, which usually converges faster. - !TODO - Test whether maxiters_tridiag (currently = 100) is sufficient for convergence with 3D solver -!! if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. precond == HO_PRECOND_TRIDIAG_GLOBAL) then -!! maxiters_chrongear = maxiters_tridiag -!! else + if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. precond == HO_PRECOND_TRIDIAG_GLOBAL) then + maxiters_chrongear = maxiters_tridiag + else maxiters_chrongear = maxiters -!! endif + endif - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. 
main_task) then write(iulog,*) 'Using native PCG solver (Chronopoulos-Gear)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters_chrongear, precond endif @@ -1832,14 +1746,14 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (precond == HO_PRECOND_NONE) then ! no preconditioner - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using no preconditioner' endif elseif (precond == HO_PRECOND_DIAG) then - call setup_preconditioner_diag_3d(nx, ny, & - nz, indxA_3d, & + call setup_preconditioner_diag_2d(nx, ny, & + indxA_2d, & Auu, Avv, & Adiagu, Adiagv) @@ -1848,32 +1762,22 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & i = itest j = jtest write(iulog,*) 'i, j, r =', i, j, this_rank - write(iulog,*) 'Auu diag =', Adiagu(:,i,j) - write(iulog,*) 'Avv diag =', Adiagv(:,i,j) + write(iulog,*) 'Au diag =', Adiagu(i,j) + write(iulog,*) 'Av diag =', Adiagv(i,j) endif - elseif (precond == HO_PRECOND_SIA) then - - call setup_preconditioner_sia_3d(nx, ny, & - nz, indxA_3d, & - Auu, Avv, & - Muu, Mvv) + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - if (verbose_pcg .and. this_rank == rtest) then + !WHL - debug + if (verbose_tridiag .and. this_rank==rtest) then + i = itest j = jtest write(iulog,*) ' ' - write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' - do i = staggered_ihi, staggered_ilo, -1 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) - enddo - enddo ! i + write(iulog,*) 'r, i, j =', this_rank, i, j + write(iulog,*) 'Auu =', Auu(i,j,:) + write(iulog,*) 'Avv =', Avv(i,j,:) endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - - ! Allocate tridiagonal preconditioning matrices allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) allocate(Asupdiag_u(nx-1,ny-1)) @@ -1886,26 +1790,22 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & allocate(omega_v (nx-1,ny-1)) allocate(denom_v (nx-1,ny-1)) - ! 
Compute arrays for tridiagonal preconditioning
-
-       call setup_preconditioner_tridiag_local_3d(&
-            nx,                ny,            &
-            nz,                parallel,      &
-            active_vertex,                    &
-            indxA_2d,          indxA_3d,      &
-            itest, jtest,      rtest,         &
-            Auu,               Avv,           &
-            Muu,               Mvv,           &
-            Adiag_u,           Adiag_v,       &
-            Asubdiag_u,        Asubdiag_v,    &
-            Asupdiag_u,        Asupdiag_v,    &
-            omega_u,           omega_v,       &
+       call setup_preconditioner_tridiag_local_2d(&
+            nx,             ny,            &
+            parallel,       indxA_2d,      &
+            itest, jtest,   rtest,         &
+            Auu,            Avv,           &
+            Adiag_u,        Adiag_v,       &
+            Asubdiag_u,     Asubdiag_v,    &
+            Asupdiag_u,     Asupdiag_v,    &
+            omega_u,        omega_v,       &
             denom_u, denom_v)
-
+
    elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then
 
       ! Allocate tridiagonal preconditioning matrices
-      ! Note: (i,j) indices are switched for the A_v matrices to reduce striding.
+      ! Note: (i,j) indices are switched for the A_v matrices to reduce striding.
+
       allocate(Adiag_u   (ilocal,jlocal))
       allocate(Asubdiag_u(ilocal,jlocal))
       allocate(Asupdiag_u(ilocal,jlocal))
@@ -1913,6 +1813,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, &
       allocate(denom_u(ilocal,jlocal))
       allocate(xuh_u(ilocal,jlocal))
       allocate(xlh_u(ilocal,jlocal))
+      allocate(b_u(ilocal,jlocal))
+      allocate(x_u(ilocal,jlocal))
 
       allocate(Adiag_v   (jlocal,ilocal))
       allocate(Asubdiag_v(jlocal,ilocal))
@@ -1921,6 +1823,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, &
       allocate(denom_v(jlocal,ilocal))
       allocate(xuh_v(jlocal,ilocal))
       allocate(xlh_v(jlocal,ilocal))
+      allocate(b_v(jlocal,ilocal))
+      allocate(x_v(jlocal,ilocal))
 
       ! These two matrices are for gathering data from all tasks on a given row or column.
       allocate(gather_data_row(8*tasks_row,jlocal))
@@ -1928,53 +1832,87 @@ subroutine pcg_solver_chrongear_3d(nx, ny, &
       gather_data_row = 0.0d0
       gather_data_col = 0.0d0
 
-      ! Compute arrays for tridiagonal preconditioning
+      ! 
Compute the entries of the tridiagonal matrices - call setup_preconditioner_tridiag_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - indxA_2d, indxA_3d, & - ilocal, jlocal, & - itest, jtest, rtest, & - Auu, Avv, & - Muu, Mvv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v) + ! Extract tridiagonal matrix entries from Auu + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements + Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements + Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the u solve in each matrix row + call setup_preconditioner_tridiag_global_2d(& + ilocal, jlocal, & +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u) + + ! Extract tridiagonal matrix entries from Avv + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements + Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements + Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the v solve in each matrix column + ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + + call setup_preconditioner_tridiag_global_2d(& + jlocal, ilocal, & +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! 
itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v) endif ! precond + !WHL - debug + if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' + call t_stopf("pcg_precond_init") !---- Initialize scalars and vectors niters = maxiters_chrongear - ru(:,:,:) = 0.d0 - rv(:,:,:) = 0.d0 - du(:,:,:) = 0.d0 - dv(:,:,:) = 0.d0 - zu(:,:,:) = 0.d0 - zv(:,:,:) = 0.d0 - qu(:,:,:) = 0.d0 - qv(:,:,:) = 0.d0 - Azu(:,:,:) = 0.d0 - Azv(:,:,:) = 0.d0 - worku(:,:,:) = 0.d0 - workv(:,:,:) = 0.d0 - work2u(:,:,:,:) = 0.d0 - work2v(:,:,:,:) = 0.d0 + ru(:,:) = 0.d0 + rv(:,:) = 0.d0 + du(:,:) = 0.d0 + dv(:,:) = 0.d0 + zu(:,:) = 0.d0 + zv(:,:) = 0.d0 + qu(:,:) = 0.d0 + qv(:,:) = 0.d0 + Azu(:,:) = 0.d0 + Azv(:,:) = 0.d0 + worku(:,:) = 0.d0 + workv(:,:) = 0.d0 + work2u(:,:,:) = 0.d0 + work2v(:,:,:) = 0.d0 !---- Compute the L2 norm of the RHS vectors !---- (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - worku(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) - workv(:,:,:) = bv(:,:,:)*bv(:,:,:) + worku(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) + workv(:,:) = bv(:,:)*bv(:,:) call t_stopf("pcg_dotprod") ! find global sum of the squared L2 norm @@ -1991,8 +1929,6 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! First pass of algorithm !--------------------------------------------------------------- - iter = 1 - ! Note: The matrix A must be complete for all rows corresponding to locally ! owned nodes, and x must have the correct values in ! halo nodes bordering the locally owned nodes. 
@@ -2008,9 +1944,9 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_init") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -2021,8 +1957,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- This is correct for locally owned nodes. call t_startf("pcg_vecupdate") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") !---- Halo update for residual @@ -2038,81 +1974,169 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! From here on, call timers with 'iter' suffix because this can be considered the first iteration call t_startf("pcg_precond_iter") - if (precond == HO_PRECOND_NONE) then ! no preconditioning + if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:,:) = ru(:,:,:) ! PC(r) = r - zv(:,:,:) = rv(:,:,:) ! PC(r) = r + zu(:,:) = ru(:,:) ! PC(r) = r + zv(:,:) = rv(:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + ! Solve Mz = r, where M is a diagonal matrix do j = 1, ny-1 do i = 1, nx-1 - do k = 1, nz - if (Adiagu(k,i,j) /= 0.d0) then - zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A + if (Adiagu(i,j) /= 0.d0) then + zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(k,i,j) = 0.d0 + zu(i,j) = 0.d0 endif - if (Adiagv(k,i,j) /= 0.d0) then - zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + if (Adiagv(i,j) /= 0.d0) then + zv(i,j) = rv(i,j) / Adiagv(i,j) else - zv(k,i,j) = 0.d0 + zv(i,j) = 0.d0 endif - enddo ! k enddo ! i enddo ! j - elseif (precond == HO_PRECOND_SIA) then ! 
local vertical shallow-ice solver for preconditioning + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then +! i = itest +! write(iulog,*) ' ' +! write(iulog,*) 'zv solve with diagonal precond, this_rank, i =', this_rank, i +! write(iulog,*) 'j, active, Adiagv, rv, zv, xv:' +! do j = staggered_jhi, staggered_jlo, -1 +! write(iulog,'(i4, l4, 2f12.3, e12.3, f12.3)') j, active_vertex(i,j), Adiagv(i,j), rv(i,j), zv(i,j), xv(i,j) +! enddo + endif - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Muu, ru, zu) ! solve Muu*zu = ru for zu + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Mvv, rv, zv) ! solve Mvv*zv = rv for zv + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest + write(iulog,*) 'Residual:' + write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) + write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'call tridiag_solver_local_2d' + endif - ! Use a local tridiagonal solver to find an approximate solution of A*z = r + ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - call tridiag_solver_local_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - Muu, Mvv, & ! entries of SIA matrix - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + !TODO - Test a local solver that can compute zu and zv in the halo + ! 
(to avoid the halo update below) - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + call tridiag_solver_local_2d(nx, ny, & + parallel, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + ru, rv, & ! right hand side + zu, zv) ! solution - ! Use a global tridiagonal solver to find an approximate solution of A*z = r + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif - call tridiag_solver_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - ilocal, jlocal, & - tasks_row, tasks_col, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v, & - Muu, Mvv, & ! entries of SIA matrix - gather_data_row, gather_data_col, & - .true., & ! first_time = T (first iteration) - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + + ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo + + ! Solve M*z = r, where M is a global tridiagonal matrix + + call tridiag_solver_global_2d(ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! 
tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + .true., & ! first_time + gather_data_row) + + ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo + + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo + + call tridiag_solver_global_2d(jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + .true., & ! first_time + gather_data_col) + + ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) endif ! 
precond @@ -2121,23 +2145,23 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute intermediate result for dot product (r,z) call t_startf("pcg_dotprod") - work2u(:,:,:,1) = ru(:,:,:) * zu(:,:,:) - work2v(:,:,:,1) = rv(:,:,:) * zv(:,:,:) + work2u(:,:,1) = ru(:,:) * zu(:,:) + work2v(:,:,1) = rv(:,:) * zv(:,:) call t_stopf("pcg_dotprod") !---- Compute the conjugate direction vector d !---- Since z is correct in halo, so is d - du(:,:,:) = zu(:,:,:) - dv(:,:,:) = zv(:,:,:) + du(:,:) = zu(:,:) + dv(:,:) = zv(:,:) !---- Compute q = A*d - !---- q is correct for locally owned nodes + !---- q is correct for locally owned nodes, provided d extends one layer into the halo call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -2147,14 +2171,14 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute intermediate result for dot product (d,q) = (d,Ad) call t_startf("pcg_dotprod") - work2u(:,:,:,2) = du(:,:,:) * qu(:,:,:) - work2v(:,:,:,2) = dv(:,:,:) * qv(:,:,:) + work2u(:,:,2) = du(:,:) * qu(:,:) + work2v(:,:,2) = dv(:,:) * qv(:,:) call t_stopf("pcg_dotprod") !---- Find global sums of (r,z) and (d,q) call t_startf("pcg_glbsum_iter") - gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! nflds = 2 call t_stopf("pcg_glbsum_iter") !---- Halo update for q @@ -2178,112 +2202,214 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- These are correct in halo call t_startf("pcg_vecupdate") - xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) - xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) + xu(:,:) = xu(:,:) + alpha*du(:,:) + xv(:,:) = xv(:,:) + alpha*dv(:,:) - ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) ! q = A*d - rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) + ru(:,:) = ru(:,:) - alpha*qu(:,:) ! 
q = A*d + rv(:,:) = rv(:,:) - alpha*qv(:,:) call t_stopf("pcg_vecupdate") + !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest - write(iulog,*) ' ' - write(iulog,*) 'iter =', iter - write(iulog,*) 'i, k, xu, xv, ru, rv:' +!! write(iulog,*) ' ' +!! write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) - enddo +!! write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) enddo ! i endif + !--------------------------------------------------------------- ! Iterate to solution !--------------------------------------------------------------- iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above + if (verbose_pcg .and. main_task) then +! write(iulog,*) 'iter =', iter + endif + !---- Compute PC(r) = solution z of Mz = r !---- z is correct in halo call t_startf("pcg_precond_iter") - if (precond == HO_PRECOND_NONE) then ! no preconditioning + if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:,:) = ru(:,:,:) ! PC(r) = r - zv(:,:,:) = rv(:,:,:) ! PC(r) = r + zu(:,:) = ru(:,:) ! PC(r) = r + zv(:,:) = rv(:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning do j = 1, ny-1 do i = 1, nx-1 - do k = 1, nz - if (Adiagu(k,i,j) /= 0.d0) then - zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A + if (Adiagu(i,j) /= 0.d0) then + zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(k,i,j) = 0.d0 + zu(i,j) = 0.d0 endif - if (Adiagv(k,i,j) /= 0.d0) then - zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + if (Adiagv(i,j) /= 0.d0) then + zv(i,j) = rv(i,j) / Adiagv(i,j) else - zv(k,i,j) = 0.d0 + zv(i,j) = 0.d0 endif - enddo ! k enddo ! i enddo ! j - elseif (precond == HO_PRECOND_SIA) then ! 
local vertical shallow-ice solver for preconditioning + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! tridiagonal preconditioning with local solve - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Muu, ru, zu) ! solve Muu*zu = ru for zu + ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Mvv, rv, zv) ! solve Mvv*zv = rv for zv + !TODO - Test a local solver that can compute zu and zv in the halo + ! (to avoid the halo update below) - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + !WHL - debug + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest +! write(iulog,*) 'Residual:' +! write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) +! write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) +! write(iulog,*) ' ' +! write(iulog,*) 'jtest =', jtest +! write(iulog,*) 'i, ru, rv:' +! do i = staggered_ihi, staggered_ilo, -1 +! write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) +! enddo + endif - ! Use a local tridiagonal solver to find an approximate solution of A*z = r - call tridiag_solver_local_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - Muu, Mvv, & ! entries of SIA matrix - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + write(iulog,*) ' call tridiag_solver_local_2d' + endif + + call tridiag_solver_local_2d(nx, ny, & + parallel, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + ru, rv, & ! right hand side + zu, zv) ! 
solution + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - ! Use a global tridiagonal solver to find an approximate solution of A*z = r + !WHL - debug + if (verbose_tridiag .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif - call tridiag_solver_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - ilocal, jlocal, & - tasks_row, tasks_col, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v, & - Muu, Mvv, & ! entries of SIA matrix - gather_data_row, gather_data_col, & - .false., & ! first_time = F (iteration 2+) - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo + + !WHL - debug + if (verbose_pcg .and. 
this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'Before global tridiag PC u solve, r, j =', rtest, jtest + write(iulog,*) ' ' + write(iulog,*) 'i, Adiag_u, Asubdiag_u, Asupdiag_u, b_u:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4e16.8)') i, Adiag_u(i,j), Asubdiag_u(i,j), Asupdiag_u(i,j), b_u(i,j) + enddo + endif + + call tridiag_solver_global_2d(ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + .false., & ! first_time + gather_data_row) + + ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo + + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo + + call tridiag_solver_global_2d(jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + .false., & ! first_time + gather_data_col) + + ! 
convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) endif ! precond @@ -2291,27 +2417,26 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute Az = A*z !---- This is the one matvec multiply required per iteration - !---- Az is correct for local owned nodes and needs a halo update (below) + !---- Az is correct for locally owned nodes and needs a halo update (below) call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & zu, zv, & Azu, Azv) call t_stopf("pcg_matmult_iter") - !---- Compute intermediate results for the dot products (r,z) and (Az,z) call t_startf("pcg_dotprod") - work2u(:,:,:,1) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r,z) - work2v(:,:,:,1) = rv(:,:,:)*zv(:,:,:) + work2u(:,:,1) = ru(:,:)*zu(:,:) ! terms of dot product (r,z) + work2v(:,:,1) = rv(:,:)*zv(:,:) - work2u(:,:,:,2) = Azu(:,:,:)*zu(:,:,:) ! terms of dot product (A*z,z) - work2v(:,:,:,2) = Azv(:,:,:)*zv(:,:,:) + work2u(:,:,2) = Azu(:,:)*zu(:,:) ! terms of dot product (A*z,z) + work2v(:,:,2) = Azv(:,:)*zv(:,:) call t_stopf("pcg_dotprod") ! Take the global sums of (r,z) and (Az,z) @@ -2319,11 +2444,10 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! this is the one MPI global reduction per iteration. call t_startf("pcg_glbsum_iter") - gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! 
nflds = 2
      call t_stopf("pcg_glbsum_iter")
 
      !---- Halo update for Az
-     !---- This is the one halo update required per iteration
 
      call t_startf("pcg_halo_iter")
      call staggered_parallel_halo(Azu, parallel)
@@ -2342,7 +2466,7 @@
 
      if (alpha /= alpha) then  ! alpha is NaN
!!       write(iulog,*) 'rho, sigma, alpha:', rho, sigma, alpha
-        call write_log('Chron-Gear PCG solver has failed, alpha = NaN', GM_FATAL)
+        call write_log('Chron-Gear PCG solver has failed, alpha = NaN', GM_FATAL)
      endif
 
      !---- Update d and q
@@ -2350,38 +2474,25 @@
 
      call t_startf("pcg_vecupdate")
 
-        du(:,:,:) = zu(:,:,:) + beta*du(:,:,:)   ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i
-        dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:)   !
-                                                 !                     (r_(i+1), PC(r_(i+1)))
-                                                 ! where beta_(i+1) =  --------------------
-                                                 !                        (r_i, PC(r_i))
-        qu(:,:,:) = Azu(:,:,:) + beta*qu(:,:,:)
-        qv(:,:,:) = Azv(:,:,:) + beta*qv(:,:,:)
+        du(:,:) = zu(:,:) + beta*du(:,:)   ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i
+        dv(:,:) = zv(:,:) + beta*dv(:,:)   !
+                                           !                     (r_(i+1), PC(r_(i+1)))
+                                           ! where beta_(i+1) =  --------------------
+                                           !                        (r_i, PC(r_i))
+        qu(:,:) = Azu(:,:) + beta*qu(:,:)
+        qv(:,:) = Azv(:,:) + beta*qv(:,:)
 
      !---- Update solution and residual
      !---- These are correct in halo
 
-        xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:)
-        xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:)
+        xu(:,:) = xu(:,:) + alpha*du(:,:)
+        xv(:,:) = xv(:,:) + alpha*dv(:,:)
 
-        ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:)
-        rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:)
+        ru(:,:) = ru(:,:) - alpha*qu(:,:)
+        rv(:,:) = rv(:,:) - alpha*qv(:,:)
 
      call t_stopf("pcg_vecupdate")
 
-     if (verbose_pcg .and. this_rank == rtest) then
-        j = jtest
-        write(iulog,*) ' '
-        write(iulog,*) 'iter =', iter
-        write(iulog,*) 'i, k, xu, xv, ru, rv:'
-        do i = itest-3, itest+3
-           write(iulog,*) ' '
-           do k = 1, nz
-              write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j)
-           enddo
-        enddo  ! i
-     endif
-
      ! 
Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -2392,15 +2503,19 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (verbose_pcg .and. main_task) then write(iulog,*) ' ' - write(iulog,*) 'Check convergence, iter =', iter + write(iulog,*) ' check convergence, iter =', iter endif !---- Compute z = A*x (use z as a temp vector for A*x) + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(xu, parallel) +!! call staggered_parallel_halo(xv, parallel) + call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -2409,27 +2524,35 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute residual r = b - A*x + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(bu, parallel) +!! call staggered_parallel_halo(bv, parallel) + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(zu, parallel) +!! call staggered_parallel_halo(zv, parallel) + call t_startf("pcg_vecupdate") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(ru, parallel) +!! 
call staggered_parallel_halo(rv, parallel) + !---- Compute dot product (r, r) call t_startf("pcg_dotprod") - worku(:,:,:) = ru(:,:,:)*ru(:,:,:) - workv(:,:,:) = rv(:,:,:)*rv(:,:,:) + worku(:,:) = ru(:,:)*ru(:,:) + workv(:,:) = rv(:,:)*rv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") - ! take square root - L2_resid = sqrt(rr) - - ! compute normalized error - err = L2_resid/L2_rhs + L2_resid = sqrt(rr) ! L2 norm of residual + err = L2_resid/L2_rhs ! normalized error if (verbose_pcg .and. main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err @@ -2443,13 +2566,13 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (sum(abs(ru(:,i,j))) > ru_max) then - ru_max = sum(abs(ru(:,i,j))) + if (abs(ru(i,j)) > ru_max) then + ru_max = ru(i,j) iu_max = i ju_max = j endif - if (sum(abs(rv(:,i,j))) > rv_max) then - rv_max = sum(abs(rv(:,i,j))) + if (abs(rv(i,j)) > rv_max) then + rv_max = rv(i,j) iv_max = i jv_max = j endif @@ -2497,16 +2620,19 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(b_u)) deallocate(b_u, b_v) + if (allocated(x_u)) deallocate(x_u, x_v) if (allocated(gather_data_row)) deallocate(gather_data_row) if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_chrongear_3d + end subroutine pcg_solver_chrongear_2d !**************************************************************************** - - subroutine pcg_solver_chrongear_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + + subroutine pcg_solver_chrongear_3d(nx, ny, & + nz, parallel, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -2518,26 +2644,104 @@ 
subroutine pcg_solver_chrongear_2d(nx, ny, & !--------------------------------------------------------------- ! This subroutine uses a Chronopoulos-Gear preconditioned conjugate-gradient - ! algorithm to solve the equation $Ax=b$. (See references in subroutine above.) + ! algorithm to solve the equation $Ax=b$. ! - ! It is similar to subroutine pcg_solver_chrongear_3d, but modified - ! to solve for x and y at a single horizontal level, as in the - ! shallow-shelf approximation. See the comments in that subroutine - ! (above) for more details on data structure and solver methods. + ! It is based on the Chronopoulos-Gear PCG solver in the POP ocean model + ! (author Frank Bryan, NCAR). It is a rearranged conjugate gradient solver + ! that reduces the number of global reductions per iteration from two to one + ! (not counting the convergence check). Convergence is checked every + ! {\em linear_solve_ncheck} steps. ! - ! Input and output arrays are located on a structured (i,j) grid - ! as defined in the glissade_velo_higher module. The global matrix - ! is sparse, but its nonzero element are stored in four dense matrices - ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential - ! nonzero elements per node (i,j). + ! References are: ! - ! The current preconditioning options for the solver are + ! Chronopoulos, A.T., A Class of Parallel Iterative Methods Implemented on Multiprocessors, + ! Ph.D. thesis, Technical Report UIUCDCS-R-86-1267, Department of Computer Science, + ! University of Illinois, Urbana, Illinois, pp. 1-116, 1986. + ! + ! Chronopoulos, A.T., and C.W. Gear. s-step iterative methods + ! for symmetric linear systems. J. Comput. Appl. Math., 25(2), + ! 153-168, 1989. + ! + ! Dongarra, J. and V. Eijkhout. LAPACK Working Note 159. + ! Finite-choice algorithm optimization in conjugate gradients. + ! Tech. Rep. ut-cs-03-502. Computer Science Department. + ! University of Tennessee, Knoxville. 2003. + ! + ! D Azevedo, E.F., V.L. Eijkhout, and C.H. 
Romine. LAPACK Working + ! Note 56. Conjugate gradient algorithms with reduced + ! synchronization overhead on distributed memory multiprocessors. + ! Tech. Rep. CS-93-185. Computer Science Department. + ! University of Tennessee, Knoxville. 1993. + !--------------------------------------------------------------- + ! + ! The input and output arrays are located on a structured (i,j,k) grid + ! as defined in the glissade_velo_higher module. + ! The global matrix is sparse, but its nonzero elements are stored in + ! four dense matrices called Auu, Avv, Auv, and Avu. + ! Each matrix has 3x3x3 = 27 potential nonzero elements per node (i,j,k). + ! + ! The current preconditioning options are ! (0) no preconditioning ! (1) diagonal preconditioning - ! (3) local tridiagonal preconditioning - ! (4) global tridiagonal preconditioning - ! The SIA-based preconditioning option is not available for a 2D solve. + ! (2) preconditioning using a physics-based SIA solver + ! + ! For the dome test case with higher-order dynamics, option (2) is best. + ! + ! Here is a schematic of the method implemented below for solving Ax = b: + ! + ! Set up preconditioner M + ! work0 = (b,b) + ! bb = global_sum(work0) + ! + ! First pass of algorithm: + ! halo_update(x) + ! r = b - A*x + ! halo_update(r) + ! solve Mz = r for z + ! work(1) = (r,z) + ! d = z + ! q = A*d + ! work(2) = (d,q) + ! halo_update(q) + ! rho_old = global_sum(work(1)) + ! sigma = global_sum(work(2)) + ! alpha = rho_old/sigma + ! x = x + alpha*d + ! r = r - alpha*q + ! + ! Iterative loop: + ! while (not converged) + ! solve Mz = r for z + ! Az = A*z + ! work(1) = (r,z) + ! work(2) = (Az,z) + ! halo_update(Az) + ! rho = global_sum(work(1)) + ! delta = global_sum(work(2)) + ! beta = rho/rho_old + ! sigma = delta - beta^2 * sigma + ! alpha = rho/sigma + ! rho_old = rho + ! d = z + beta*d + ! q = Az + beta*q + ! x = x + alpha*d + ! r = r - alpha*q + ! if (time to check convergence) then + ! r = b - A*x + ! work0 = (r,r) + ! 
halo_update(r) + ! rr = global_sum(work0) + ! if (sqrt(r,r)/sqrt(b,b) < tolerance) exit + ! endif + ! end while ! + ! where x = solution vector + ! d = conjugate direction vector + ! r = residual vector + ! M = preconditioning matrix + ! (r,z) = dot product of vectors r and z + ! and similarly for (Az,z), etc. + ! !--------------------------------------------------------------- !--------------------------------------------------------------- @@ -2545,8 +2749,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) + nx, ny, & ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) + nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -2554,54 +2759,60 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 + integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & + indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 + logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nx-1,ny-1,9), intent(in) :: & - Auu, Auv, & ! four components of assembled matrix - Avu, Avv ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) - ! 1st and 2nd dimensions = (x,y) indices + real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & + Auu, Auv, Avu, Avv ! four components of assembled matrix + ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) + ! other dimensions = (z,x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! 
assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! = 1 for diagonal preconditioning (best option for SSA-dominated flow) + ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, ii, jj ! grid indices - integer :: m ! matrix element index - integer :: iter ! iteration counter + integer :: i, j, k, m ! grid indices + integer :: ii, jj + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + integer :: iter ! iteration counter + integer :: maxiters_chrongear ! max number of linear iterations before quitting real(dp) :: & alpha, &! rho/sigma = term in expression for new residual and solution @@ -2615,7 +2826,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & gsum ! 
result of global sum for dot products ! vectors (each of these is split into u and v components) - real(dp), dimension(nx-1,ny-1) :: & + real(dp), dimension(nz,nx-1,ny-1) :: & + Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector zu, zv, &! solution of Mz = r @@ -2623,7 +2835,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & Azu, Azv, &! result of matvec multiply A*z worku, workv ! intermediate results - real(dp), dimension(nx-1,ny-1,2) :: & + real(dp), dimension(nz,nx-1,ny-1,2) :: & work2u, work2v ! intermediate results real(dp) :: & @@ -2633,11 +2845,12 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector = sqrt(b,b) ! solver is converged when L2_resid/L2_rhs < tolerance - ! diagonal matrix elements - real(dp), dimension(nx-1,ny-1) :: & - Adiagu, Adiagv ! diagonal terms of matrices Auu and Avv + real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & + Muu, Mvv ! simplified SIA matrices for preconditioning + + ! arrays for tridiagonal preconditioning + ! Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above - ! tridiagonal matrix elements real(dp), dimension(:,:), allocatable :: & Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning @@ -2648,17 +2861,18 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & xuh_u, xuh_v, & xlh_u, xlh_v - real(dp), dimension(:,:), allocatable :: & - b_u, b_v, x_u, x_v + ! Note: These two matrices are global in the EW and NS dimensions, respectively. + ! Each holds 8 pieces of information for each task on each row or column. + ! Since only 2 of these 8 pieces of information change from one iteration to the next, + ! it is more efficient to gather the remaining information once and pass the arrays + ! 
with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d + ! and gather all the information every time the subroutine is called. + ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? real(dp), dimension(:,:), allocatable :: & gather_data_row, & ! arrays for gathering data from every task on a row or column gather_data_col - integer :: ilocal, jlocal ! number of locally owned vertices in each direction - - integer :: maxiters_chrongear ! max number of linear iterations before quitting - integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi @@ -2667,17 +2881,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & tasks_row, & ! number of tasks per row and column for tridiagonal solve tasks_col - integer, parameter :: & - maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, - ! which generally leads to faster convergence than diagonal preconditioning - - !WHL - debug - real(dp) :: usum, usum_global, vsum, vsum_global - !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max - real(dp) :: sum_temp staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi @@ -2687,17 +2893,22 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & tasks_row = parallel%tasks_row tasks_col = parallel%tasks_col + ! Note: maxiters_tridiag commented out here, because the BP tridiagonal solver + ! tends not to converge as well as the 2D version. + ! TODO: Make maxiters a config option. + ! Set the maximum number of linear iterations. ! Typically allow up to 200 iterations with diagonal preconditioning, but only 100 ! with tridiagonal, which usually converges faster. - if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. 
precond == HO_PRECOND_TRIDIAG_GLOBAL) then - maxiters_chrongear = maxiters_tridiag - else + !TODO - Test whether maxiters_tridiag (currently = 100) is sufficient for convergence with 3D solver +!! if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. precond == HO_PRECOND_TRIDIAG_GLOBAL) then +!! maxiters_chrongear = maxiters_tridiag +!! else maxiters_chrongear = maxiters - endif +!! endif - if (verbose_pcg .and. main_task) then + if (verbose_pcg .and. this_rank == rtest) then write(iulog,*) 'Using native PCG solver (Chronopoulos-Gear)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters_chrongear, precond endif @@ -2712,14 +2923,14 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (precond == HO_PRECOND_NONE) then ! no preconditioner - if (verbose_pcg .and. main_task) then + if (verbose_pcg .and. this_rank == rtest) then write(iulog,*) 'Using no preconditioner' endif elseif (precond == HO_PRECOND_DIAG) then - call setup_preconditioner_diag_2d(nx, ny, & - indxA_2d, & + call setup_preconditioner_diag_3d(nx, ny, & + nz, indxA_3d, & Auu, Avv, & Adiagu, Adiagv) @@ -2728,22 +2939,32 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & i = itest j = jtest write(iulog,*) 'i, j, r =', i, j, this_rank - write(iulog,*) 'Au diag =', Adiagu(i,j) - write(iulog,*) 'Av diag =', Adiagv(i,j) + write(iulog,*) 'Auu diag =', Adiagu(:,i,j) + write(iulog,*) 'Avv diag =', Adiagv(:,i,j) endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + elseif (precond == HO_PRECOND_SIA) then - !WHL - debug - if (verbose_tridiag .and. this_rank==rtest) then - i = itest + call setup_preconditioner_sia_3d(nx, ny, & + nz, indxA_3d, & + Auu, Avv, & + Muu, Mvv) + + if (verbose_pcg .and. 
this_rank == rtest) then j = jtest write(iulog,*) ' ' - write(iulog,*) 'r, i, j =', this_rank, i, j - write(iulog,*) 'Auu =', Auu(i,j,:) - write(iulog,*) 'Avv =', Avv(i,j,:) + write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' + do i = staggered_ihi, staggered_ilo, -1 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) + enddo + enddo ! i endif + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + ! Allocate tridiagonal preconditioning matrices allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) allocate(Asupdiag_u(nx-1,ny-1)) @@ -2756,22 +2977,26 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(omega_v (nx-1,ny-1)) allocate(denom_v (nx-1,ny-1)) - call setup_preconditioner_tridiag_local_2d(& - nx, ny, & - parallel, indxA_2d, & - itest, jtest, rtest, & - Auu, Avv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v) + ! Compute arrays for tridiagonal preconditioning + call setup_preconditioner_tridiag_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v) + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! Allocate tridiagonal preconditioning matrices - ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. - + ! Note: (i,j) indices are switched for the A_v matrices to reduce striding. 
allocate(Adiag_u (ilocal,jlocal)) allocate(Asubdiag_u(ilocal,jlocal)) allocate(Asupdiag_u(ilocal,jlocal)) @@ -2779,8 +3004,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(denom_u(ilocal,jlocal)) allocate(xuh_u(ilocal,jlocal)) allocate(xlh_u(ilocal,jlocal)) - allocate(b_u(ilocal,jlocal)) - allocate(x_u(ilocal,jlocal)) allocate(Adiag_v (jlocal,ilocal)) allocate(Asubdiag_v(jlocal,ilocal)) @@ -2789,8 +3012,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(denom_v(jlocal,ilocal)) allocate(xuh_v(jlocal,ilocal)) allocate(xlh_v(jlocal,ilocal)) - allocate(b_v(jlocal,ilocal)) - allocate(x_v(jlocal,ilocal)) ! These two matrices are for gathering data from all tasks on a given row or column. allocate(gather_data_row(8*tasks_row,jlocal)) @@ -2798,87 +3019,53 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & gather_data_row = 0.0d0 gather_data_col = 0.0d0 - ! Compute the entries of the tridiagonal matrices - - ! Extract tridiagonal matrix entries from Auu - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements - Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements - Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements - enddo - enddo - - ! compute work arrays for the u solve in each matrix row - call setup_preconditioner_tridiag_global_2d(& - ilocal, jlocal, & -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u) - - ! Extract tridiagonal matrix entries from Avv - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements - Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! 
diagonal elements - Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements - enddo - enddo - - ! compute work arrays for the v solve in each matrix column - ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + ! Compute arrays for tridiagonal preconditioning - call setup_preconditioner_tridiag_global_2d(& - jlocal, ilocal, & -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v) + call setup_preconditioner_tridiag_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + ilocal, jlocal, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v) endif ! precond - !WHL - debug - if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' - call t_stopf("pcg_precond_init") !---- Initialize scalars and vectors niters = maxiters_chrongear - ru(:,:) = 0.d0 - rv(:,:) = 0.d0 - du(:,:) = 0.d0 - dv(:,:) = 0.d0 - zu(:,:) = 0.d0 - zv(:,:) = 0.d0 - qu(:,:) = 0.d0 - qv(:,:) = 0.d0 - Azu(:,:) = 0.d0 - Azv(:,:) = 0.d0 - worku(:,:) = 0.d0 - workv(:,:) = 0.d0 - work2u(:,:,:) = 0.d0 - work2v(:,:,:) = 0.d0 + ru(:,:,:) = 0.d0 + rv(:,:,:) = 0.d0 + du(:,:,:) = 0.d0 + dv(:,:,:) = 0.d0 + zu(:,:,:) = 0.d0 + zv(:,:,:) = 0.d0 + qu(:,:,:) = 0.d0 + qv(:,:,:) = 0.d0 + Azu(:,:,:) = 0.d0 + Azv(:,:,:) = 0.d0 + worku(:,:,:) = 0.d0 + workv(:,:,:) = 0.d0 + work2u(:,:,:,:) = 0.d0 + work2v(:,:,:,:) = 0.d0 !---- Compute the L2 norm of the RHS vectors !---- (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - worku(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) - workv(:,:) = bv(:,:)*bv(:,:) + worku(:,:,:) = bu(:,:,:)*bu(:,:,:) ! 
terms of dot product (b, b) + workv(:,:,:) = bv(:,:,:)*bv(:,:,:) call t_stopf("pcg_dotprod") ! find global sum of the squared L2 norm @@ -2895,6 +3082,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! First pass of algorithm !--------------------------------------------------------------- + iter = 1 + ! Note: The matrix A must be complete for all rows corresponding to locally ! owned nodes, and x must have the correct values in ! halo nodes bordering the locally owned nodes. @@ -2910,9 +3099,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_init") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -2923,8 +3112,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- This is correct for locally owned nodes. call t_startf("pcg_vecupdate") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate") !---- Halo update for residual @@ -2940,169 +3129,81 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! From here on, call timers with 'iter' suffix because this can be considered the first iteration call t_startf("pcg_precond_iter") - if (precond == HO_PRECOND_NONE) then ! no preconditioning + if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:) = ru(:,:) ! PC(r) = r - zv(:,:) = rv(:,:) ! PC(r) = r + zu(:,:,:) = ru(:,:,:) ! PC(r) = r + zv(:,:,:) = rv(:,:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning - ! Solve Mz = r, where M is a diagonal matrix do j = 1, ny-1 do i = 1, nx-1 - if (Adiagu(i,j) /= 0.d0) then - zu(i,j) = ru(i,j) / Adiagu(i,j) ! 
PC(r), where PC is formed from diagonal elements of A + do k = 1, nz + if (Adiagu(k,i,j) /= 0.d0) then + zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(i,j) = 0.d0 + zu(k,i,j) = 0.d0 endif - if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) + if (Adiagv(k,i,j) /= 0.d0) then + zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) else - zv(i,j) = 0.d0 + zv(k,i,j) = 0.d0 endif + enddo ! k enddo ! i enddo ! j - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then -! i = itest -! write(iulog,*) ' ' -! write(iulog,*) 'zv solve with diagonal precond, this_rank, i =', this_rank, i -! write(iulog,*) 'j, active, Adiagv, rv, zv, xv:' -! do j = staggered_jhi, staggered_jlo, -1 -! write(iulog,'(i4, l4, 2f12.3, e12.3, f12.3)') j, active_vertex(i,j), Adiagv(i,j), rv(i,j), zv(i,j), xv(i,j) -! enddo - endif - - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local - - if (verbose_tridiag .and. this_rank == rtest) then - i = itest - j = jtest - write(iulog,*) 'Residual:' - write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) - write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) - write(iulog,*) ' ' - write(iulog,*) 'jtest =', jtest - write(iulog,*) 'i, ru, rv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) - enddo - endif - - if (verbose_pcg .and. main_task) then - write(iulog,*) 'call tridiag_solver_local_2d' - endif - - ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - - !TODO - Test a local solver that can compute zu and zv in the halo - ! (to avoid the halo update below) - - call tridiag_solver_local_2d(nx, ny, & - parallel, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - ru, rv, & ! right hand side - zu, zv) ! solution - - !WHL - debug - if (verbose_pcg .and. 
this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) - enddo - endif - - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) - - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - - ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - b_u(i,j) = ru(ii,jj) - enddo - enddo - - ! Solve M*z = r, where M is a global tridiagonal matrix + elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning - call tridiag_solver_global_2d(ilocal, jlocal, & - parallel, tasks_row, & - 'row', & ! tridiagonal solve for each row -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u, & - b_u, x_u, & - .true., & ! first_time - gather_data_row) + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Muu, ru, zu) ! solve Muu*zu = ru for zu - ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) - zu(:,:) = 0.0d0 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - zu(ii,jj) = x_u(i,j) - enddo - enddo + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Mvv, rv, zv) ! solve Mvv*zv = rv for zv - ! 
convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - b_v(j,i) = rv(ii,jj) - enddo - enddo + ! Use a local tridiagonal solver to find an approximate solution of A*z = r - call tridiag_solver_global_2d(jlocal, ilocal, & - parallel, tasks_col, & - 'col', & ! tridiagonal solve for each column -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v, & - b_v, x_v, & - .true., & ! first_time - gather_data_col) + call tridiag_solver_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r - ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - zv(:,:) = 0.0d0 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - zv(ii,jj) = x_v(j,i) - enddo - enddo + ! Use a global tridiagonal solver to find an approximate solution of A*z = r - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! 
entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + .true., & ! first_time = T (first iteration) + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r endif ! precond @@ -3111,23 +3212,23 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute intermediate result for dot product (r,z) call t_startf("pcg_dotprod") - work2u(:,:,1) = ru(:,:) * zu(:,:) - work2v(:,:,1) = rv(:,:) * zv(:,:) + work2u(:,:,:,1) = ru(:,:,:) * zu(:,:,:) + work2v(:,:,:,1) = rv(:,:,:) * zv(:,:,:) call t_stopf("pcg_dotprod") !---- Compute the conjugate direction vector d !---- Since z is correct in halo, so is d - du(:,:) = zu(:,:) - dv(:,:) = zv(:,:) + du(:,:,:) = zu(:,:,:) + dv(:,:,:) = zv(:,:,:) !---- Compute q = A*d - !---- q is correct for locally owned nodes, provided d extends one layer into the halo + !---- q is correct for locally owned nodes call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -3137,14 +3238,14 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute intermediate result for dot product (d,q) = (d,Ad) call t_startf("pcg_dotprod") - work2u(:,:,2) = du(:,:) * qu(:,:) - work2v(:,:,2) = dv(:,:) * qv(:,:) + work2u(:,:,:,2) = du(:,:,:) * qu(:,:,:) + work2v(:,:,:,2) = dv(:,:,:) * qv(:,:,:) call t_stopf("pcg_dotprod") !---- Find global sums of (r,z) and (d,q) call t_startf("pcg_glbsum_iter") - gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! 
nflds = 2 + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) call t_stopf("pcg_glbsum_iter") !---- Halo update for q @@ -3168,20 +3269,23 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- These are correct in halo call t_startf("pcg_vecupdate") - xu(:,:) = xu(:,:) + alpha*du(:,:) - xv(:,:) = xv(:,:) + alpha*dv(:,:) + xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) + xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) - ru(:,:) = ru(:,:) - alpha*qu(:,:) ! q = A*d - rv(:,:) = rv(:,:) - alpha*qv(:,:) + ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) ! q = A*d + rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) call t_stopf("pcg_vecupdate") - !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest -!! write(iulog,*) ' ' -!! write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' do i = itest-3, itest+3 -!! write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo enddo ! i endif @@ -3191,190 +3295,86 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - if (verbose_pcg .and. main_task) then -! write(iulog,*) 'iter =', iter - endif - !---- Compute PC(r) = solution z of Mz = r !---- z is correct in halo call t_startf("pcg_precond_iter") - if (precond == HO_PRECOND_NONE) then ! no preconditioning - - zu(:,:) = ru(:,:) ! PC(r) = r - zv(:,:) = rv(:,:) ! PC(r) = r - - elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning - - do j = 1, ny-1 - do i = 1, nx-1 - if (Adiagu(i,j) /= 0.d0) then - zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A - else - zu(i,j) = 0.d0 - endif - if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) - else - zv(i,j) = 0.d0 - endif - enddo ! i - enddo ! j - - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! 
tridiagonal preconditioning with local solve - - ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - - !TODO - Test a local solver that can compute zu and zv in the halo - ! (to avoid the halo update below) - - !WHL - debug - if (verbose_tridiag .and. this_rank == rtest) then - i = itest - j = jtest -! write(iulog,*) 'Residual:' -! write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) -! write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) -! write(iulog,*) ' ' -! write(iulog,*) 'jtest =', jtest -! write(iulog,*) 'i, ru, rv:' -! do i = staggered_ihi, staggered_ilo, -1 -! write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) -! enddo - endif - - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) ' call tridiag_solver_local_2d' - endif - - call tridiag_solver_local_2d(nx, ny, & - parallel, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - ru, rv, & ! right hand side - zu, zv) ! solution - - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) - - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) - enddo - endif - - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + if (precond == HO_PRECOND_NONE) then ! no preconditioning - !WHL - debug - if (verbose_tridiag .and. 
this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'jtest =', jtest - write(iulog,*) 'i, ru, rv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) - enddo - endif + zu(:,:,:) = ru(:,:,:) ! PC(r) = r + zv(:,:,:) = rv(:,:,:) ! PC(r) = r - ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - b_u(i,j) = ru(ii,jj) - enddo - enddo + do j = 1, ny-1 + do i = 1, nx-1 + do k = 1, nz + if (Adiagu(k,i,j) /= 0.d0) then + zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A + else + zu(k,i,j) = 0.d0 + endif + if (Adiagv(k,i,j) /= 0.d0) then + zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + else + zv(k,i,j) = 0.d0 + endif + enddo ! k + enddo ! i + enddo ! j - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'Before global tridiag PC u solve, r, j =', rtest, jtest - write(iulog,*) ' ' - write(iulog,*) 'i, Adiag_u, Asubdiag_u, Asupdiag_u, b_u:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4e16.8)') i, Adiag_u(i,j), Asubdiag_u(i,j), Asupdiag_u(i,j), b_u(i,j) - enddo - endif + elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning - call tridiag_solver_global_2d(ilocal, jlocal, & - parallel, tasks_row, & - 'row', & ! tridiagonal solve for each row -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u, & - b_u, x_u, & - .false., & ! first_time - gather_data_row) + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Muu, ru, zu) ! solve Muu*zu = ru for zu - ! 
convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) - zu(:,:) = 0.0d0 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - zu(ii,jj) = x_u(i,j) - enddo - enddo + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Mvv, rv, zv) ! solve Mvv*zv = rv for zv - ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - b_v(j,i) = rv(ii,jj) - enddo - enddo + ! Use a local tridiagonal solver to find an approximate solution of A*z = r - call tridiag_solver_global_2d(jlocal, ilocal, & - parallel, tasks_col, & - 'col', & ! tridiagonal solve for each column -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v, & - b_v, x_v, & - .false., & ! first_time - gather_data_col) + call tridiag_solver_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r - ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - zv(:,:) = 0.0d0 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - zv(ii,jj) = x_v(j,i) - enddo - enddo + ! 
Use a global tridiagonal solver to find an approximate solution of A*z = r - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + .false., & ! first_time = F (iteration 2+) + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r endif ! precond @@ -3382,26 +3382,27 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute Az = A*z !---- This is the one matvec multiply required per iteration - !---- Az is correct for locally owned nodes and needs a halo update (below) + !---- Az is correct for local owned nodes and needs a halo update (below) call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & zu, zv, & Azu, Azv) call t_stopf("pcg_matmult_iter") + !---- Compute intermediate results for the dot products (r,z) and (Az,z) call t_startf("pcg_dotprod") - work2u(:,:,1) = ru(:,:)*zu(:,:) ! terms of dot product (r,z) - work2v(:,:,1) = rv(:,:)*zv(:,:) + work2u(:,:,:,1) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r,z) + work2v(:,:,:,1) = rv(:,:,:)*zv(:,:,:) - work2u(:,:,2) = Azu(:,:)*zu(:,:) ! terms of dot product (A*z,z) - work2v(:,:,2) = Azv(:,:)*zv(:,:) + work2u(:,:,:,2) = Azu(:,:,:)*zu(:,:,:) ! 
terms of dot product (A*z,z) + work2v(:,:,:,2) = Azv(:,:,:)*zv(:,:,:) call t_stopf("pcg_dotprod") ! Take the global sums of (r,z) and (Az,z) @@ -3409,10 +3410,11 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! this is the one MPI global reduction per iteration. call t_startf("pcg_glbsum_iter") - gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! nflds = 2 + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) call t_stopf("pcg_glbsum_iter") !---- Halo update for Az + !---- This is the one halo update required per iteration call t_startf("pcg_halo_iter") call staggered_parallel_halo(Azu, parallel) @@ -3431,7 +3433,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (alpha /= alpha) then ! alpha is NaN !! write(iulog,*) 'rho, sigma, alpha:', rho, sigma, alpha - call write_log('Chron_Gear PCG solver has failed, alpha = NaN', GM_FATAL) + call write_log('Chron-Gear PCG solver has failed, alpha = NaN', GM_FATAL) endif !---- Update d and q @@ -3439,25 +3441,38 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & call t_startf("pcg_vecupdate") - du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:) = zv(:,:) + beta*dv(:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - qu(:,:) = Azu(:,:) + beta*qu(:,:) - qv(:,:) = Azv(:,:) + beta*qv(:,:) + du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! where beta_(i+1) = -------------------- + ! 
(r_i, PC(r_i)) + qu(:,:,:) = Azu(:,:,:) + beta*qu(:,:,:) + qv(:,:,:) = Azv(:,:,:) + beta*qv(:,:,:) !---- Update solution and residual !---- These are correct in halo - xu(:,:) = xu(:,:) + alpha*du(:,:) - xv(:,:) = xv(:,:) + alpha*dv(:,:) + xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) + xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) - ru(:,:) = ru(:,:) - alpha*qu(:,:) - rv(:,:) = rv(:,:) - alpha*qv(:,:) + ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) + rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) call t_stopf("pcg_vecupdate") + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' + do i = itest-3, itest+3 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i + endif + ! Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -3468,19 +3483,15 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (verbose_pcg .and. main_task) then write(iulog,*) ' ' - write(iulog,*) ' check convergence, iter =', iter + write(iulog,*) 'Check convergence, iter =', iter endif !---- Compute z = A*x (use z as a temp vector for A*x) - !WHL - debug - don't think this is needed, but try just in case -!! call staggered_parallel_halo(xu, parallel) -!! call staggered_parallel_halo(xv, parallel) - call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -3489,35 +3500,27 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute residual r = b - A*x - !WHL - debug - don't think this is needed, but try just in case -!! 
call staggered_parallel_halo(bu, parallel) -!! call staggered_parallel_halo(bv, parallel) - !WHL - debug - don't think this is needed, but try just in case -!! call staggered_parallel_halo(zu, parallel) -!! call staggered_parallel_halo(zv, parallel) - call t_startf("pcg_vecupdate") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate") - !WHL - debug - don't think this is needed, but try just in case -!! call staggered_parallel_halo(ru, parallel) -!! call staggered_parallel_halo(rv, parallel) - !---- Compute dot product (r, r) call t_startf("pcg_dotprod") - worku(:,:) = ru(:,:)*ru(:,:) - workv(:,:) = rv(:,:)*rv(:,:) + worku(:,:,:) = ru(:,:,:)*ru(:,:,:) + workv(:,:,:) = rv(:,:,:)*rv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") - L2_resid = sqrt(rr) ! L2 norm of residual - err = L2_resid/L2_rhs ! normalized error + ! take square root + L2_resid = sqrt(rr) + + ! compute normalized error + err = L2_resid/L2_rhs if (verbose_pcg .and. 
main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err @@ -3531,13 +3534,13 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs(ru(i,j)) > ru_max) then - ru_max = ru(i,j) + if (sum(abs(ru(:,i,j))) > ru_max) then + ru_max = sum(abs(ru(:,i,j))) iu_max = i ju_max = j endif - if (abs(rv(i,j)) > rv_max) then - rv_max = rv(i,j) + if (sum(abs(rv(:,i,j))) > rv_max) then + rv_max = sum(abs(rv(:,i,j))) iv_max = i jv_max = j endif @@ -3585,12 +3588,10 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) - if (allocated(b_u)) deallocate(b_u, b_v) - if (allocated(x_u)) deallocate(x_u, x_v) if (allocated(gather_data_row)) deallocate(gather_data_row) if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_chrongear_2d + end subroutine pcg_solver_chrongear_3d !**************************************************************************** From e60ae9bcc687fee89596f8990a4fa0f6baae7051 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Fri, 23 Jan 2026 15:20:41 -0700 Subject: [PATCH 16/21] Reproducible sums for norm of residual vector This commit implements reproducible sums for computations of the L2 norm (and related quantities) of the residual vector A*x - b in the velocity solver. The summed L2 norm is compared to a target value to check whether the solver has converged. Reproducible sums are needed only if the L2 norm is very close to the target, such that the norm exceeds the target for (say) 128 processors and falls below the target for 256 processors, due to roundoff differences. This is unlikely but not impossible, so it's better to be on the safe side. 
--- libglissade/glissade_velo_higher.F90 | 192 ++++++++++++++------------- 1 file changed, 97 insertions(+), 95 deletions(-) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index d68d3253..4bcf8755 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -9463,6 +9463,10 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & ! Compute the residual vector Ax - b and its L2 norm. ! This subroutine assumes that the matrix is stored in structured (x/y/z) format. + !---------------------------------------------------------------- + ! Input/output variables + !---------------------------------------------------------------- + integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions (for scalars) nz ! number of vertical levels where velocity is computed @@ -9502,8 +9506,13 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & real(dp), intent(out), optional :: & L2_norm_relative ! L2 norm of residual vector relative to rhs, |Ax - b| / |b| + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- + real(dp), dimension(nz,nx-1,ny-1) :: & - resid_sq ! resid_u^2 + resid_v^2 + worku, workv, & ! work arrays for global sums + resid_sq ! resid_u^2 + resid_v^2 real(dp) :: my_max_resid, global_max_resid @@ -9525,79 +9534,61 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & resid_u(:,:,:) = 0.d0 resid_v(:,:,:) = 0.d0 - !TODO - Replace the following by a call to matvec_multiply_structured_3d ! Loop over locally owned vertices - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - - if (active_vertex(i,j)) then - - do k = 1, nz - - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 - - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. 
j+jA <= ny-1) ) then - - m = indxA_3d(iA,jA,kA) - - resid_u(k,i,j) = resid_u(k,i,j) & - + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & - + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - - resid_v(k,i,j) = resid_v(k,i,j) & - + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & - + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - - endif ! in bounds - - enddo ! kA - enddo ! iA - enddo ! jA + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + do kA = -1,1 + do jA = -1,1 + do iA = -1,1 + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then - enddo ! k + m = indxA_3d(iA,jA,kA) - endif ! active_vertex + resid_u(k,i,j) = resid_u(k,i,j) & + + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & + + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - enddo ! i + resid_v(k,i,j) = resid_v(k,i,j) & + + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & + + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) + endif ! in bounds + enddo ! kA + enddo ! iA + enddo ! jA + enddo ! k + endif ! active_vertex + enddo ! i enddo ! j ! Subtract b to get A*x - b - ! Sum up squared L2 norm as we go - L2_norm = 0.d0 - resid_sq(:,:,:) = 0.0d0 + worku(:,:,:) = 0.0d0 + workv(:,:,:) = 0.0d0 ! Loop over locally owned vertices - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - do k = 1, nz - resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j) - resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j) - resid_sq(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) + resid_v(k,i,j)*resid_v(k,i,j) - L2_norm = L2_norm + resid_sq(k,i,j) - enddo ! k - endif ! active vertex - enddo ! i + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j) + resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j) + worku(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) + workv(k,i,j) = resid_v(k,i,j)*resid_v(k,i,j) + enddo ! k + endif ! active vertex + enddo ! i enddo ! j ! 
Take global sum, then take square root - L2_norm = parallel_reduce_sum(L2_norm) + L2_norm = parallel_global_sum_stagger(worku, parallel, workv) L2_norm = sqrt(L2_norm) -!! sum_resid_u_sq = parallel_global_sum(bu*bu, parallel, active_vertex) -!! sum_resid_v_sq = parallel_global_sum(bv*bv, parallel, active_vertex) -!! L2_norm = parallel_global_sum(resid_sq, parallel) -!! L2_norm = sqrt(L2_norm) - if (verbose_residual) then if (this_rank==rtest) then @@ -9633,26 +9624,28 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & if (present(L2_norm_relative)) then ! compute L2_norm relative to rhs - L2_norm_rhs = 0.d0 + worku(:,:,:) = 0.0d0 + workv(:,:,:) = 0.0d0 do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - do k = 1, nz - L2_norm_rhs = L2_norm_rhs + bu(k,i,j)*bu(k,i,j) + bv(k,i,j)*bv(k,i,j) - enddo ! k - endif ! active vertex - enddo ! i + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + worku(k,i,j) = bu(k,i,j)*bu(k,i,j) + workv(k,i,j) = bv(k,i,j)*bv(k,i,j) + enddo ! k + endif ! active vertex + enddo ! i enddo ! j ! Take global sum, then take square root - L2_norm_rhs = parallel_reduce_sum(L2_norm_rhs) + L2_norm_rhs = parallel_global_sum_stagger(worku, parallel, workv) L2_norm_rhs = sqrt(L2_norm_rhs) - if (L2_norm_rhs > 0.d0) then + if (L2_norm_rhs > 0.0d0) then L2_norm_relative = L2_norm / L2_norm_rhs else - L2_norm_relative = 0.d0 + L2_norm_relative = 0.0d0 endif endif @@ -9675,6 +9668,10 @@ subroutine compute_residual_vector_2d(nx, ny, & ! Compute the residual vector Ax - b and its L2 norm. ! This subroutine assumes that the matrix is stored in structured (x/y/z) format. + !---------------------------------------------------------------- + ! Input/output arguments + !---------------------------------------------------------------- + integer, intent(in) :: & nx, ny ! 
horizontal grid dimensions (for scalars) @@ -9713,8 +9710,13 @@ subroutine compute_residual_vector_2d(nx, ny, & real(dp), intent(out), optional :: & L2_norm_relative ! L2 norm of residual vector relative to rhs, |Ax - b| / |b| + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- + real(dp), dimension(nx-1,ny-1) :: & - resid_sq ! resid_u^2 + resid_v^2 + worku, workv, & ! work arrays for global sums + resid_sq ! resid_u^2 + resid_v^2 real(dp) :: my_max_resid, global_max_resid @@ -9760,27 +9762,23 @@ subroutine compute_residual_vector_2d(nx, ny, & enddo ! jA ! Subtract b to get A*x - b - ! Sum up squared L2 norm as we go - - L2_norm = 0.d0 - resid_sq(:,:) = 0.0d0 + worku(:,:) = 0.0d0 + workv(:,:) = 0.0d0 ! Loop over locally owned vertices - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - resid_u(i,j) = resid_u(i,j) - bu(i,j) - resid_v(i,j) = resid_v(i,j) - bv(i,j) - resid_sq(i,j) = resid_u(i,j)*resid_u(i,j) + resid_v(i,j)*resid_v(i,j) - L2_norm = L2_norm + resid_sq(i,j) - endif ! active vertex - enddo ! i - enddo ! j + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + resid_u(i,j) = resid_u(i,j) - bu(i,j) + resid_v(i,j) = resid_v(i,j) - bv(i,j) + worku(i,j) = resid_u(i,j)*resid_u(i,j) + workv(i,j) = resid_v(i,j)*resid_v(i,j) + endif ! active vertex + enddo ! i + enddo ! j ! Take global sum, then take square root - - L2_norm = parallel_reduce_sum(L2_norm) + L2_norm = parallel_global_sum_stagger(worku, parallel, workv) L2_norm = sqrt(L2_norm) if (verbose_residual) then @@ -9795,6 +9793,7 @@ subroutine compute_residual_vector_2d(nx, ny, & ! Compute max value of (squared) residual on this task. ! If this task owns the vertex with the global max residual, then print a diagnostic message. 
+ resid_sq(:,:) = worku(:,:) + workv(:,:) my_max_resid = maxval(resid_sq) global_max_resid = parallel_reduce_max(my_max_resid) @@ -9815,24 +9814,27 @@ subroutine compute_residual_vector_2d(nx, ny, & if (present(L2_norm_relative)) then ! compute L2_norm relative to rhs - L2_norm_rhs = 0.d0 + worku(:,:) = 0.0d0 + workv(:,:) = 0.0d0 + ! Loop over locally owned vertices do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - L2_norm_rhs = L2_norm_rhs + bu(i,j)*bu(i,j) + bv(i,j)*bv(i,j) - endif ! active vertex - enddo ! i - enddo ! j + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + worku(i,j) = bu(i,j)*bu(i,j) + workv(i,j) = bv(i,j)*bv(i,j) + endif ! active vertex + enddo ! i + enddo ! j ! Take global sum, then take square root - L2_norm_rhs = parallel_reduce_sum(L2_norm_rhs) + L2_norm_rhs = parallel_global_sum_stagger(worku, parallel, workv) L2_norm_rhs = sqrt(L2_norm_rhs) - if (L2_norm_rhs > 0.d0) then + if (L2_norm_rhs > 0.0d0) then L2_norm_relative = L2_norm / L2_norm_rhs else - L2_norm_relative = 0.d0 + L2_norm_relative = 0.0d0 endif endif From 4a89b90215a6a4b17a9a8acd712cd4436a60ffe1 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Fri, 23 Jan 2026 15:59:22 -0700 Subject: [PATCH 17/21] Reordered some subroutines in glissade_velo_higher This commit simply moves subroutines linked to the 2d velocity solver above subroutines linked to the 3d velocity solver, as done earlier for the PCG subroutine. Historically, Blatter-Pattyn was coded before SSA/L1L2/DIVA, which is why the 3d subroutines came first. 
--- libglissade/glissade_velo_higher.F90 | 3385 +++++++++++++------------- 1 file changed, 1651 insertions(+), 1734 deletions(-) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 4bcf8755..66074862 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -4898,26 +4898,38 @@ end subroutine lateral_shelf_bc !**************************************************************************** - subroutine assemble_stiffness_matrix_3d(nx, ny, & - nz, sigma, & + subroutine assemble_stiffness_matrix_2d(nx, ny, & + nz, & + sigma, stagsigma, & nhalo, & itest, jtest, rtest, & active_cell, & xVertex, yVertex, & - uvel, vvel, & + uvel_2d, vvel_2d, & stagusrf, stagthck, & - flwafact, whichapprox, & + flwa, flwafact, & + whichapprox, & + diva_slope_factor_x, diva_slope_factor_y, & whichefvs, efvs, & efvs_constant, effstrain_min, & Auu, Auv, & - Avu, Avv) + Avu, Avv, & + dusrf_dx, dusrf_dy, & + thck, & + btractx, btracty, & + omega_k, omega, & + efvs_qp_3d) !---------------------------------------------------------------- ! Assemble the stiffness matrix A in the linear system Ax = b. ! This subroutine is called for each nonlinear iteration if ! we are iterating on the effective viscosity. + ! The matrix A can be based on the shallow-shelf approximation or + ! the depth-integrated L1L2 approximation (Schoof and Hindmarsh, 2010). !---------------------------------------------------------------- + use glissade_grid_operators, only: glissade_vertical_average + !---------------------------------------------------------------- ! Input-output arguments !---------------------------------------------------------------- @@ -4925,37 +4937,45 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions nz, & ! number of vertical levels at which velocity is computed - ! Note: the number of elements per column is nz-1 + ! (used for flwafact) nhalo ! 
number of halo layers real(dp), dimension(nz), intent(in) :: & - sigma ! sigma vertical coordinate + sigma ! sigma vertical coordinate + + real(dp), dimension(nz-1), intent(in) :: & + stagsigma ! staggered sigma vertical coordinate integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point logical, dimension(nx,ny), intent(in) :: & - active_cell ! true if cell contains ice and borders a locally owned vertex + active_cell ! true if cell contains ice and borders a locally owned vertex real(dp), dimension(nx-1,ny-1), intent(in) :: & - xVertex, yVertex ! x and y coordinates of vertices + xVertex, yVertex ! x and y coordinates of vertices - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & - uvel, vvel ! velocity components (m/yr) + real(dp), dimension(nx-1,ny-1), intent(in) :: & + uvel_2d, vvel_2d ! 2D velocity components (m/yr) real(dp), dimension(nx-1,ny-1), intent(in) :: & stagusrf, & ! upper surface elevation on staggered grid (m) stagthck ! ice thickness on staggered grid (m) + !TODO - Pass in flwa only, and compute flwafact here? real(dp), dimension(nz-1,nx,ny), intent(in) :: & - flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), + flwa, &! temperature-based flow factor A, Pa^{-n} yr^{-1} + flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), Pa yr^(1/n) ! used to compute the effective viscosity - ! units: Pa yr^(1/n) integer, intent(in) :: & - whichapprox, & ! option for Stokes approximation (BP, SSA, SIA) + whichapprox, & ! option for Stokes approximation (BP, L1L2, SSA, SIA) whichefvs ! option for effective viscosity calculation + real(dp), dimension(nx,ny), intent(in) :: & + diva_slope_factor_x, & ! correction factor for DIVA in x direction, based on theta_slope_x + diva_slope_factor_y ! correction factor for DIVA in y direction, based on theta_slope_y + real(dp), dimension(nz-1,nx,ny), intent(out) :: & efvs ! 
effective viscosity (Pa yr) @@ -4963,36 +4983,61 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & efvs_constant, & ! constant value of effective viscosity (Pa yr) effstrain_min ! minimum value of effective strain rate (yr^-1) for computing viscosity - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(out) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(out) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts Avu, Avv + ! The following optional arguments are used for the L1L2 approximation only + + real(dp), dimension(nx-1,ny-1), intent(in), optional :: & + dusrf_dx, & ! upper surface elevation gradient on staggered grid (m/m) + dusrf_dy ! needed for L1L2 assembly only + + ! The following optional arguments are used for DIVA only + + real(dp), dimension(nx,ny), intent(in), optional :: & + thck ! ice thickness (m) + + real(dp), dimension(nx-1,ny-1), intent(in), optional :: & + btractx, btracty ! components of basal traction (Pa) + + real(dp), dimension(nz,nx,ny), intent(out), optional :: & + omega_k ! single integral, defined by Goldberg (2011) eq. 32 + + real(dp), dimension(nx,ny), intent(out), optional :: & + omega ! double integral, defined by Goldberg (2011) eq. 35 + ! Note: omega here = Goldberg's omega/H + + real(dp), dimension(nz-1,nQuadPoints_2d,nx,ny), intent(inout), optional :: & + efvs_qp_3d ! effective viscosity (Pa yr) + !--------------------------------------------------------- ! Local variables !--------------------------------------------------------- - real(dp), dimension(nQuadPoints_3d) :: & + real(dp), dimension(nQuadPoints_2d) :: & detJ ! determinant of J - real(dp), dimension(nNodesPerElement_3d) :: & - dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis function, evaluated at quad pt + real(dp), dimension(nNodesPerElement_2d) :: & + dphi_dx_2d, dphi_dy_2d, dphi_dz_2d ! derivatives of basis function, evaluated at quad pts + ! 
set dphi_dz = 0 for 2D problem !---------------------------------------------------------------- - ! Note: Kuu, Kuv, Kvu, and Kvv are 8x8 components of the stiffness matrix - ! for the local element. (The combined stiffness matrix is 16x16.) + ! Note: Kuu, Kuv, Kvu, and Kvv are 4x4 components of the stiffness matrix + ! for the local element. (The combined stiffness matrix is 8x8.) ! - ! Once these matrices are formed, their coefficients are summed into the assembled - ! matrices Auu, Auv, Avu, Avv. The A matrices each have as many rows as there are - ! active nodes, but only 27 columns, corresponding to the 27 vertices that belong to - ! the 8 elements sharing a given node. + ! Once these matrices are formed, their coefficients are summed into the global + ! matrices Auu_2d, Auv_2d, Avu_2d, Avv_2d. The global matrices each have as + ! many rows as there are active vertices, but only 9 columns, corresponding to + ! the 9 vertices of the 4 elements sharing a given node. ! ! The native structured PCG solver works with the dense A matrices in the form ! computed here. For the SLAP solver, the terms of the A matrices are put - ! in a sparse matrix during preprocessing. For the Trilinos solver, the terms - ! of the A matrices are passed to Trilinos one row at a time. + ! in a sparse matrix format during preprocessing. For the Trilinos solver, + ! the terms of the A matrices are passed to Trilinos one row at a time. !---------------------------------------------------------------- - real(dp), dimension(nNodesPerElement_3d, nNodesPerElement_3d) :: & ! + real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) :: & ! Kuu, & ! element stiffness matrix, divided into 4 parts as shown below Kuv, & ! Kvu, & ! @@ -5003,25 +5048,36 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & ! ! Kvu may not be needed if matrix is symmetric, but is included for now - real(dp), dimension(nNodesPerElement_3d) :: & - x, y, z, & ! Cartesian coordinates of nodes - u, v, & ! 
u and v at nodes - s ! upper surface elevation at nodes + real(dp), dimension(nNodesPerElement_2d) :: & + x, y, & ! Cartesian coordinates of vertices + u, v, & ! depth-integrated mean velocity at vertices (m/yr) + h, & ! thickness at vertices (m) + s, & ! upper surface elevation at vertices (m) + bx, by, & ! basal traction at vertices (Pa) (DIVA only) + dsdx, dsdy ! upper surface elevation gradient at vertices (m/m) (L1L2 only) - real(dp), dimension(nQuadPoints_3d) :: & - efvs_qp ! effective viscosity at a quad pt + real(dp), dimension(nQuadPoints_2d) :: & + efvs_qp_vertavg ! vertically averaged effective viscosity at a quad pt + + real(dp) :: & + h_qp ! thickness at a quad pt + + real(dp), dimension(nz-1,nQuadPoints_2d) :: & + efvs_qp ! effective viscosity at each layer in a cell column + ! corresponding to a quad pt logical, parameter :: & check_symmetry_element = .true. ! if true, then check symmetry of element matrix - !Note: Can speed up assembly a bit by setting to false for production + + real(dp), dimension(nx,ny) :: & + flwafact_2d ! vertically averaged flow factor integer :: i, j, k, n, p - integer :: iNode, jNode, kNode + integer :: iVertex, jVertex if (verbose_matrix .and. main_task) then write(iulog,*) ' ' - write(iulog,*) 'In assemble_stiffness_matrix_3d' - write(iulog,*) 'itest, jtest, ktest, rtest =', itest, jtest, ktest, rtest + write(iulog,*) 'In assemble_stiffness_matrix_2d' endif ! Initialize effective viscosity @@ -5029,10 +5085,17 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & ! Initialize global stiffness matrix - Auu(:,:,:,:) = 0.d0 - Auv(:,:,:,:) = 0.d0 - Avu(:,:,:,:) = 0.d0 - Avv(:,:,:,:) = 0.d0 + Auu(:,:,:) = 0.d0 + Auv(:,:,:) = 0.d0 + Avu(:,:,:) = 0.d0 + Avv(:,:,:) = 0.d0 + + ! Compute vertical average of flow factor (SSA only) + if (whichapprox == HO_APPROX_SSA) then + call glissade_vertical_average(nx, ny, & + nz, sigma, & + flwafact, flwafact_2d) + endif ! Sum over elements in active cells ! 
Loop over all cells that border locally owned vertices. @@ -5042,112 +5105,200 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & if (active_cell(i,j)) then - do k = 1, nz-1 ! loop over elements in this column - ! assume k increases from upper surface to bed - - ! Initialize element stiffness matrix - Kuu(:,:) = 0.d0 - Kuv(:,:) = 0.d0 - Kvu(:,:) = 0.d0 - Kvv(:,:) = 0.d0 + ! Initialize element stiffness matrix + Kuu(:,:) = 0.d0 + Kuv(:,:) = 0.d0 + Kvu(:,:) = 0.d0 + Kvv(:,:) = 0.d0 - ! compute spatial coordinates, velocity, and upper surface elevation for each node - - do n = 1, nNodesPerElement_3d + ! Compute spatial coordinates, velocity, thickness and surface elevation for each vertex + ! Also compute surface elevation gradient (for L1L2) and basal traction (for DIVA) + do n = 1, nNodesPerElement_2d - ! Determine (k,i,j) for this node - ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). - ! Indices for other nodes are computed relative to this node. - iNode = i + ishift(7,n) - jNode = j + jshift(7,n) - kNode = k + kshift(7,n) + ! Determine (i,j) for this vertex + ! The reason for the '3' is that node 3, in the NE corner of the grid cell, has index (i,j). + ! Indices for other nodes are computed relative to this vertex. + iVertex = i + ishift(3,n) + jVertex = j + jshift(3,n) - x(n) = xVertex(iNode,jNode) - y(n) = yVertex(iNode,jNode) - z(n) = stagusrf(iNode,jNode) - sigma(kNode)*stagthck(iNode,jNode) - u(n) = uvel(kNode,iNode,jNode) - v(n) = vvel(kNode,iNode,jNode) - s(n) = stagusrf(iNode,jNode) + x(n) = xVertex(iVertex,jVertex) + y(n) = yVertex(iVertex,jVertex) + u(n) = uvel_2d(iVertex,jVertex) + v(n) = vvel_2d(iVertex,jVertex) + s(n) = stagusrf(iVertex,jVertex) + h(n) = stagthck(iVertex,jVertex) + if (present(dusrf_dx) .and. present(dusrf_dy)) then ! L1L2 + dsdx(n) = dusrf_dx(iVertex,jVertex) + dsdy(n) = dusrf_dy(iVertex,jVertex) + endif + if (present(btractx) .and. present(btracty)) then ! 
DIVA + bx(n) = btractx(iVertex,jVertex) + by(n) = btracty(iVertex,jVertex) + endif - if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then - write(iulog,*) ' ' - write(iulog,*) 'i, j, k, n, x, y, z:', i, j, k, n, x(n), y(n), z(n) - write(iulog,*) 's, u, v:', s(n), u(n), v(n) - endif + if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'i, j, n, x, y:', i, j, n, x(n), y(n) + write(iulog,*) 's, h, u, v:', s(n), h(n), u(n), v(n) + if (present(btractx) .and. present(btracty)) write(iulog,*) 'bx, by:', bx(n), by(n) + endif - enddo ! nodes per element + enddo ! vertices per element - ! Loop over quadrature points for this element + ! Loop over quadrature points for this element - do p = 1, nQuadPoints_3d - - ! Evaluate the derivatives of the element basis functions at this quadrature point. - !WHL - Pass in i, j, k, and p to the following subroutines for debugging. + do p = 1, nQuadPoints_2d - call get_basis_function_derivatives_3d(x(:), y(:), z(:), & - dphi_dxr_3d(:,p), dphi_dyr_3d(:,p), dphi_dzr_3d(:,p), & - dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & - detJ(p), & - itest, jtest, rtest, & - i, j, k, p) + ! Evaluate the derivatives of the element basis functions at this quadrature point. -! call t_startf('glissade_effective_viscosity') - call compute_effective_viscosity(whichefvs, whichapprox, & - efvs_constant, effstrain_min, & - nNodesPerElement_3d, & - dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & - u(:), v(:), & - flwafact(k,i,j), efvs_qp(p), & - itest, jtest, rtest, & - i, j, k, p ) -! call t_stopf('glissade_effective_viscosity') + call get_basis_function_derivatives_2d(x(:), y(:), & + dphi_dxr_2d(:,p), dphi_dyr_2d(:,p), & + dphi_dx_2d(:), dphi_dy_2d(:), & + detJ(p), & + itest, jtest, rtest, & + i, j, p) - if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. 
p==ptest) then - write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(p) - endif + dphi_dz_2d(:) = 0.d0 - ! Increment the element stiffness matrix with the contribution from each quadrature point. + if (whichapprox == HO_APPROX_L1L2) then -! call t_startf('glissade_compute_element_matrix') - call compute_element_matrix(whichapprox, nNodesPerElement_3d, & - wqp_3d(p), detJ(p), efvs_qp(p), & - dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & - Kuu(:,:), Kuv(:,:), & - Kvu(:,:), Kvv(:,:), & - itest, jtest, rtest, & - i, j, k, p ) -! call t_stopf('glissade_compute_element_matrix') + ! Compute effective viscosity for each layer at this quadrature point + !TODO - sigma -> stagsigma for L1L2 viscosity? + call compute_effective_viscosity_L1L2(whichefvs, & + efvs_constant, effstrain_min, & + nz, sigma, & + nNodesPerElement_2d, phi_2d(:,p), & + dphi_dx_2d(:), dphi_dy_2d(:), & + u(:), v(:), & + h(:), & + dsdx(:), dsdy(:), & + flwa(:,i,j), flwafact(:,i,j), & + efvs_qp(:,p), & + itest, jtest, rtest, & + i, j, p) - enddo ! nQuadPoints_3d + ! Compute vertical average of effective viscosity + efvs_qp_vertavg(p) = 0.d0 + do k = 1, nz-1 + efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p) * (sigma(k+1) - sigma(k)) + enddo - ! Compute average of effective viscosity over quad pts - efvs(k,i,j) = 0.d0 + elseif (whichapprox == HO_APPROX_DIVA) then - do p = 1, nQuadPoints_3d - efvs(k,i,j) = efvs(k,i,j) + efvs_qp(p) + ! Copy efvs_qp from global array to local column array + efvs_qp(:,:) = efvs_qp_3d(:,:,i,j) + + ! Compute effective viscosity for each layer at this quadrature point + ! 
Note: efvs_qp_3d is intent(inout); old value is used to compute new value + call compute_effective_viscosity_diva(whichefvs, & + efvs_constant, effstrain_min, & + nz, stagsigma, & + nNodesPerElement_2d, phi_2d(:,p), & + dphi_dx_2d(:), dphi_dy_2d(:), & + u(:), v(:), & + bx(:), by(:), & + diva_slope_factor_x(i,j), & + diva_slope_factor_y(i,j), & + h(:), & + flwa(:,i,j), flwafact(:,i,j), & + efvs_qp(:,p), & + itest, jtest, rtest, & + i, j, p) + + if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then + write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(:,p) + endif + + !WHL - Copy local efvs_qp to the global array + efvs_qp_3d(:,:,i,j) = efvs_qp(:,:) + + ! Compute vertical average of effective viscosity + efvs_qp_vertavg(p) = 0.d0 + do k = 1, nz-1 + efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p)*(sigma(k+1) - sigma(k)) + enddo + + else ! SSA + + ! Compute vertically averaged effective viscosity at this quadrature point + !TODO - Why do we pass in dphi_dz_2d here and not elsewhere? + call compute_effective_viscosity(whichefvs, whichapprox, & + efvs_constant, effstrain_min, & + nNodesPerElement_2d, & + dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & + u(:), v(:), & + flwafact_2d(i,j), efvs_qp_vertavg(p), & + itest, jtest, rtest, & + i, j, 1, p) + + ! Copy vertically averaged value to all levels + efvs_qp(:,p) = efvs_qp_vertavg(p) + + endif ! whichapprox + + ! Compute ice thickness at this quadrature point + + h_qp = 0.d0 + do n = 1, nNodesPerElement_2d + h_qp = h_qp + phi_2d(n,p) * h(n) enddo - efvs(k,i,j) = efvs(k,i,j) / nQuadPoints_3d - - if (check_symmetry_element) then - call check_symmetry_element_matrix(nNodesPerElement_3d, & - Kuu, Kuv, Kvu, Kvv) - endif - ! Sum terms of element matrix K into dense assembled matrix A + ! Increment the element stiffness matrix with the contribution from each quadrature point. + ! Note: The effective viscosity is multiplied by thickness since the equation to be solved + ! 
is vertically integrated. - call element_to_global_matrix_3d(nx, ny, nz, & - i, j, k, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & - Avu, Avv) + call compute_element_matrix(whichapprox, nNodesPerElement_2d, & + wqp_2d(p), detJ(p), & + h_qp*efvs_qp_vertavg(p), & + dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & + Kuu(:,:), Kuv(:,:), & + Kvu(:,:), Kvv(:,:), & + itest, jtest, rtest, & + i, j, 1, p ) - enddo ! nz (loop over elements in this column) + enddo ! nQuadPoints_2d + + if (whichapprox == HO_APPROX_DIVA) then + + ! Compute vertical integrals needed for the 2D solve and 3D velocity reconstruction + call compute_integrals_diva(nz, sigma, & + itest, jtest, rtest, & + thck(i,j), efvs_qp(:,:), & + omega_k(:,i,j), omega(i,j), & + i, j) + + endif + + ! Compute average of effective viscosity over quad points. + ! For L1L2 and DIVA there is a different efvs in each layer. + ! For SSA, simply write the vertical average value to each layer. + + efvs(:,i,j) = 0.d0 + do p = 1, nQuadPoints_2d + do k = 1, nz-1 + efvs(k,i,j) = efvs(k,i,j) + efvs_qp(k,p) + enddo + enddo + efvs(:,i,j) = efvs(:,i,j) / nQuadPoints_2d + + if (check_symmetry_element) then + call check_symmetry_element_matrix(nNodesPerElement_2d, & + Kuu, Kuv, Kvu, Kvv) + endif + + ! Sum the terms of element matrix K into the dense assembled matrix A + + call element_to_global_matrix_2d(nx, ny, & + i, j, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & + Avu, Avv) if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest) then write(iulog,*) ' ' - write(iulog,*) 'Assembled 3D matrix, i, j =', i, j + write(iulog,*) 'Assembled 2D matrix, i, j =', i, j write(iulog,*) 'k, flwafact, efvs:' do k = 1, nz-1 write(iulog,*) k, flwafact(k,i,j), efvs(k,i,j) @@ -5159,42 +5310,30 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & enddo ! i enddo ! 
j - end subroutine assemble_stiffness_matrix_3d + end subroutine assemble_stiffness_matrix_2d !**************************************************************************** - subroutine assemble_stiffness_matrix_2d(nx, ny, & - nz, & - sigma, stagsigma, & + subroutine assemble_stiffness_matrix_3d(nx, ny, & + nz, sigma, & nhalo, & itest, jtest, rtest, & active_cell, & xVertex, yVertex, & - uvel_2d, vvel_2d, & + uvel, vvel, & stagusrf, stagthck, & - flwa, flwafact, & - whichapprox, & - diva_slope_factor_x, diva_slope_factor_y, & + flwafact, whichapprox, & whichefvs, efvs, & efvs_constant, effstrain_min, & Auu, Auv, & - Avu, Avv, & - dusrf_dx, dusrf_dy, & - thck, & - btractx, btracty, & - omega_k, omega, & - efvs_qp_3d) - + Avu, Avv) + !---------------------------------------------------------------- ! Assemble the stiffness matrix A in the linear system Ax = b. ! This subroutine is called for each nonlinear iteration if ! we are iterating on the effective viscosity. - ! The matrix A can be based on the shallow-shelf approximation or - ! the depth-integrated L1L2 approximation (Schoof and Hindmarsh, 2010). !---------------------------------------------------------------- - use glissade_grid_operators, only: glissade_vertical_average - !---------------------------------------------------------------- ! Input-output arguments !---------------------------------------------------------------- @@ -5202,45 +5341,37 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions nz, & ! number of vertical levels at which velocity is computed - ! (used for flwafact) + ! Note: the number of elements per column is nz-1 nhalo ! number of halo layers real(dp), dimension(nz), intent(in) :: & - sigma ! sigma vertical coordinate - - real(dp), dimension(nz-1), intent(in) :: & - stagsigma ! staggered sigma vertical coordinate + sigma ! sigma vertical coordinate integer, intent(in) :: & itest, jtest, rtest ! 
coordinates of diagnostic point logical, dimension(nx,ny), intent(in) :: & - active_cell ! true if cell contains ice and borders a locally owned vertex + active_cell ! true if cell contains ice and borders a locally owned vertex real(dp), dimension(nx-1,ny-1), intent(in) :: & - xVertex, yVertex ! x and y coordinates of vertices + xVertex, yVertex ! x and y coordinates of vertices - real(dp), dimension(nx-1,ny-1), intent(in) :: & - uvel_2d, vvel_2d ! 2D velocity components (m/yr) + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + uvel, vvel ! velocity components (m/yr) real(dp), dimension(nx-1,ny-1), intent(in) :: & stagusrf, & ! upper surface elevation on staggered grid (m) stagthck ! ice thickness on staggered grid (m) - !TODO - Pass in flwa only, and compute flwafact here? real(dp), dimension(nz-1,nx,ny), intent(in) :: & - flwa, &! temperature-based flow factor A, Pa^{-n} yr^{-1} - flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), Pa yr^(1/n) + flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), ! used to compute the effective viscosity + ! units: Pa yr^(1/n) integer, intent(in) :: & - whichapprox, & ! option for Stokes approximation (BP, L1L2, SSA, SIA) + whichapprox, & ! option for Stokes approximation (BP, SSA, SIA) whichefvs ! option for effective viscosity calculation - real(dp), dimension(nx,ny), intent(in) :: & - diva_slope_factor_x, & ! correction factor for DIVA in x direction, based on theta_slope_x - diva_slope_factor_y ! correction factor for DIVA in y direction, based on theta_slope_y - real(dp), dimension(nz-1,nx,ny), intent(out) :: & efvs ! effective viscosity (Pa yr) @@ -5248,61 +5379,36 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & efvs_constant, & ! constant value of effective viscosity (Pa yr) effstrain_min ! 
minimum value of effective strain rate (yr^-1) for computing viscosity - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(out) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(out) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv - - ! The following optional arguments are used for the L1L2 approximation only - - real(dp), dimension(nx-1,ny-1), intent(in), optional :: & - dusrf_dx, & ! upper surface elevation gradient on staggered grid (m/m) - dusrf_dy ! needed for L1L2 assembly only - - ! The following optional arguments are used for DIVA only - - real(dp), dimension(nx,ny), intent(in), optional :: & - thck ! ice thickness (m) - - real(dp), dimension(nx-1,ny-1), intent(in), optional :: & - btractx, btracty ! components of basal traction (Pa) - - real(dp), dimension(nz,nx,ny), intent(out), optional :: & - omega_k ! single integral, defined by Goldberg (2011) eq. 32 - - real(dp), dimension(nx,ny), intent(out), optional :: & - omega ! double integral, defined by Goldberg (2011) eq. 35 - ! Note: omega here = Goldberg's omega/H - - real(dp), dimension(nz-1,nQuadPoints_2d,nx,ny), intent(inout), optional :: & - efvs_qp_3d ! effective viscosity (Pa yr) + Avu, Avv !--------------------------------------------------------- ! Local variables !--------------------------------------------------------- - real(dp), dimension(nQuadPoints_2d) :: & + real(dp), dimension(nQuadPoints_3d) :: & detJ ! determinant of J - real(dp), dimension(nNodesPerElement_2d) :: & - dphi_dx_2d, dphi_dy_2d, dphi_dz_2d ! derivatives of basis function, evaluated at quad pts - ! set dphi_dz = 0 for 2D problem + real(dp), dimension(nNodesPerElement_3d) :: & + dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis function, evaluated at quad pt !---------------------------------------------------------------- - ! Note: Kuu, Kuv, Kvu, and Kvv are 4x4 components of the stiffness matrix - ! for the local element. (The combined stiffness matrix is 8x8.) 
+ ! Note: Kuu, Kuv, Kvu, and Kvv are 8x8 components of the stiffness matrix + ! for the local element. (The combined stiffness matrix is 16x16.) ! - ! Once these matrices are formed, their coefficients are summed into the global - ! matrices Auu_2d, Auv_2d, Avu_2d, Avv_2d. The global matrices each have as - ! many rows as there are active vertices, but only 9 columns, corresponding to - ! the 9 vertices of the 4 elements sharing a given node. + ! Once these matrices are formed, their coefficients are summed into the assembled + ! matrices Auu, Auv, Avu, Avv. The A matrices each have as many rows as there are + ! active nodes, but only 27 columns, corresponding to the 27 vertices that belong to + ! the 8 elements sharing a given node. ! ! The native structured PCG solver works with the dense A matrices in the form ! computed here. For the SLAP solver, the terms of the A matrices are put - ! in a sparse matrix format during preprocessing. For the Trilinos solver, - ! the terms of the A matrices are passed to Trilinos one row at a time. + ! in a sparse matrix during preprocessing. For the Trilinos solver, the terms + ! of the A matrices are passed to Trilinos one row at a time. !---------------------------------------------------------------- - real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) :: & ! + real(dp), dimension(nNodesPerElement_3d, nNodesPerElement_3d) :: & ! Kuu, & ! element stiffness matrix, divided into 4 parts as shown below Kuv, & ! Kvu, & ! @@ -5313,36 +5419,25 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & ! ! Kvu may not be needed if matrix is symmetric, but is included for now - real(dp), dimension(nNodesPerElement_2d) :: & - x, y, & ! Cartesian coordinates of vertices - u, v, & ! depth-integrated mean velocity at vertices (m/yr) - h, & ! thickness at vertices (m) - s, & ! upper surface elevation at vertices (m) - bx, by, & ! basal traction at vertices (Pa) (DIVA only) - dsdx, dsdy ! 
upper surface elevation gradient at vertices (m/m) (L1L2 only) - - real(dp), dimension(nQuadPoints_2d) :: & - efvs_qp_vertavg ! vertically averaged effective viscosity at a quad pt - - real(dp) :: & - h_qp ! thickness at a quad pt + real(dp), dimension(nNodesPerElement_3d) :: & + x, y, z, & ! Cartesian coordinates of nodes + u, v, & ! u and v at nodes + s ! upper surface elevation at nodes - real(dp), dimension(nz-1,nQuadPoints_2d) :: & - efvs_qp ! effective viscosity at each layer in a cell column - ! corresponding to a quad pt + real(dp), dimension(nQuadPoints_3d) :: & + efvs_qp ! effective viscosity at a quad pt logical, parameter :: & check_symmetry_element = .true. ! if true, then check symmetry of element matrix - - real(dp), dimension(nx,ny) :: & - flwafact_2d ! vertically averaged flow factor + !Note: Can speed up assembly a bit by setting to false for production integer :: i, j, k, n, p - integer :: iVertex, jVertex + integer :: iNode, jNode, kNode if (verbose_matrix .and. main_task) then write(iulog,*) ' ' - write(iulog,*) 'In assemble_stiffness_matrix_2d' + write(iulog,*) 'In assemble_stiffness_matrix_3d' + write(iulog,*) 'itest, jtest, ktest, rtest =', itest, jtest, ktest, rtest endif ! Initialize effective viscosity @@ -5350,220 +5445,125 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & ! Initialize global stiffness matrix - Auu(:,:,:) = 0.d0 - Auv(:,:,:) = 0.d0 - Avu(:,:,:) = 0.d0 - Avv(:,:,:) = 0.d0 - - ! Compute vertical average of flow factor (SSA only) - if (whichapprox == HO_APPROX_SSA) then - call glissade_vertical_average(nx, ny, & - nz, sigma, & - flwafact, flwafact_2d) - endif + Auu(:,:,:,:) = 0.d0 + Auv(:,:,:,:) = 0.d0 + Avu(:,:,:,:) = 0.d0 + Avv(:,:,:,:) = 0.d0 ! Sum over elements in active cells ! Loop over all cells that border locally owned vertices. do j = nhalo+1, ny-nhalo+1 do i = nhalo+1, nx-nhalo+1 - - if (active_cell(i,j)) then - - ! 
Initialize element stiffness matrix - Kuu(:,:) = 0.d0 - Kuv(:,:) = 0.d0 - Kvu(:,:) = 0.d0 - Kvv(:,:) = 0.d0 - - ! Compute spatial coordinates, velocity, thickness and surface elevation for each vertex - ! Also compute surface elevation gradient (for L1L2) and basal traction (for DIVA) - do n = 1, nNodesPerElement_2d - ! Determine (i,j) for this vertex - ! The reason for the '3' is that node 3, in the NE corner of the grid cell, has index (i,j). - ! Indices for other nodes are computed relative to this vertex. - iVertex = i + ishift(3,n) - jVertex = j + jshift(3,n) - - x(n) = xVertex(iVertex,jVertex) - y(n) = yVertex(iVertex,jVertex) - u(n) = uvel_2d(iVertex,jVertex) - v(n) = vvel_2d(iVertex,jVertex) - s(n) = stagusrf(iVertex,jVertex) - h(n) = stagthck(iVertex,jVertex) - if (present(dusrf_dx) .and. present(dusrf_dy)) then ! L1L2 - dsdx(n) = dusrf_dx(iVertex,jVertex) - dsdy(n) = dusrf_dy(iVertex,jVertex) - endif - if (present(btractx) .and. present(btracty)) then ! DIVA - bx(n) = btractx(iVertex,jVertex) - by(n) = btracty(iVertex,jVertex) - endif - - if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'i, j, n, x, y:', i, j, n, x(n), y(n) - write(iulog,*) 's, h, u, v:', s(n), h(n), u(n), v(n) - if (present(btractx) .and. present(btracty)) write(iulog,*) 'bx, by:', bx(n), by(n) - endif - - enddo ! vertices per element - - ! Loop over quadrature points for this element - - do p = 1, nQuadPoints_2d - - ! Evaluate the derivatives of the element basis functions at this quadrature point. + if (active_cell(i,j)) then - call get_basis_function_derivatives_2d(x(:), y(:), & - dphi_dxr_2d(:,p), dphi_dyr_2d(:,p), & - dphi_dx_2d(:), dphi_dy_2d(:), & - detJ(p), & - itest, jtest, rtest, & - i, j, p) + do k = 1, nz-1 ! loop over elements in this column + ! assume k increases from upper surface to bed - dphi_dz_2d(:) = 0.d0 + ! 
Initialize element stiffness matrix + Kuu(:,:) = 0.d0 + Kuv(:,:) = 0.d0 + Kvu(:,:) = 0.d0 + Kvv(:,:) = 0.d0 - if (whichapprox == HO_APPROX_L1L2) then + ! compute spatial coordinates, velocity, and upper surface elevation for each node - ! Compute effective viscosity for each layer at this quadrature point - !TODO - sigma -> stagsigma for L1L2 viscosity? - call compute_effective_viscosity_L1L2(whichefvs, & - efvs_constant, effstrain_min, & - nz, sigma, & - nNodesPerElement_2d, phi_2d(:,p), & - dphi_dx_2d(:), dphi_dy_2d(:), & - u(:), v(:), & - h(:), & - dsdx(:), dsdy(:), & - flwa(:,i,j), flwafact(:,i,j), & - efvs_qp(:,p), & - itest, jtest, rtest, & - i, j, p) + do n = 1, nNodesPerElement_3d - ! Compute vertical average of effective viscosity - efvs_qp_vertavg(p) = 0.d0 - do k = 1, nz-1 - efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p) * (sigma(k+1) - sigma(k)) - enddo + ! Determine (k,i,j) for this node + ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). + ! Indices for other nodes are computed relative to this node. + iNode = i + ishift(7,n) + jNode = j + jshift(7,n) + kNode = k + kshift(7,n) - elseif (whichapprox == HO_APPROX_DIVA) then + x(n) = xVertex(iNode,jNode) + y(n) = yVertex(iNode,jNode) + z(n) = stagusrf(iNode,jNode) - sigma(kNode)*stagthck(iNode,jNode) + u(n) = uvel(kNode,iNode,jNode) + v(n) = vvel(kNode,iNode,jNode) + s(n) = stagusrf(iNode,jNode) - ! Copy efvs_qp from global array to local column array - efvs_qp(:,:) = efvs_qp_3d(:,:,i,j) + if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then + write(iulog,*) ' ' + write(iulog,*) 'i, j, k, n, x, y, z:', i, j, k, n, x(n), y(n), z(n) + write(iulog,*) 's, u, v:', s(n), u(n), v(n) + endif - ! Compute effective viscosity for each layer at this quadrature point - ! 
Note: efvs_qp_3d is intent(inout); old value is used to compute new value - call compute_effective_viscosity_diva(whichefvs, & - efvs_constant, effstrain_min, & - nz, stagsigma, & - nNodesPerElement_2d, phi_2d(:,p), & - dphi_dx_2d(:), dphi_dy_2d(:), & - u(:), v(:), & - bx(:), by(:), & - diva_slope_factor_x(i,j), & - diva_slope_factor_y(i,j), & - h(:), & - flwa(:,i,j), flwafact(:,i,j), & - efvs_qp(:,p), & - itest, jtest, rtest, & - i, j, p) + enddo ! nodes per element - if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then - write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(:,p) - endif + ! Loop over quadrature points for this element - !WHL - Copy local efvs_qp to the global array - efvs_qp_3d(:,:,i,j) = efvs_qp(:,:) + do p = 1, nQuadPoints_3d - ! Compute vertical average of effective viscosity - efvs_qp_vertavg(p) = 0.d0 - do k = 1, nz-1 - efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p)*(sigma(k+1) - sigma(k)) - enddo + ! Evaluate the derivatives of the element basis functions at this quadrature point. + !WHL - Pass in i, j, k, and p to the following subroutines for debugging. - else ! SSA + call get_basis_function_derivatives_3d(x(:), y(:), z(:), & + dphi_dxr_3d(:,p), dphi_dyr_3d(:,p), dphi_dzr_3d(:,p), & + dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & + detJ(p), & + itest, jtest, rtest, & + i, j, k, p) - ! Compute vertically averaged effective viscosity at this quadrature point - !TODO - Why do we pass in dphi_dz_2d here and not elsewhere? +! call t_startf('glissade_effective_viscosity') call compute_effective_viscosity(whichefvs, whichapprox, & efvs_constant, effstrain_min, & - nNodesPerElement_2d, & - dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & + nNodesPerElement_3d, & + dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & u(:), v(:), & - flwafact_2d(i,j), efvs_qp_vertavg(p), & + flwafact(k,i,j), efvs_qp(p), & itest, jtest, rtest, & - i, j, 1, p) - - ! 
Copy vertically averaged value to all levels - efvs_qp(:,p) = efvs_qp_vertavg(p) - - endif ! whichapprox - - ! Compute ice thickness at this quadrature point - - h_qp = 0.d0 - do n = 1, nNodesPerElement_2d - h_qp = h_qp + phi_2d(n,p) * h(n) - enddo - - ! Increment the element stiffness matrix with the contribution from each quadrature point. - ! Note: The effective viscosity is multiplied by thickness since the equation to be solved - ! is vertically integrated. - - call compute_element_matrix(whichapprox, nNodesPerElement_2d, & - wqp_2d(p), detJ(p), & - h_qp*efvs_qp_vertavg(p), & - dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & - Kuu(:,:), Kuv(:,:), & - Kvu(:,:), Kvv(:,:), & - itest, jtest, rtest, & - i, j, 1, p ) + i, j, k, p ) +! call t_stopf('glissade_effective_viscosity') - enddo ! nQuadPoints_2d + if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then + write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(p) + endif - if (whichapprox == HO_APPROX_DIVA) then + ! Increment the element stiffness matrix with the contribution from each quadrature point. - ! Compute vertical integrals needed for the 2D solve and 3D velocity reconstruction - call compute_integrals_diva(nz, sigma, & - itest, jtest, rtest, & - thck(i,j), efvs_qp(:,:), & - omega_k(:,i,j), omega(i,j), & - i, j) +! call t_startf('glissade_compute_element_matrix') + call compute_element_matrix(whichapprox, nNodesPerElement_3d, & + wqp_3d(p), detJ(p), efvs_qp(p), & + dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & + Kuu(:,:), Kuv(:,:), & + Kvu(:,:), Kvv(:,:), & + itest, jtest, rtest, & + i, j, k, p ) +! call t_stopf('glissade_compute_element_matrix') - endif + enddo ! nQuadPoints_3d - ! Compute average of effective viscosity over quad points. - ! For L1L2 and DIVA there is a different efvs in each layer. - ! For SSA, simply write the vertical average value to each layer. + ! 
Compute average of effective viscosity over quad pts + efvs(k,i,j) = 0.d0 - efvs(:,i,j) = 0.d0 - do p = 1, nQuadPoints_2d - do k = 1, nz-1 - efvs(k,i,j) = efvs(k,i,j) + efvs_qp(k,p) + do p = 1, nQuadPoints_3d + efvs(k,i,j) = efvs(k,i,j) + efvs_qp(p) enddo - enddo - efvs(:,i,j) = efvs(:,i,j) / nQuadPoints_2d + efvs(k,i,j) = efvs(k,i,j) / nQuadPoints_3d + + if (check_symmetry_element) then + call check_symmetry_element_matrix(nNodesPerElement_3d, & + Kuu, Kuv, Kvu, Kvv) + endif - if (check_symmetry_element) then - call check_symmetry_element_matrix(nNodesPerElement_2d, & - Kuu, Kuv, Kvu, Kvv) - endif + ! Sum terms of element matrix K into dense assembled matrix A - ! Sum the terms of element matrix K into the dense assembled matrix A + call element_to_global_matrix_3d(nx, ny, nz, & + i, j, k, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & + Avu, Avv) - call element_to_global_matrix_2d(nx, ny, & - i, j, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & - Avu, Avv) + enddo ! nz (loop over elements in this column) if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest) then write(iulog,*) ' ' - write(iulog,*) 'Assembled 2D matrix, i, j =', i, j + write(iulog,*) 'Assembled 3D matrix, i, j =', i, j write(iulog,*) 'k, flwafact, efvs:' do k = 1, nz-1 write(iulog,*) k, flwafact(k,i,j), efvs(k,i,j) @@ -5575,7 +5575,7 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & enddo ! i enddo ! 
j - end subroutine assemble_stiffness_matrix_2d + end subroutine assemble_stiffness_matrix_3d !**************************************************************************** @@ -6274,127 +6274,104 @@ end subroutine compute_3d_velocity_L1L2 !**************************************************************************** - subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, & - dphi_dxr_3d, dphi_dyr_3d, dphi_dzr_3d, & - dphi_dx_3d, dphi_dy_3d, dphi_dz_3d, & - detJ, & - itest, jtest, rtest, & - i, j, k, p) + subroutine get_basis_function_derivatives_2d(xNode, yNode, & + dphi_dxr_2d, dphi_dyr_2d, & + dphi_dx_2d, dphi_dy_2d, & + detJ, & + itest, jtest, rtest, & + i, j, p) !------------------------------------------------------------------ - ! Evaluate the x, y and z derivatives of the element basis functions + ! Evaluate the x and y derivatives of 2D element basis functions ! at a particular quadrature point. ! ! Also determine the Jacobian of the transformation between the ! reference element and the true element. ! - ! This subroutine should work for any 3D element with any number of nodes. + ! This subroutine should work for any 2D element with any number of nodes. !------------------------------------------------------------------ - - real(dp), dimension(nNodesPerElement_3d), intent(in) :: & - xNode, yNode, zNode, &! nodal coordinates - dphi_dxr_3d, dphi_dyr_3d, dphi_dzr_3d ! derivatives of basis functions at quad pt - ! wrt x, y and z in reference element - real(dp), dimension(nNodesPerElement_3d), intent(out) :: & - dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis functions at quad pt - ! wrt x, y and z in true Cartesian coordinates + real(dp), dimension(nNodesPerElement_2d), intent(in) :: & + xNode, yNode, &! nodal coordinates + dphi_dxr_2d, dphi_dyr_2d ! derivatives of basis functions at quad pt + ! wrt x and y in reference element + + real(dp), dimension(nNodesPerElement_2d), intent(out) :: & + dphi_dx_2d, dphi_dy_2d ! 
derivatives of basis functions at quad pt + ! wrt x and y in true Cartesian coordinates real(dp), intent(out) :: & - detJ ! determinant of Jacobian matrix + detJ ! determinant of Jacobian matrix - real(dp), dimension(3,3) :: & - Jac, &! Jacobian matrix - Jinv, &! inverse Jacobian matrix - cofactor ! matrix of cofactors + real(dp), dimension(2,2) :: & + Jac, &! Jacobian matrix + Jinv ! inverse Jacobian matrix integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point - integer, intent(in) :: i, j, k, p ! indices passed in for debugging + integer, intent(in) :: i, j, p integer :: n, row, col logical, parameter :: Jac_bug_check = .false. ! set to true for debugging - real(dp), dimension(3,3) :: prod ! Jac * Jinv (should be identity matrix) + real(dp), dimension(2,2) :: prod ! Jac * Jinv (should be identity matrix) !------------------------------------------------------------------ ! Compute the Jacobian for the transformation from the reference ! coordinates to the true coordinates: ! - ! | | - ! | sum_n{dphi_n/dxr * xn} sum_n{dphi_n/dxr * yn} sum_n{dphi_n/dxr * zn} | - ! J(xr,yr,zr) = | | - ! | sum_n{dphi_n/dyr * xn} sum_n{dphi_n/dyr * yn} sum_n{dphi_n/dyr * zn} | - ! | | - ! | sum_n{dphi_n/dzr * xn} sum_n{dphi_n/dzr * yn} sum_n{dphi_n/dzr * zn} | - ! ! | + ! | | + ! | sum_n{dphi_n/dxr * xn} sum_n{dphi_n/dxr * yn} | + ! J(xr,yr) = | | + ! | sum_n{dphi_n/dyr * xn} sum_n{dphi_n/dyr * yn} | + ! | | ! - ! where (xn,yn,zn) are the true Cartesian nodal coordinates, - ! (xr,yr,zr) are the coordinates of the quad point in the reference element, + ! where (xn,yn) are the true Cartesian nodal coordinates, + ! (xr,yr) are the coordinates of the quad point in the reference element, ! and sum_n denotes a sum over nodes. !------------------------------------------------------------------ - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then + Jac(:,:) = 0.d0 + + if ((verbose_Jac .or. verbose_diva) .and. 
this_rank==rtest .and. i==itest .and. j==jtest) then write(iulog,*) ' ' - write(iulog,*) 'In get_basis_function_derivatives_3d: i, j, k, p =', i, j, k, p + write(iulog,*) 'In get_basis_function_derivatives_2d: i, j, p =', i, j, p endif - Jac(:,:) = 0.d0 - - do n = 1, nNodesPerElement_3d - Jac(1,1) = Jac(1,1) + dphi_dxr_3d(n) * xNode(n) - Jac(1,2) = Jac(1,2) + dphi_dxr_3d(n) * yNode(n) - Jac(1,3) = Jac(1,3) + dphi_dxr_3d(n) * zNode(n) - Jac(2,1) = Jac(2,1) + dphi_dyr_3d(n) * xNode(n) - Jac(2,2) = Jac(2,2) + dphi_dyr_3d(n) * yNode(n) - Jac(2,3) = Jac(2,3) + dphi_dyr_3d(n) * zNode(n) - Jac(3,1) = Jac(3,1) + dphi_dzr_3d(n) * xNode(n) - Jac(3,2) = Jac(3,2) + dphi_dzr_3d(n) * yNode(n) - Jac(3,3) = Jac(3,3) + dphi_dzr_3d(n) * zNode(n) + do n = 1, nNodesPerElement_2d + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'n, x, y:', n, xNode(n), yNode(n) + write(iulog,*) 'dphi_dxr_2d, dphi_dyr_2d:', dphi_dxr_2d(n), dphi_dyr_2d(n) + endif + Jac(1,1) = Jac(1,1) + dphi_dxr_2d(n) * xNode(n) + Jac(1,2) = Jac(1,2) + dphi_dxr_2d(n) * yNode(n) + Jac(2,1) = Jac(2,1) + dphi_dyr_2d(n) * xNode(n) + Jac(2,2) = Jac(2,2) + dphi_dyr_2d(n) * yNode(n) enddo !------------------------------------------------------------------ ! 
Compute the determinant and inverse of J !------------------------------------------------------------------ - cofactor(1,1) = Jac(2,2)*Jac(3,3) - Jac(2,3)*Jac(3,2) - cofactor(1,2) = -(Jac(2,1)*Jac(3,3) - Jac(2,3)*Jac(3,1)) - cofactor(1,3) = Jac(2,1)*Jac(3,2) - Jac(2,2)*Jac(3,1) - cofactor(2,1) = -(Jac(1,2)*Jac(3,3) - Jac(1,3)*Jac(3,2)) - cofactor(2,2) = Jac(1,1)*Jac(3,3) - Jac(1,3)*Jac(3,1) - cofactor(2,3) = -(Jac(1,1)*Jac(3,2) - Jac(1,2)*Jac(3,1)) - cofactor(3,1) = Jac(1,2)*Jac(2,3) - Jac(1,3)*Jac(2,2) - cofactor(3,2) = -(Jac(1,1)*Jac(2,3) - Jac(1,3)*Jac(2,1)) - cofactor(3,3) = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1) - - detJ = Jac(1,1)*cofactor(1,1) + Jac(1,2)*cofactor(1,2) + Jac(1,3)*cofactor(1,3) - - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then - write(iulog,*) ' ' - write(iulog,*) 'detJ1:', Jac(1,1)*cofactor(1,1) + Jac(1,2)*cofactor(1,2) + Jac(1,3)*cofactor(1,3) - write(iulog,*) 'detJ2:', Jac(2,1)*cofactor(2,1) + Jac(2,2)*cofactor(2,2) + Jac(2,3)*cofactor(2,3) - write(iulog,*) 'detJ3:', Jac(3,1)*cofactor(3,1) + Jac(3,2)*cofactor(3,2) + Jac(3,3)*cofactor(3,3) - endif + detJ = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1) if (abs(detJ) > 0.d0) then - do col = 1, 3 - do row = 1, 3 - Jinv(row,col) = cofactor(col,row) - enddo - enddo - Jinv(:,:) = Jinv(:,:) / detJ + Jinv(1,1) = Jac(2,2)/detJ + Jinv(1,2) = -Jac(1,2)/detJ + Jinv(2,1) = -Jac(2,1)/detJ + Jinv(2,2) = Jac(1,1)/detJ else write(iulog,*) 'stopping, det J = 0' - write(iulog,*) 'i, j, k, p:', i, j, k, p + write(iulog,*) 'i, j, p:', i, j, p write(iulog,*) 'Jacobian matrix:' write(iulog,*) Jac(1,:) write(iulog,*) Jac(2,:) - write(iulog,*) Jac(3,:) call write_log('Jacobian matrix is singular', GM_FATAL) endif - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then write(iulog,*) ' ' write(iulog,*) 'Jacobian calc, p =', p write(iulog,*) 'det J =', detJ @@ -6402,204 +6379,205 @@ subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, write(iulog,*) 'Jacobian matrix:' write(iulog,*) Jac(1,:) write(iulog,*) Jac(2,:) - write(iulog,*) Jac(3,:) - write(iulog,*) ' ' - write(iulog,*) 'cofactor matrix:' - write(iulog,*) cofactor(1,:) - write(iulog,*) cofactor(2,:) - write(iulog,*) cofactor(3,:) write(iulog,*) ' ' write(iulog,*) 'Inverse matrix:' write(iulog,*) Jinv(1,:) write(iulog,*) Jinv(2,:) - write(iulog,*) Jinv(3,:) write(iulog,*) ' ' prod = matmul(Jac, Jinv) write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) - write(iulog,*) prod(3,:) endif - ! Optional bug check: Verify that J * Jinv = I + ! Optional bug check - Verify that J * Jinv = I if (Jac_bug_check) then prod = matmul(Jac,Jinv) - do col = 1, 3 - do row = 1, 3 - if (abs(prod(row,col) - identity3(row,col)) > eps10) then - write(iulog,*) '3d Jacobian, stopping, Jac * Jinv /= identity' - write(iulog,*) 'rank, i, j, k, p:', this_rank, i, j, k, p + do col = 1, 2 + do row = 1, 2 + if (abs(prod(row,col) - identity3(row,col)) > 1.d-12) then + write(iulog,*) '2d Jacobian, stopping, Jac * Jinv /= identity' + write(iulog,*) 'rank, i, j, p:', this_rank, i, j, p write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) - write(iulog,*) prod(3,:) call write_log('Jacobian matrix was not correctly inverted', GM_FATAL) endif enddo enddo - endif ! Jac_bug_check + endif !------------------------------------------------------------------ ! Compute the contribution of this quadrature point to dphi/dx and dphi/dy ! for each basis function. ! ! | dphi_n/dx | | dphi_n/dxr | - ! | | | | - ! | dphi_n/dy | = Jinv * | dphi_n/dyr | - ! | | | | - ! | dphi_n/dz | | dphi_n/dzr | + ! | | = Jinv * | | + ! | dphi_n/dy | | dphi_n/dyr | ! 
!------------------------------------------------------------------ - dphi_dx_3d(:) = 0.d0 - dphi_dy_3d(:) = 0.d0 - dphi_dz_3d(:) = 0.d0 + dphi_dx_2d(:) = 0.d0 + dphi_dy_2d(:) = 0.d0 - do n = 1, nNodesPerElement_3d - dphi_dx_3d(n) = Jinv(1,1)*dphi_dxr_3d(n) & - + Jinv(1,2)*dphi_dyr_3d(n) & - + Jinv(1,3)*dphi_dzr_3d(n) - dphi_dy_3d(n) = Jinv(2,1)*dphi_dxr_3d(n) & - + Jinv(2,2)*dphi_dyr_3d(n) & - + Jinv(2,3)*dphi_dzr_3d(n) - dphi_dz_3d(n) = Jinv(3,1)*dphi_dxr_3d(n) & - + Jinv(3,2)*dphi_dyr_3d(n) & - + Jinv(3,3)*dphi_dzr_3d(n) + do n = 1, nNodesPerElement_2d + dphi_dx_2d(n) = dphi_dx_2d(n) + Jinv(1,1)*dphi_dxr_2d(n) & + + Jinv(1,2)*dphi_dyr_2d(n) + dphi_dy_2d(n) = dphi_dy_2d(n) + Jinv(2,1)*dphi_dxr_2d(n) & + + Jinv(2,2)*dphi_dyr_2d(n) enddo + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'dphi_dx_2d:', dphi_dx_2d(:) + write(iulog,*) 'dphi_dy_2d:', dphi_dy_2d(:) + endif + if (Jac_bug_check) then ! Check that the sum of dphi_dx, etc. 
is close to zero - - if (abs( sum(dphi_dx_3d)/maxval(dphi_dx_3d) ) > 1.d-11) then + if (abs( sum(dphi_dx_2d)/maxval(dphi_dx_2d) ) > 1.d-11) then write(iulog,*) 'stopping, sum over basis functions of dphi_dx > 0' - write(iulog,*) 'dphi_dx_3d =', dphi_dx_3d(:) - write(iulog,*) 'sum =', sum(dphi_dx_3d) - write(iulog,*) 'i, j, k, p =', i, j, k, p + write(iulog,*) 'dphi_dx_2d =', dphi_dx_2d(:) + write(iulog,*) 'i, j, p =', i, j, p call write_log('Sum over basis functions of dphi_dx /= 0', GM_FATAL) endif - if (abs( sum(dphi_dy_3d)/maxval(dphi_dy_3d) ) > 1.d-11) then + if (abs( sum(dphi_dy_2d)/maxval(dphi_dy_2d) ) > 1.d-11) then write(iulog,*) 'stopping, sum over basis functions of dphi_dy > 0' - write(iulog,*) 'dphi_dy_3d =', dphi_dy_3d(:) - write(iulog,*) 'sum =', sum(dphi_dy_3d) - write(iulog,*) 'i, j, k, p =', i, j, k, p + write(iulog,*) 'dphi_dy =', dphi_dy_2d(:) + write(iulog,*) 'i, j, p =', i, j, p call write_log('Sum over basis functions of dphi_dy /= 0', GM_FATAL) endif - if (abs( sum(dphi_dz_3d)/maxval(dphi_dz_3d) ) > 1.d-11) then - write(iulog,*) 'stopping, sum over basis functions of dphi_dz > 0' - write(iulog,*) 'dphi_dz_3d =', dphi_dz_3d(:) - write(iulog,*) 'sum =', sum(dphi_dz_3d) - write(iulog,*) 'i, j, k, p =', i, j, k, p - call write_log('Sum over basis functions of dphi_dz /= 0', GM_FATAL) - endif - - endif ! Jac_bug_check - - end subroutine get_basis_function_derivatives_3d + endif + end subroutine get_basis_function_derivatives_2d + !**************************************************************************** - subroutine get_basis_function_derivatives_2d(xNode, yNode, & - dphi_dxr_2d, dphi_dyr_2d, & - dphi_dx_2d, dphi_dy_2d, & - detJ, & - itest, jtest, rtest, & - i, j, p) + subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, & + dphi_dxr_3d, dphi_dyr_3d, dphi_dzr_3d, & + dphi_dx_3d, dphi_dy_3d, dphi_dz_3d, & + detJ, & + itest, jtest, rtest, & + i, j, k, p) !------------------------------------------------------------------ - ! 
Evaluate the x and y derivatives of 2D element basis functions + ! Evaluate the x, y and z derivatives of the element basis functions ! at a particular quadrature point. ! ! Also determine the Jacobian of the transformation between the ! reference element and the true element. ! - ! This subroutine should work for any 2D element with any number of nodes. + ! This subroutine should work for any 3D element with any number of nodes. !------------------------------------------------------------------ + + real(dp), dimension(nNodesPerElement_3d), intent(in) :: & + xNode, yNode, zNode, &! nodal coordinates + dphi_dxr_3d, dphi_dyr_3d, dphi_dzr_3d ! derivatives of basis functions at quad pt + ! wrt x, y and z in reference element - real(dp), dimension(nNodesPerElement_2d), intent(in) :: & - xNode, yNode, &! nodal coordinates - dphi_dxr_2d, dphi_dyr_2d ! derivatives of basis functions at quad pt - ! wrt x and y in reference element - - real(dp), dimension(nNodesPerElement_2d), intent(out) :: & - dphi_dx_2d, dphi_dy_2d ! derivatives of basis functions at quad pt - ! wrt x and y in true Cartesian coordinates + real(dp), dimension(nNodesPerElement_3d), intent(out) :: & + dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis functions at quad pt + ! wrt x, y and z in true Cartesian coordinates real(dp), intent(out) :: & - detJ ! determinant of Jacobian matrix + detJ ! determinant of Jacobian matrix - real(dp), dimension(2,2) :: & - Jac, &! Jacobian matrix - Jinv ! inverse Jacobian matrix + real(dp), dimension(3,3) :: & + Jac, &! Jacobian matrix + Jinv, &! inverse Jacobian matrix + cofactor ! matrix of cofactors integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point - integer, intent(in) :: i, j, p + integer, intent(in) :: i, j, k, p ! indices passed in for debugging integer :: n, row, col logical, parameter :: Jac_bug_check = .false. ! set to true for debugging - real(dp), dimension(2,2) :: prod ! 
Jac * Jinv (should be identity matrix) + real(dp), dimension(3,3) :: prod ! Jac * Jinv (should be identity matrix) !------------------------------------------------------------------ ! Compute the Jacobian for the transformation from the reference ! coordinates to the true coordinates: ! - ! | | - ! | sum_n{dphi_n/dxr * xn} sum_n{dphi_n/dxr * yn} | - ! J(xr,yr) = | | - ! | sum_n{dphi_n/dyr * xn} sum_n{dphi_n/dyr * yn} | - ! | | + ! | | + ! | sum_n{dphi_n/dxr * xn} sum_n{dphi_n/dxr * yn} sum_n{dphi_n/dxr * zn} | + ! J(xr,yr,zr) = | | + ! | sum_n{dphi_n/dyr * xn} sum_n{dphi_n/dyr * yn} sum_n{dphi_n/dyr * zn} | + ! | | + ! | sum_n{dphi_n/dzr * xn} sum_n{dphi_n/dzr * yn} sum_n{dphi_n/dzr * zn} | + ! ! | ! - ! where (xn,yn) are the true Cartesian nodal coordinates, - ! (xr,yr) are the coordinates of the quad point in the reference element, + ! where (xn,yn,zn) are the true Cartesian nodal coordinates, + ! (xr,yr,zr) are the coordinates of the quad point in the reference element, ! and sum_n denotes a sum over nodes. !------------------------------------------------------------------ - Jac(:,:) = 0.d0 - - if ((verbose_Jac .or. verbose_diva) .and. this_rank==rtest .and. i==itest .and. j==jtest) then + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then write(iulog,*) ' ' - write(iulog,*) 'In get_basis_function_derivatives_2d: i, j, p =', i, j, p + write(iulog,*) 'In get_basis_function_derivatives_3d: i, j, k, p =', i, j, k, p endif - do n = 1, nNodesPerElement_2d - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'n, x, y:', n, xNode(n), yNode(n) - write(iulog,*) 'dphi_dxr_2d, dphi_dyr_2d:', dphi_dxr_2d(n), dphi_dyr_2d(n) - endif - Jac(1,1) = Jac(1,1) + dphi_dxr_2d(n) * xNode(n) - Jac(1,2) = Jac(1,2) + dphi_dxr_2d(n) * yNode(n) - Jac(2,1) = Jac(2,1) + dphi_dyr_2d(n) * xNode(n) - Jac(2,2) = Jac(2,2) + dphi_dyr_2d(n) * yNode(n) + Jac(:,:) = 0.d0 + + do n = 1, nNodesPerElement_3d + Jac(1,1) = Jac(1,1) + dphi_dxr_3d(n) * xNode(n) + Jac(1,2) = Jac(1,2) + dphi_dxr_3d(n) * yNode(n) + Jac(1,3) = Jac(1,3) + dphi_dxr_3d(n) * zNode(n) + Jac(2,1) = Jac(2,1) + dphi_dyr_3d(n) * xNode(n) + Jac(2,2) = Jac(2,2) + dphi_dyr_3d(n) * yNode(n) + Jac(2,3) = Jac(2,3) + dphi_dyr_3d(n) * zNode(n) + Jac(3,1) = Jac(3,1) + dphi_dzr_3d(n) * xNode(n) + Jac(3,2) = Jac(3,2) + dphi_dzr_3d(n) * yNode(n) + Jac(3,3) = Jac(3,3) + dphi_dzr_3d(n) * zNode(n) enddo !------------------------------------------------------------------ ! Compute the determinant and inverse of J !------------------------------------------------------------------ - detJ = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1) + cofactor(1,1) = Jac(2,2)*Jac(3,3) - Jac(2,3)*Jac(3,2) + cofactor(1,2) = -(Jac(2,1)*Jac(3,3) - Jac(2,3)*Jac(3,1)) + cofactor(1,3) = Jac(2,1)*Jac(3,2) - Jac(2,2)*Jac(3,1) + cofactor(2,1) = -(Jac(1,2)*Jac(3,3) - Jac(1,3)*Jac(3,2)) + cofactor(2,2) = Jac(1,1)*Jac(3,3) - Jac(1,3)*Jac(3,1) + cofactor(2,3) = -(Jac(1,1)*Jac(3,2) - Jac(1,2)*Jac(3,1)) + cofactor(3,1) = Jac(1,2)*Jac(2,3) - Jac(1,3)*Jac(2,2) + cofactor(3,2) = -(Jac(1,1)*Jac(2,3) - Jac(1,3)*Jac(2,1)) + cofactor(3,3) = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1) + + detJ = Jac(1,1)*cofactor(1,1) + Jac(1,2)*cofactor(1,2) + Jac(1,3)*cofactor(1,3) + + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. 
k==ktest) then + write(iulog,*) ' ' + write(iulog,*) 'detJ1:', Jac(1,1)*cofactor(1,1) + Jac(1,2)*cofactor(1,2) + Jac(1,3)*cofactor(1,3) + write(iulog,*) 'detJ2:', Jac(2,1)*cofactor(2,1) + Jac(2,2)*cofactor(2,2) + Jac(2,3)*cofactor(2,3) + write(iulog,*) 'detJ3:', Jac(3,1)*cofactor(3,1) + Jac(3,2)*cofactor(3,2) + Jac(3,3)*cofactor(3,3) + endif if (abs(detJ) > 0.d0) then - Jinv(1,1) = Jac(2,2)/detJ - Jinv(1,2) = -Jac(1,2)/detJ - Jinv(2,1) = -Jac(2,1)/detJ - Jinv(2,2) = Jac(1,1)/detJ + do col = 1, 3 + do row = 1, 3 + Jinv(row,col) = cofactor(col,row) + enddo + enddo + Jinv(:,:) = Jinv(:,:) / detJ else write(iulog,*) 'stopping, det J = 0' - write(iulog,*) 'i, j, p:', i, j, p + write(iulog,*) 'i, j, k, p:', i, j, k, p write(iulog,*) 'Jacobian matrix:' write(iulog,*) Jac(1,:) write(iulog,*) Jac(2,:) + write(iulog,*) Jac(3,:) call write_log('Jacobian matrix is singular', GM_FATAL) endif - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then write(iulog,*) ' ' write(iulog,*) 'Jacobian calc, p =', p write(iulog,*) 'det J =', detJ @@ -6607,81 +6585,103 @@ subroutine get_basis_function_derivatives_2d(xNode, yNode, & write(iulog,*) 'Jacobian matrix:' write(iulog,*) Jac(1,:) write(iulog,*) Jac(2,:) + write(iulog,*) Jac(3,:) + write(iulog,*) ' ' + write(iulog,*) 'cofactor matrix:' + write(iulog,*) cofactor(1,:) + write(iulog,*) cofactor(2,:) + write(iulog,*) cofactor(3,:) write(iulog,*) ' ' write(iulog,*) 'Inverse matrix:' write(iulog,*) Jinv(1,:) write(iulog,*) Jinv(2,:) + write(iulog,*) Jinv(3,:) write(iulog,*) ' ' prod = matmul(Jac, Jinv) write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) + write(iulog,*) prod(3,:) endif - ! Optional bug check - Verify that J * Jinv = I + ! 
Optional bug check: Verify that J * Jinv = I if (Jac_bug_check) then prod = matmul(Jac,Jinv) - do col = 1, 2 - do row = 1, 2 - if (abs(prod(row,col) - identity3(row,col)) > 1.d-12) then - write(iulog,*) '2d Jacobian, stopping, Jac * Jinv /= identity' - write(iulog,*) 'rank, i, j, p:', this_rank, i, j, p + do col = 1, 3 + do row = 1, 3 + if (abs(prod(row,col) - identity3(row,col)) > eps10) then + write(iulog,*) '3d Jacobian, stopping, Jac * Jinv /= identity' + write(iulog,*) 'rank, i, j, k, p:', this_rank, i, j, k, p write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) + write(iulog,*) prod(3,:) call write_log('Jacobian matrix was not correctly inverted', GM_FATAL) endif enddo enddo - endif + endif ! Jac_bug_check !------------------------------------------------------------------ ! Compute the contribution of this quadrature point to dphi/dx and dphi/dy ! for each basis function. ! ! | dphi_n/dx | | dphi_n/dxr | - ! | | = Jinv * | | - ! | dphi_n/dy | | dphi_n/dyr | + ! | | | | + ! | dphi_n/dy | = Jinv * | dphi_n/dyr | + ! | | | | + ! | dphi_n/dz | | dphi_n/dzr | ! !------------------------------------------------------------------ - dphi_dx_2d(:) = 0.d0 - dphi_dy_2d(:) = 0.d0 + dphi_dx_3d(:) = 0.d0 + dphi_dy_3d(:) = 0.d0 + dphi_dz_3d(:) = 0.d0 - do n = 1, nNodesPerElement_2d - dphi_dx_2d(n) = dphi_dx_2d(n) + Jinv(1,1)*dphi_dxr_2d(n) & - + Jinv(1,2)*dphi_dyr_2d(n) - dphi_dy_2d(n) = dphi_dy_2d(n) + Jinv(2,1)*dphi_dxr_2d(n) & - + Jinv(2,2)*dphi_dyr_2d(n) + do n = 1, nNodesPerElement_3d + dphi_dx_3d(n) = Jinv(1,1)*dphi_dxr_3d(n) & + + Jinv(1,2)*dphi_dyr_3d(n) & + + Jinv(1,3)*dphi_dzr_3d(n) + dphi_dy_3d(n) = Jinv(2,1)*dphi_dxr_3d(n) & + + Jinv(2,2)*dphi_dyr_3d(n) & + + Jinv(2,3)*dphi_dzr_3d(n) + dphi_dz_3d(n) = Jinv(3,1)*dphi_dxr_3d(n) & + + Jinv(3,2)*dphi_dyr_3d(n) & + + Jinv(3,3)*dphi_dzr_3d(n) enddo - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'dphi_dx_2d:', dphi_dx_2d(:) - write(iulog,*) 'dphi_dy_2d:', dphi_dy_2d(:) - endif - if (Jac_bug_check) then ! Check that the sum of dphi_dx, etc. is close to zero - if (abs( sum(dphi_dx_2d)/maxval(dphi_dx_2d) ) > 1.d-11) then + + if (abs( sum(dphi_dx_3d)/maxval(dphi_dx_3d) ) > 1.d-11) then write(iulog,*) 'stopping, sum over basis functions of dphi_dx > 0' - write(iulog,*) 'dphi_dx_2d =', dphi_dx_2d(:) - write(iulog,*) 'i, j, p =', i, j, p + write(iulog,*) 'dphi_dx_3d =', dphi_dx_3d(:) + write(iulog,*) 'sum =', sum(dphi_dx_3d) + write(iulog,*) 'i, j, k, p =', i, j, k, p call write_log('Sum over basis functions of dphi_dx /= 0', GM_FATAL) endif - if (abs( sum(dphi_dy_2d)/maxval(dphi_dy_2d) ) > 1.d-11) then + if (abs( sum(dphi_dy_3d)/maxval(dphi_dy_3d) ) > 1.d-11) then write(iulog,*) 'stopping, sum over basis functions of dphi_dy > 0' - write(iulog,*) 'dphi_dy =', dphi_dy_2d(:) - write(iulog,*) 'i, j, p =', i, j, p + write(iulog,*) 'dphi_dy_3d =', dphi_dy_3d(:) + write(iulog,*) 'sum =', sum(dphi_dy_3d) + write(iulog,*) 'i, j, k, p =', i, j, k, p call write_log('Sum over basis functions of dphi_dy /= 0', GM_FATAL) endif - endif + if (abs( sum(dphi_dz_3d)/maxval(dphi_dz_3d) ) > 1.d-11) then + write(iulog,*) 'stopping, sum over basis functions of dphi_dz > 0' + write(iulog,*) 'dphi_dz_3d =', dphi_dz_3d(:) + write(iulog,*) 'sum =', sum(dphi_dz_3d) + write(iulog,*) 'i, j, k, p =', i, j, k, p + call write_log('Sum over basis functions of dphi_dz /= 0', GM_FATAL) + endif - end subroutine get_basis_function_derivatives_2d + endif ! 
Jac_bug_check + + end subroutine get_basis_function_derivatives_3d !**************************************************************************** @@ -8076,148 +8076,148 @@ end subroutine compute_element_matrix !**************************************************************************** - subroutine element_to_global_matrix_3d(nx, ny, nz, & - iElement, jElement, kElement, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & + subroutine element_to_global_matrix_2d(nx, ny, & + iElement, jElement, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & Avu, Avv) - + ! Sum terms of element matrix K into dense assembled matrix A ! K is partitioned into Kuu, Kuv, Kvu, and Kvv, and similarly for A. integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions integer, intent(in) :: & - iElement, jElement, kElement ! i, j and k indices for this element + iElement, jElement ! i and j indices for this element integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point - real(dp), dimension(nNodesPerElement_3d,nNodesPerElement_3d), intent(in) :: & + real(dp), dimension(nNodesPerElement_2d,nNodesPerElement_2d), intent(in) :: & Kuu, Kuv, Kvu, Kvv ! element matrix - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & Auu, Auv, Avu, Avv ! assembled matrix - integer :: i, j, k, m - integer :: iA, jA, kA + integer :: i, j, m + integer :: iA, jA integer :: n, nr, nc - if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest .and. kElement==ktest) then - write(iulog,*) 'Element i, j, k:', iElement, jElement, kElement + if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. 
jElement==jtest) then + write(iulog,*) 'Element i, j:', iElement, jElement write(iulog,*) 'Rows of Kuu:' - do n = 1, nNodesPerElement_3d + do n = 1, nNodesPerElement_2d write(iulog, '(8e12.4)') Kuu(n,:) enddo endif - !WHL - On a Mac I tried switching the loops to put nc on the outside, but - ! the one with nr on the outside is faster. - do nr = 1, nNodesPerElement_3d ! rows of K + do nr = 1, nNodesPerElement_2d ! rows of K - ! Determine row of A to be incremented by finding (k,i,j) for this node - ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). + ! Determine row of A to be incremented by finding (i,j) for this node + ! The reason for the '3' is that node 3, in the NE corner of this gridcell, has index (i,j). ! Indices for other nodes are computed relative to this node. - i = iElement + ishift(7,nr) - j = jElement + jshift(7,nr) - k = kElement + kshift(7,nr) + i = iElement + ishift(3,nr) + j = jElement + jshift(3,nr) - do nc = 1, nNodesPerElement_3d ! columns of K + do nc = 1, nNodesPerElement_2d ! columns of K ! Determine column of A to be incremented - kA = kshift(nr,nc) ! k index of A into which K(m,n) is summed - iA = ishift(nr,nc) ! similarly for i and j indices + iA = ishift(nr,nc) ! similarly for i and j indices jA = jshift(nr,nc) ! these indices can take values -1, 0 and 1 - m = indxA_3d(iA,jA,kA) + m = indxA_2d(iA,jA) ! Increment A - Auu(m,k,i,j) = Auu(m,k,i,j) + Kuu(nr,nc) - Auv(m,k,i,j) = Auv(m,k,i,j) + Kuv(nr,nc) - Avu(m,k,i,j) = Avu(m,k,i,j) + Kvu(nr,nc) - Avv(m,k,i,j) = Avv(m,k,i,j) + Kvv(nr,nc) + Auu(i,j,m) = Auu(i,j,m) + Kuu(nr,nc) + Auv(i,j,m) = Auv(i,j,m) + Kuv(nr,nc) + Avu(i,j,m) = Avu(i,j,m) + Kvu(nr,nc) + Avv(i,j,m) = Avv(i,j,m) + Kvv(nr,nc) - enddo ! nc + if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. 
jElement==jtest) then + write(iulog,*) 'Increment Auu, element i, j, nr, nc =', iElement, jElement, nr, nc + write(iulog,*) ' i, j, m, Kuu, new Auu:', i, j, m, Kuu(nr,nc), Auu(i,j,m) +!! write(iulog,*) 'Increment Avv, element i, j, nr, nc =', iElement, jElement, nr, nc +!! write(iulog,*) ' i, j, m, Kvv, new Avv:', i, j, m, Kvv(nr,nc), Avv(i,j,m) + endif + enddo ! nc enddo ! nr - end subroutine element_to_global_matrix_3d - + end subroutine element_to_global_matrix_2d + !**************************************************************************** - subroutine element_to_global_matrix_2d(nx, ny, & - iElement, jElement, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & + subroutine element_to_global_matrix_3d(nx, ny, nz, & + iElement, jElement, kElement, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & Avu, Avv) - + ! Sum terms of element matrix K into dense assembled matrix A ! K is partitioned into Kuu, Kuv, Kvu, and Kvv, and similarly for A. integer, intent(in) :: & - nx, ny ! horizontal grid dimensions + nx, ny, & ! horizontal grid dimensions + nz ! number of vertical levels where velocity is computed integer, intent(in) :: & - iElement, jElement ! i and j indices for this element + iElement, jElement, kElement ! i, j and k indices for this element integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point - real(dp), dimension(nNodesPerElement_2d,nNodesPerElement_2d), intent(in) :: & + real(dp), dimension(nNodesPerElement_3d,nNodesPerElement_3d), intent(in) :: & Kuu, Kuv, Kvu, Kvv ! element matrix - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) :: & Auu, Auv, Avu, Avv ! assembled matrix - integer :: i, j, m - integer :: iA, jA + integer :: i, j, k, m + integer :: iA, jA, kA integer :: n, nr, nc - if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. 
jElement==jtest) then - write(iulog,*) 'Element i, j:', iElement, jElement + if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest .and. kElement==ktest) then + write(iulog,*) 'Element i, j, k:', iElement, jElement, kElement write(iulog,*) 'Rows of Kuu:' - do n = 1, nNodesPerElement_2d + do n = 1, nNodesPerElement_3d write(iulog, '(8e12.4)') Kuu(n,:) enddo endif - do nr = 1, nNodesPerElement_2d ! rows of K + !WHL - On a Mac I tried switching the loops to put nc on the outside, but + ! the one with nr on the outside is faster. + do nr = 1, nNodesPerElement_3d ! rows of K - ! Determine row of A to be incremented by finding (i,j) for this node - ! The reason for the '3' is that node 3, in the NE corner of this gridcell, has index (i,j). + ! Determine row of A to be incremented by finding (k,i,j) for this node + ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). ! Indices for other nodes are computed relative to this node. - i = iElement + ishift(3,nr) - j = jElement + jshift(3,nr) + i = iElement + ishift(7,nr) + j = jElement + jshift(7,nr) + k = kElement + kshift(7,nr) - do nc = 1, nNodesPerElement_2d ! columns of K + do nc = 1, nNodesPerElement_3d ! columns of K ! Determine column of A to be incremented - iA = ishift(nr,nc) ! similarly for i and j indices + kA = kshift(nr,nc) ! k index of A into which K(m,n) is summed + iA = ishift(nr,nc) ! similarly for i and j indices jA = jshift(nr,nc) ! these indices can take values -1, 0 and 1 - m = indxA_2d(iA,jA) + m = indxA_3d(iA,jA,kA) ! Increment A - Auu(i,j,m) = Auu(i,j,m) + Kuu(nr,nc) - Auv(i,j,m) = Auv(i,j,m) + Kuv(nr,nc) - Avu(i,j,m) = Avu(i,j,m) + Kvu(nr,nc) - Avv(i,j,m) = Avv(i,j,m) + Kvv(nr,nc) - - if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. 
jElement==jtest) then - write(iulog,*) 'Increment Auu, element i, j, nr, nc =', iElement, jElement, nr, nc - write(iulog,*) ' i, j, m, Kuu, new Auu:', i, j, m, Kuu(nr,nc), Auu(i,j,m) -!! write(iulog,*) 'Increment Avv, element i, j, nr, nc =', iElement, jElement, nr, nc -!! write(iulog,*) ' i, j, m, Kvv, new Avv:', i, j, m, Kvv(nr,nc), Avv(i,j,m) - endif + Auu(m,k,i,j) = Auu(m,k,i,j) + Kuu(nr,nc) + Auv(m,k,i,j) = Auv(m,k,i,j) + Kuv(nr,nc) + Avu(m,k,i,j) = Avu(m,k,i,j) + Kvu(nr,nc) + Avv(m,k,i,j) = Avv(m,k,i,j) + Kvv(nr,nc) enddo ! nc + enddo ! nr - end subroutine element_to_global_matrix_2d + end subroutine element_to_global_matrix_3d !**************************************************************************** !WHL, May 2025: @@ -9018,65 +9018,237 @@ subroutine basal_sliding_bc_3d(nx, ny, & iA = ishift(nr,nc) ! iA index of A into which K(nr,nc) is summed jA = jshift(nr,nc) ! similarly for jA - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(iA,jA,0) - else ! 2D problem - m = indxA_2d(iA,jA) - endif + if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem + m = indxA_3d(iA,jA,0) + else ! 2D problem + m = indxA_2d(iA,jA) + endif + + Auu(m,ii,jj) = Auu(m,ii,jj) + Kuu(nr,nc) + Avv(m,ii,jj) = Avv(m,ii,jj) + Kvv(nr,nc) + + if (verbose_basal .and. this_rank==rtest .and. ii==itest .and. jj==jtest .and. m==5) then + ! m = 5 gives the influence of beta at vertex(i,j) on velocity at vertex(ii,jj). + ! For local assembly, Auu and Avv get nonzero increments only for m = 5. + write(iulog,*) 'Basal increment for Auu and Avv: source (i,j), Kuu, new Auu, ii, jj, m =', & + i, j, Kuu(nr,nc), Auu(m,ii,jj), ii, jj, m + endif + + enddo ! nc + enddo ! nr + + if (verbose_basal .and. this_rank==rtest .and. i==itest .and. j==jtest) then +! write(iulog,*) ' ' +! write(iulog,*) 'i, j =', i, j +! write(iulog,*) 'Kuu:' +! do nr = 1, nNodesPerElement_2d +! write(iulog,*) nr, Kuu(nr,:) +! enddo +! write(iulog,*) ' ' +! write(iulog,*) 'rowsum(Kuu):' +! 
do nr = 1, nNodesPerElement_2d +! write(iulog,*) nr, sum(Kuu(nr,:)) +! enddo +! write(iulog,*) ' ' +! write(iulog,*) 'sum(Kuu):', sum(Kuu(:,:)) + endif + + enddo ! nQuadPoints_2d + + endif ! active_cell + + enddo ! i + enddo ! j + + endif ! whichassemble_beta + + if (verbose_basal .and. this_rank==rtest) then + i = itest + j = jtest + if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem + m = indxA_3d(0,0,0) + else + m = indxA_2d(0,0) + endif + write(iulog,*) ' ' + write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m + write(iulog,*) 'New Auu diagonal:', Auu(m,i,j) + write(iulog,*) 'New Avv diagonal:', Avv(m,i,j) + endif + + end subroutine basal_sliding_bc_3d + +!**************************************************************************** + + subroutine dirichlet_boundary_conditions_2d(nx, ny, & + nhalo, & + active_vertex, & + umask_dirichlet, vmask_dirichlet, & + uvel, vvel, & + Auu, Auv, & + Avu, Avv, & + bu, bv) + + !---------------------------------------------------------------- + ! Modify the global matrix and RHS for Dirichlet boundary conditions, + ! where uvel and vvel are prescribed at certain nodes. + ! For each such node, we zero out the row, except for setting the diagonal term to 1. + ! We also zero out the column, moving terms containing uvel/vvel to the rhs. + !---------------------------------------------------------------- + + !---------------------------------------------------------------- + ! Input-output arguments + !---------------------------------------------------------------- + + integer, intent(in) :: & + nx, ny, & ! horizontal grid dimensions + nhalo ! number of halo layers + + logical, dimension(nx-1,ny-1), intent(in) :: & + active_vertex ! true for active vertices (vertices of active cells) + + integer, dimension(nx-1,ny-1), intent(in) :: & + umask_dirichlet, &! Dirichlet mask for velocity (if true, u is prescribed) + vmask_dirichlet ! 
Dirichlet mask for velocity (if true, v is prescribed) + + real(dp), dimension(nx-1,ny-1), intent(in) :: & + uvel, vvel ! velocity components + + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts + Avu, Avv + + real(dp), dimension(nx-1,ny-1), intent(inout) :: & + bu, bv ! assembled load vector, divided into 2 parts + + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- + + integer :: i, j ! Cartesian indices of nodes + integer :: iA, jA ! i and j offsets of neighboring nodes + integer :: m, mm + + ! Loop over all vertices that border locally owned vertices. + ! Locally owned vertices are (staggered_ilo:staggered_ihi, staggered_jlo_staggered_jhi). + ! OK to skip vertices outside the global domain (i < nhalo or j < nhalo). + ! Note: Need nhalo >= 2 so as not to step out of bounds. + + do jA = -1,1 + do iA = -1,1 + m = indxA_2d(iA,jA) + mm = indxA_2d(-iA,-jA) + + do j = nhalo, ny-nhalo+1 + do i = nhalo, nx-nhalo+1 + if (active_vertex(i,j)) then + + if (umask_dirichlet(i,j) == 1) then + + ! set the rhs to the prescribed velocity + bu(i,j) = uvel(i,j) + + ! loop through matrix values in the rows associated with this vertex + ! (Auu contains one row, Avu contains a second row) + + if ( (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then + + if (iA==0 .and. jA==0) then ! main diagonal + + ! Set Auu = 1 on the main diagonal + ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix + ! Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0) + Auu(i,j,indxA_2d(0,0)) = 1.d0 + Auv(i,j,indxA_2d(0,0)) = 1.d0 + Avu(i,j,indxA_2d(0,0)) = 1.d0 + + else ! not on the diagonal + + ! Zero out non-diagonal matrix terms in the row associated with this vertex + Auu(i,j,m) = 0.d0 + Auv(i,j,m) = 0.d0 + + ! 
Shift terms associated with this velocity to the rhs. + ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. + ! Recall mm = indxA_2d(-iA,-jA) + + if (umask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Auu term) * uvel to rhs + bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auu(i+iA, j+jA, mm) * uvel(i,j) + Auu(i+iA, j+jA, mm) = 0.d0 + endif + + if (vmask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Avu term) * uvel to rhs + bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avu(i+iA, j+jA, mm) * uvel(i,j) + Avu(i+iA, j+jA, mm) = 0.d0 + endif + + endif ! on the diagonal + + endif ! i+iA and j+jA in bounds + + endif ! umask_dirichlet + + if (vmask_dirichlet(i,j) == 1) then + + ! set the rhs to the prescribed velocity + bv(i,j) = vvel(i,j) + + ! loop through matrix values in the rows associated with this vertex + ! (Auv contains one row, Avv contains a second row) + + if ( (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then + + if (iA==0 .and. jA==0) then ! main diagonal + + ! Set Avv = 1 on the main diagonal + ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix + ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) + Auv(i,j,indxA_2d(0,0)) = 0.d0 + Avu(i,j,indxA_2d(0,0)) = 0.d0 + Avv(i,j,indxA_2d(0,0)) = 1.d0 + + else ! not on the diagonal - Auu(m,ii,jj) = Auu(m,ii,jj) + Kuu(nr,nc) - Avv(m,ii,jj) = Avv(m,ii,jj) + Kvv(nr,nc) + ! Zero out non-diagonal matrix terms in the rows associated with this vertex + Avu(i,j,m) = 0.d0 + Avv(i,j,m) = 0.d0 - if (verbose_basal .and. this_rank==rtest .and. ii==itest .and. jj==jtest .and. m==5) then - ! m = 5 gives the influence of beta at vertex(i,j) on velocity at vertex(ii,jj). - ! For local assembly, Auu and Avv get nonzero increments only for m = 5. - write(iulog,*) 'Basal increment for Auu and Avv: source (i,j), Kuu, new Auu, ii, jj, m =', & - i, j, Kuu(nr,nc), Auu(m,ii,jj), ii, jj, m - endif + ! 
Shift terms associated with this velocity to the rhs. + ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. + ! Recall mm = indxA_2d(-iA,-jA) - enddo ! nc - enddo ! nr + if (umask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Auv term) * vvel to rhs + bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auv(i+iA, j+jA, mm) * vvel(i,j) + Auv(i+iA, j+jA, mm) = 0.d0 + endif - if (verbose_basal .and. this_rank==rtest .and. i==itest .and. j==jtest) then -! write(iulog,*) ' ' -! write(iulog,*) 'i, j =', i, j -! write(iulog,*) 'Kuu:' -! do nr = 1, nNodesPerElement_2d -! write(iulog,*) nr, Kuu(nr,:) -! enddo -! write(iulog,*) ' ' -! write(iulog,*) 'rowsum(Kuu):' -! do nr = 1, nNodesPerElement_2d -! write(iulog,*) nr, sum(Kuu(nr,:)) -! enddo -! write(iulog,*) ' ' -! write(iulog,*) 'sum(Kuu):', sum(Kuu(:,:)) - endif + if (vmask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Avv term) * vvel to rhs + bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avv(i+iA, j+jA, mm) * vvel(i,j) + Avv(i+iA, j+jA, mm) = 0.d0 + endif - enddo ! nQuadPoints_2d + endif ! on the diagonal - endif ! active_cell + endif ! i+iA and j+jA in bounds - enddo ! i - enddo ! j + endif ! vmask_dirichlet - endif ! whichassemble_beta + endif ! active_vertex + enddo ! i + enddo ! j - if (verbose_basal .and. this_rank==rtest) then - i = itest - j = jtest - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(0,0,0) - else - m = indxA_2d(0,0) - endif - write(iulog,*) ' ' - write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m - write(iulog,*) 'New Auu diagonal:', Auu(m,i,j) - write(iulog,*) 'New Avv diagonal:', Avv(m,i,j) - endif + enddo ! iA + enddo ! jA - end subroutine basal_sliding_bc_3d + end subroutine dirichlet_boundary_conditions_2d !**************************************************************************** @@ -9217,239 +9389,67 @@ subroutine dirichlet_boundary_conditions_3d(nx, ny, & if ( (k+kA >= 1 .and. k+kA <= nz) & .and. & (i+iA >= 1 .and. i+iA <= nx-1) & - .and. 
& - (j+jA >= 1 .and. j+jA <= ny-1) ) then - - if (iA==0 .and. jA==0 .and. kA==0) then ! main diagonal - - ! Set Avv = 1 on the main diagonal - ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix - ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) - m = indxA_3d(0,0,0) - - Auv(m,k,i,j) = 0.d0 - Avu(m,k,i,j) = 0.d0 - Avv(m,k,i,j) = 1.d0 - - !TODO - Set bv above, outside iA/jA loop - ! Set the rhs to the prescribed velocity, forcing v = prescribed vvel for this node -!! bv(k,i,j) = vvel(k,i,j) - - else ! not on the diagonal - - ! Zero out non-diagonal matrix terms in the rows associated with this node - m = indxA_3d(iA,jA,kA) - Avu(m, k, i, j) = 0.d0 - Avv(m, k, i, j) = 0.d0 - - ! Shift terms associated with this velocity to the rhs. - ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. - m = indxA_3d(-iA,-jA,-kA) - - if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then - ! Move (Auv term) * vvel to rhs - bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) - Auv(m, k+kA, i+iA, j+jA) = 0.d0 - endif - - if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then - ! Move (Avv term) * vvel to rhs - bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) - Avv(m, k+kA, i+iA, j+jA) = 0.d0 - endif - - endif ! on the diagonal - - endif ! i+iA, j+jA, and k+kA in bounds - - enddo ! kA - enddo ! iA - enddo ! jA - - endif ! vmask_dirichlet - - enddo ! k - endif ! active_vertex - enddo ! i - enddo ! j - - end subroutine dirichlet_boundary_conditions_3d - -!**************************************************************************** - - subroutine dirichlet_boundary_conditions_2d(nx, ny, & - nhalo, & - active_vertex, & - umask_dirichlet, vmask_dirichlet, & - uvel, vvel, & - Auu, Auv, & - Avu, Avv, & - bu, bv) - - !---------------------------------------------------------------- - ! 
Modify the global matrix and RHS for Dirichlet boundary conditions, - ! where uvel and vvel are prescribed at certain nodes. - ! For each such node, we zero out the row, except for setting the diagonal term to 1. - ! We also zero out the column, moving terms containing uvel/vvel to the rhs. - !---------------------------------------------------------------- - - !---------------------------------------------------------------- - ! Input-output arguments - !---------------------------------------------------------------- - - integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions - nhalo ! number of halo layers - - logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! true for active vertices (vertices of active cells) - - integer, dimension(nx-1,ny-1), intent(in) :: & - umask_dirichlet, &! Dirichlet mask for velocity (if true, u is prescribed) - vmask_dirichlet ! Dirichlet mask for velocity (if true, v is prescribed) - - real(dp), dimension(nx-1,ny-1), intent(in) :: & - uvel, vvel ! velocity components - - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & - Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv - - real(dp), dimension(nx-1,ny-1), intent(inout) :: & - bu, bv ! assembled load vector, divided into 2 parts - - !---------------------------------------------------------------- - ! Local variables - !---------------------------------------------------------------- - - integer :: i, j ! Cartesian indices of nodes - integer :: iA, jA ! i and j offsets of neighboring nodes - integer :: m, mm - - ! Loop over all vertices that border locally owned vertices. - ! Locally owned vertices are (staggered_ilo:staggered_ihi, staggered_jlo_staggered_jhi). - ! OK to skip vertices outside the global domain (i < nhalo or j < nhalo). - ! Note: Need nhalo >= 2 so as not to step out of bounds. 
- - do jA = -1,1 - do iA = -1,1 - m = indxA_2d(iA,jA) - mm = indxA_2d(-iA,-jA) - - do j = nhalo, ny-nhalo+1 - do i = nhalo, nx-nhalo+1 - if (active_vertex(i,j)) then - - if (umask_dirichlet(i,j) == 1) then - - ! set the rhs to the prescribed velocity - bu(i,j) = uvel(i,j) - - ! loop through matrix values in the rows associated with this vertex - ! (Auu contains one row, Avu contains a second row) - - if ( (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then - - if (iA==0 .and. jA==0) then ! main diagonal - - ! Set Auu = 1 on the main diagonal - ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix - ! Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0) - Auu(i,j,indxA_2d(0,0)) = 1.d0 - Auv(i,j,indxA_2d(0,0)) = 1.d0 - Avu(i,j,indxA_2d(0,0)) = 1.d0 - - else ! not on the diagonal - - ! Zero out non-diagonal matrix terms in the row associated with this vertex - Auu(i,j,m) = 0.d0 - Auv(i,j,m) = 0.d0 - - ! Shift terms associated with this velocity to the rhs. - ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. - ! Recall mm = indxA_2d(-iA,-jA) - - if (umask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Auu term) * uvel to rhs - bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auu(i+iA, j+jA, mm) * uvel(i,j) - Auu(i+iA, j+jA, mm) = 0.d0 - endif - - if (vmask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Avu term) * uvel to rhs - bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avu(i+iA, j+jA, mm) * uvel(i,j) - Avu(i+iA, j+jA, mm) = 0.d0 - endif - - endif ! on the diagonal - - endif ! i+iA and j+jA in bounds - - endif ! umask_dirichlet - - if (vmask_dirichlet(i,j) == 1) then - - ! set the rhs to the prescribed velocity - bv(i,j) = vvel(i,j) - - ! loop through matrix values in the rows associated with this vertex - ! (Auv contains one row, Avv contains a second row) + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then - if ( (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. 
j+jA <= ny-1) ) then + if (iA==0 .and. jA==0 .and. kA==0) then ! main diagonal - if (iA==0 .and. jA==0) then ! main diagonal + ! Set Avv = 1 on the main diagonal + ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix + ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) + m = indxA_3d(0,0,0) - ! Set Avv = 1 on the main diagonal - ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix - ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) - Auv(i,j,indxA_2d(0,0)) = 0.d0 - Avu(i,j,indxA_2d(0,0)) = 0.d0 - Avv(i,j,indxA_2d(0,0)) = 1.d0 + Auv(m,k,i,j) = 0.d0 + Avu(m,k,i,j) = 0.d0 + Avv(m,k,i,j) = 1.d0 - else ! not on the diagonal + !TODO - Set bv above, outside iA/jA loop + ! Set the rhs to the prescribed velocity, forcing v = prescribed vvel for this node +!! bv(k,i,j) = vvel(k,i,j) + + else ! not on the diagonal - ! Zero out non-diagonal matrix terms in the rows associated with this vertex - Avu(i,j,m) = 0.d0 - Avv(i,j,m) = 0.d0 + ! Zero out non-diagonal matrix terms in the rows associated with this node + m = indxA_3d(iA,jA,kA) + Avu(m, k, i, j) = 0.d0 + Avv(m, k, i, j) = 0.d0 - ! Shift terms associated with this velocity to the rhs. - ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. - ! Recall mm = indxA_2d(-iA,-jA) + ! Shift terms associated with this velocity to the rhs. + ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. + m = indxA_3d(-iA,-jA,-kA) - if (umask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Auv term) * vvel to rhs - bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auv(i+iA, j+jA, mm) * vvel(i,j) - Auv(i+iA, j+jA, mm) = 0.d0 - endif + if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then + ! 
Move (Auv term) * vvel to rhs + bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) + Auv(m, k+kA, i+iA, j+jA) = 0.d0 + endif - if (vmask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Avv term) * vvel to rhs - bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avv(i+iA, j+jA, mm) * vvel(i,j) - Avv(i+iA, j+jA, mm) = 0.d0 - endif + if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then + ! Move (Avv term) * vvel to rhs + bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) + Avv(m, k+kA, i+iA, j+jA) = 0.d0 + endif - endif ! on the diagonal + endif ! on the diagonal - endif ! i+iA and j+jA in bounds + endif ! i+iA, j+jA, and k+kA in bounds - endif ! vmask_dirichlet + enddo ! kA + enddo ! iA + enddo ! jA - endif ! active_vertex - enddo ! i - enddo ! j + endif ! vmask_dirichlet - enddo ! iA - enddo ! jA + enddo ! k + endif ! active_vertex + enddo ! i + enddo ! j - end subroutine dirichlet_boundary_conditions_2d + end subroutine dirichlet_boundary_conditions_3d !**************************************************************************** - subroutine compute_residual_vector_3d(nx, ny, nz, & + subroutine compute_residual_vector_2d(nx, ny, & parallel, & itest, jtest, rtest, & active_vertex, & @@ -9464,12 +9464,11 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & ! This subroutine assumes that the matrix is stored in structured (x/y/z) format. !---------------------------------------------------------------- - ! Input/output variables + ! Input/output arguments !---------------------------------------------------------------- integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions (for scalars) type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -9480,25 +9479,25 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! 
T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 1st dimension = 3 (node and its nearest neighbors in x, y and z direction) - ! other dimensions = (z,x,y) indices + ! 3rd dimension = 9 (node and its nearest neighbors in x and y directions) + ! 1st and 2nd dimensions = (x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & uvel, vvel ! u and v components of velocity (m/yr) - real(dp), dimension(nz,nx-1,ny-1), intent(out) :: & + real(dp), dimension(nx-1,ny-1), intent(out) :: & resid_u, & ! residual vector, divided into 2 parts - resid_v ! + resid_v real(dp), intent(out) :: & L2_norm ! L2 norm of residual vector, |Ax - b| @@ -9510,13 +9509,13 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & ! Local variables !---------------------------------------------------------------- - real(dp), dimension(nz,nx-1,ny-1) :: & - worku, workv, & ! work arrays for global sums - resid_sq ! resid_u^2 + resid_v^2 + real(dp), dimension(nx-1,ny-1) :: & + worku, workv, & ! work arrays for global sums + resid_sq ! resid_u^2 + resid_v^2 real(dp) :: my_max_resid, global_max_resid - integer :: i, j, k, iA, jA, kA, m, iglobal, jglobal + integer :: i, j, iA, jA, m, iglobal, jglobal real(dp) :: L2_norm_rhs ! L2 norm of rhs vector, |b| @@ -9531,59 +9530,47 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & ! Compute u and v components of A*x - resid_u(:,:,:) = 0.d0 - resid_v(:,:,:) = 0.d0 + resid_u(:,:) = 0.d0 + resid_v(:,:) = 0.d0 ! 
Loop over locally owned vertices - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - do k = 1, nz - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then - - m = indxA_3d(iA,jA,kA) - - resid_u(k,i,j) = resid_u(k,i,j) & - + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & - + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - - resid_v(k,i,j) = resid_v(k,i,j) & - + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & - + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - endif ! in bounds - enddo ! kA - enddo ! iA - enddo ! jA - enddo ! k - endif ! active_vertex - enddo ! i - enddo ! j + do jA = -1,1 + do iA = -1,1 + m = indxA_2d(iA,jA) + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + if ( (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then + resid_u(i,j) = resid_u(i,j) & + + Auu(i,j,m)*uvel(i+iA,j+jA) & + + Auv(i,j,m)*vvel(i+iA,j+jA) + resid_v(i,j) = resid_v(i,j) & + + Avu(i,j,m)*uvel(i+iA,j+jA) & + + Avv(i,j,m)*vvel(i+iA,j+jA) + endif ! in bounds + endif ! active_vertex + enddo ! i + enddo ! j + enddo ! iA + enddo ! jA ! Subtract b to get A*x - b - - worku(:,:,:) = 0.0d0 - workv(:,:,:) = 0.0d0 + worku(:,:) = 0.0d0 + workv(:,:) = 0.0d0 ! Loop over locally owned vertices do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi if (active_vertex(i,j)) then - do k = 1, nz - resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j) - resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j) - worku(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) - workv(k,i,j) = resid_v(k,i,j)*resid_v(k,i,j) - enddo ! k + resid_u(i,j) = resid_u(i,j) - bu(i,j) + resid_v(i,j) = resid_v(i,j) - bv(i,j) + worku(i,j) = resid_u(i,j)*resid_u(i,j) + workv(i,j) = resid_v(i,j)*resid_v(i,j) endif ! active vertex - enddo ! i - enddo ! j + enddo ! i + enddo ! j ! 
Take global sum, then take square root L2_norm = parallel_global_sum_stagger(worku, parallel, workv) @@ -9594,28 +9581,26 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & if (this_rank==rtest) then i = itest j = jtest - k = ktest - write(iulog,*) 'In compute_residual_vector_3d: task, i, j, k =', this_rank, i, j, k + write(iulog,*) 'In compute_residual_vector_2d: task, i, j =', this_rank, i, j write(iulog, '(a16, 2f13.7, 2e13.5)') & - ' u, v, ru, rv: ', uvel(k,i,j), vvel(k,i,j), resid_u(k,i,j), resid_v(k,i,j) + ' u, v, ru, rv: ', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j) endif ! Compute max value of (squared) residual on this task. ! If this task owns the vertex with the global max residual, then print a diagnostic message. + resid_sq(:,:) = worku(:,:) + workv(:,:) my_max_resid = maxval(resid_sq) global_max_resid = parallel_reduce_max(my_max_resid) if (abs((my_max_resid - global_max_resid)/global_max_resid) < 1.0d-6) then do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - do k = 1, nz - if (abs((resid_sq(k,i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog, '(a24, 2i6, i4, 2e13.5, e16.8)') 'ig, jg, k, ru, rv, rmax:', & - iglobal, jglobal, k, resid_u(k,i,j), resid_v(k,i,j), sqrt(global_max_resid) - write(iulog,*) ' ' - endif - enddo + if (abs((resid_sq(i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, rmax:', & + iglobal, jglobal, resid_u(i,j), resid_v(i,j), sqrt(global_max_resid) + write(iulog,*) ' ' + endif enddo enddo endif @@ -9624,19 +9609,18 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & if (present(L2_norm_relative)) then ! compute L2_norm relative to rhs - worku(:,:,:) = 0.0d0 - workv(:,:,:) = 0.0d0 + worku(:,:) = 0.0d0 + workv(:,:) = 0.0d0 + ! 
Loop over locally owned vertices do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi if (active_vertex(i,j)) then - do k = 1, nz - worku(k,i,j) = bu(k,i,j)*bu(k,i,j) - workv(k,i,j) = bv(k,i,j)*bv(k,i,j) - enddo ! k + worku(i,j) = bu(i,j)*bu(i,j) + workv(i,j) = bv(i,j)*bv(i,j) endif ! active vertex - enddo ! i - enddo ! j + enddo ! i + enddo ! j ! Take global sum, then take square root L2_norm_rhs = parallel_global_sum_stagger(worku, parallel, workv) @@ -9650,11 +9634,11 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & endif - end subroutine compute_residual_vector_3d + end subroutine compute_residual_vector_2d !**************************************************************************** - subroutine compute_residual_vector_2d(nx, ny, & + subroutine compute_residual_vector_3d(nx, ny, nz, & parallel, & itest, jtest, rtest, & active_vertex, & @@ -9669,11 +9653,12 @@ subroutine compute_residual_vector_2d(nx, ny, & ! This subroutine assumes that the matrix is stored in structured (x/y/z) format. !---------------------------------------------------------------- - ! Input/output arguments + ! Input/output variables !---------------------------------------------------------------- integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) + nx, ny, & ! horizontal grid dimensions (for scalars) + nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -9684,25 +9669,25 @@ subroutine compute_residual_vector_2d(nx, ny, & logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 3rd dimension = 9 (node and its nearest neighbors in x and y directions) - ! 1st and 2nd dimensions = (x,y) indices + ! 
1st dimension = 3 (node and its nearest neighbors in x, y and z direction) + ! other dimensions = (z,x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & uvel, vvel ! u and v components of velocity (m/yr) - real(dp), dimension(nx-1,ny-1), intent(out) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(out) :: & resid_u, & ! residual vector, divided into 2 parts - resid_v + resid_v ! real(dp), intent(out) :: & L2_norm ! L2 norm of residual vector, |Ax - b| @@ -9714,13 +9699,13 @@ subroutine compute_residual_vector_2d(nx, ny, & ! Local variables !---------------------------------------------------------------- - real(dp), dimension(nx-1,ny-1) :: & - worku, workv, & ! work arrays for global sums - resid_sq ! resid_u^2 + resid_v^2 + real(dp), dimension(nz,nx-1,ny-1) :: & + worku, workv, & ! work arrays for global sums + resid_sq ! resid_u^2 + resid_v^2 real(dp) :: my_max_resid, global_max_resid - integer :: i, j, iA, jA, m, iglobal, jglobal + integer :: i, j, k, iA, jA, kA, m, iglobal, jglobal real(dp) :: L2_norm_rhs ! L2 norm of rhs vector, |b| @@ -9733,49 +9718,61 @@ subroutine compute_residual_vector_2d(nx, ny, & staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - ! Compute u and v components of A*x + ! Compute u and v components of A*x + + resid_u(:,:,:) = 0.d0 + resid_v(:,:,:) = 0.d0 + + ! Loop over locally owned vertices + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + do kA = -1,1 + do jA = -1,1 + do iA = -1,1 + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. 
j+jA <= ny-1) ) then + + m = indxA_3d(iA,jA,kA) - resid_u(:,:) = 0.d0 - resid_v(:,:) = 0.d0 + resid_u(k,i,j) = resid_u(k,i,j) & + + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & + + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - ! Loop over locally owned vertices - do jA = -1,1 - do iA = -1,1 - m = indxA_2d(iA,jA) - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - if ( (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then - resid_u(i,j) = resid_u(i,j) & - + Auu(i,j,m)*uvel(i+iA,j+jA) & - + Auv(i,j,m)*vvel(i+iA,j+jA) - resid_v(i,j) = resid_v(i,j) & - + Avu(i,j,m)*uvel(i+iA,j+jA) & - + Avv(i,j,m)*vvel(i+iA,j+jA) - endif ! in bounds - endif ! active_vertex - enddo ! i - enddo ! j - enddo ! iA - enddo ! jA + resid_v(k,i,j) = resid_v(k,i,j) & + + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & + + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) + endif ! in bounds + enddo ! kA + enddo ! iA + enddo ! jA + enddo ! k + endif ! active_vertex + enddo ! i + enddo ! j ! Subtract b to get A*x - b - worku(:,:) = 0.0d0 - workv(:,:) = 0.0d0 + + worku(:,:,:) = 0.0d0 + workv(:,:,:) = 0.0d0 ! Loop over locally owned vertices do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi if (active_vertex(i,j)) then - resid_u(i,j) = resid_u(i,j) - bu(i,j) - resid_v(i,j) = resid_v(i,j) - bv(i,j) - worku(i,j) = resid_u(i,j)*resid_u(i,j) - workv(i,j) = resid_v(i,j)*resid_v(i,j) + do k = 1, nz + resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j) + resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j) + worku(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) + workv(k,i,j) = resid_v(k,i,j)*resid_v(k,i,j) + enddo ! k endif ! active vertex - enddo ! i - enddo ! j + enddo ! i + enddo ! j ! 
Take global sum, then take square root L2_norm = parallel_global_sum_stagger(worku, parallel, workv) @@ -9786,26 +9783,28 @@ subroutine compute_residual_vector_2d(nx, ny, & if (this_rank==rtest) then i = itest j = jtest - write(iulog,*) 'In compute_residual_vector_2d: task, i, j =', this_rank, i, j + k = ktest + write(iulog,*) 'In compute_residual_vector_3d: task, i, j, k =', this_rank, i, j, k write(iulog, '(a16, 2f13.7, 2e13.5)') & - ' u, v, ru, rv: ', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j) + ' u, v, ru, rv: ', uvel(k,i,j), vvel(k,i,j), resid_u(k,i,j), resid_v(k,i,j) endif ! Compute max value of (squared) residual on this task. ! If this task owns the vertex with the global max residual, then print a diagnostic message. - resid_sq(:,:) = worku(:,:) + workv(:,:) my_max_resid = maxval(resid_sq) global_max_resid = parallel_reduce_max(my_max_resid) if (abs((my_max_resid - global_max_resid)/global_max_resid) < 1.0d-6) then do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs((resid_sq(i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, rmax:', & - iglobal, jglobal, resid_u(i,j), resid_v(i,j), sqrt(global_max_resid) - write(iulog,*) ' ' - endif + do k = 1, nz + if (abs((resid_sq(k,i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog, '(a24, 2i6, i4, 2e13.5, e16.8)') 'ig, jg, k, ru, rv, rmax:', & + iglobal, jglobal, k, resid_u(k,i,j), resid_v(k,i,j), sqrt(global_max_resid) + write(iulog,*) ' ' + endif + enddo enddo enddo endif @@ -9814,18 +9813,19 @@ subroutine compute_residual_vector_2d(nx, ny, & if (present(L2_norm_relative)) then ! compute L2_norm relative to rhs - worku(:,:) = 0.0d0 - workv(:,:) = 0.0d0 + worku(:,:,:) = 0.0d0 + workv(:,:,:) = 0.0d0 - ! 
Loop over locally owned vertices do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi if (active_vertex(i,j)) then - worku(i,j) = bu(i,j)*bu(i,j) - workv(i,j) = bv(i,j)*bv(i,j) + do k = 1, nz + worku(k,i,j) = bu(k,i,j)*bu(k,i,j) + workv(k,i,j) = bv(k,i,j)*bv(k,i,j) + enddo ! k endif ! active vertex - enddo ! i - enddo ! j + enddo ! i + enddo ! j ! Take global sum, then take square root L2_norm_rhs = parallel_global_sum_stagger(worku, parallel, workv) @@ -9839,26 +9839,30 @@ subroutine compute_residual_vector_2d(nx, ny, & endif - end subroutine compute_residual_vector_2d + end subroutine compute_residual_vector_3d !**************************************************************************** - subroutine evaluate_accelerated_picard_3d(& + subroutine evaluate_accelerated_picard_2d(& + nx, ny, & L2_norm, L2_norm_large, & L2_norm_alpha_sav, & alpha_accel, alpha_accel_max, & gamma_accel, resid_reduction_threshold, & - uvel, vvel, & - Auu, Auv, & - Avu, Avv, & - uvel_old, vvel_old, & - duvel, dvvel, & - uvel_sav, vvel_sav, & - Auu_sav, Auv_sav, & - Avu_sav, Avv_sav, & + uvel_2d, vvel_2d, & + Auu_2d, Auv_2d, & + Avu_2d, Avv_2d, & + uvel_2d_old, vvel_2d_old, & + duvel_2d, dvvel_2d, & + uvel_2d_sav, vvel_2d_sav, & + Auu_2d_sav, Auv_2d_sav, & + Avu_2d_sav, Avv_2d_sav, & beta_internal, beta_internal_sav, & assembly_is_done) + integer, intent(in) :: & + nx, ny ! number of grid cells in each direction + real(dp), intent(in) :: & L2_norm, & ! latest value of L2 norm of residual L2_norm_large, & ! large value for re-initializing the L2 norm @@ -9870,21 +9874,19 @@ subroutine evaluate_accelerated_picard_3d(& alpha_accel, & ! factor for extending the vector (duvel, dvvel) to reduce the residual L2_norm_alpha_sav ! value of L2 norm of residual, given the previous alpha_accel - real(dp), dimension(:,:), intent(inout) :: & + real(dp), dimension(nx-1,ny-1), intent(inout) :: & + uvel_2d, vvel_2d, & ! 
latest guess for the velocity solution + uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration + duvel_2d, dvvel_2d, & ! difference between old velocity solution and latest solution + uvel_2d_sav, vvel_2d_sav, & ! best velocity solution so far, based on the residual norm beta_internal, & ! beta_internal as a function of uvel_2d and vvel_2d beta_internal_sav ! beta_internal as a function of uvel_2d_sav and vvel_2d_sav - real(dp), dimension(:,:,:), intent(inout) :: & - uvel, vvel, & ! latest guess for the velocity solution - uvel_old, vvel_old, & ! velocity solution from previous nonlinear iteration - duvel, dvvel, & ! difference between old velocity solution and latest solution - uvel_sav, vvel_sav ! best velocity solution so far, based on the residual norm - - real(dp), dimension(:,:,:,:), intent(inout) :: & - Auu, Auv, & ! latest assembled matrices as a function of uvel_2d and vvel_2d - Avu, Avv, & - Auu_sav, Auv_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav - Avu_sav, Avv_sav + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + Auu_2d, Auv_2d, & ! latest assembled matrices as a function of uvel_2d and vvel_2d + Avu_2d, Avv_2d, & + Auu_2d_sav, Auv_2d_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav + Avu_2d_sav, Avv_2d_sav logical, intent(inout) :: & assembly_is_done ! if true, then accept the current assembled matrices and proceed to solution @@ -9897,12 +9899,12 @@ subroutine evaluate_accelerated_picard_3d(& ! Save the latest values of the solver inputs - uvel_sav = uvel - vvel_sav = vvel - Auu_sav = Auu - Auv_sav = Auv - Avu_sav = Avu - Avv_sav = Avv + uvel_2d_sav = uvel_2d + vvel_2d_sav = vvel_2d + Auu_2d_sav = Auu_2d + Auv_2d_sav = Auv_2d + Avu_2d_sav = Avu_2d + Avv_2d_sav = Avv_2d beta_internal_sav = beta_internal ! Increase alpha_accel and see if the residual keeps getting smaller. @@ -9927,8 +9929,8 @@ subroutine evaluate_accelerated_picard_3d(& endif ! 
Save this velocity as the starting point for the next nonlinear iteration - uvel_old = uvel - vvel_old = vvel + uvel_2d_old = uvel_2d + vvel_2d_old = vvel_2d ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 @@ -9941,17 +9943,17 @@ subroutine evaluate_accelerated_picard_3d(& ! The residual is larger than the previous value. ! Switch back to the previously saved velocity and matrix with the lower residual. - uvel = uvel_sav - vvel = vvel_sav - Auu = Auu_sav - Auv = Auv_sav - Avu = Avu_sav - Avv = Avv_sav + uvel_2d = uvel_2d_sav + vvel_2d = vvel_2d_sav + Auu_2d = Auu_2d_sav + Auv_2d = Auv_2d_sav + Avu_2d = Avu_2d_sav + Avv_2d = Avv_2d_sav beta_internal = beta_internal_sav ! Save this velocity as the starting point for the next nonlinear iteration - uvel_old = uvel - vvel_old = vvel + uvel_2d_old = uvel_2d + vvel_2d_old = vvel_2d if (verbose_picard .and. main_task) then write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel @@ -9967,30 +9969,26 @@ subroutine evaluate_accelerated_picard_3d(& endif ! L2_norm of residual has reduced - end subroutine evaluate_accelerated_picard_3d - + end subroutine evaluate_accelerated_picard_2d + !**************************************************************************** - subroutine evaluate_accelerated_picard_2d(& - nx, ny, & + subroutine evaluate_accelerated_picard_3d(& L2_norm, L2_norm_large, & L2_norm_alpha_sav, & alpha_accel, alpha_accel_max, & gamma_accel, resid_reduction_threshold, & - uvel_2d, vvel_2d, & - Auu_2d, Auv_2d, & - Avu_2d, Avv_2d, & - uvel_2d_old, vvel_2d_old, & - duvel_2d, dvvel_2d, & - uvel_2d_sav, vvel_2d_sav, & - Auu_2d_sav, Auv_2d_sav, & - Avu_2d_sav, Avv_2d_sav, & + uvel, vvel, & + Auu, Auv, & + Avu, Avv, & + uvel_old, vvel_old, & + duvel, dvvel, & + uvel_sav, vvel_sav, & + Auu_sav, Auv_sav, & + Avu_sav, Avv_sav, & beta_internal, beta_internal_sav, & assembly_is_done) - integer, intent(in) :: & - nx, ny ! 
number of grid cells in each direction - real(dp), intent(in) :: & L2_norm, & ! latest value of L2 norm of residual L2_norm_large, & ! large value for re-initializing the L2 norm @@ -10002,19 +10000,21 @@ subroutine evaluate_accelerated_picard_2d(& alpha_accel, & ! factor for extending the vector (duvel, dvvel) to reduce the residual L2_norm_alpha_sav ! value of L2 norm of residual, given the previous alpha_accel - real(dp), dimension(nx-1,ny-1), intent(inout) :: & - uvel_2d, vvel_2d, & ! latest guess for the velocity solution - uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration - duvel_2d, dvvel_2d, & ! difference between old velocity solution and latest solution - uvel_2d_sav, vvel_2d_sav, & ! best velocity solution so far, based on the residual norm + real(dp), dimension(:,:), intent(inout) :: & beta_internal, & ! beta_internal as a function of uvel_2d and vvel_2d beta_internal_sav ! beta_internal as a function of uvel_2d_sav and vvel_2d_sav - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & - Auu_2d, Auv_2d, & ! latest assembled matrices as a function of uvel_2d and vvel_2d - Avu_2d, Avv_2d, & - Auu_2d_sav, Auv_2d_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav - Avu_2d_sav, Avv_2d_sav + real(dp), dimension(:,:,:), intent(inout) :: & + uvel, vvel, & ! latest guess for the velocity solution + uvel_old, vvel_old, & ! velocity solution from previous nonlinear iteration + duvel, dvvel, & ! difference between old velocity solution and latest solution + uvel_sav, vvel_sav ! best velocity solution so far, based on the residual norm + + real(dp), dimension(:,:,:,:), intent(inout) :: & + Auu, Auv, & ! latest assembled matrices as a function of uvel_2d and vvel_2d + Avu, Avv, & + Auu_sav, Auv_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav + Avu_sav, Avv_sav logical, intent(inout) :: & assembly_is_done ! 
if true, then accept the current assembled matrices and proceed to solution @@ -10027,12 +10027,12 @@ subroutine evaluate_accelerated_picard_2d(& ! Save the latest values of the solver inputs - uvel_2d_sav = uvel_2d - vvel_2d_sav = vvel_2d - Auu_2d_sav = Auu_2d - Auv_2d_sav = Auv_2d - Avu_2d_sav = Avu_2d - Avv_2d_sav = Avv_2d + uvel_sav = uvel + vvel_sav = vvel + Auu_sav = Auu + Auv_sav = Auv + Avu_sav = Avu + Avv_sav = Avv beta_internal_sav = beta_internal ! Increase alpha_accel and see if the residual keeps getting smaller. @@ -10057,8 +10057,8 @@ subroutine evaluate_accelerated_picard_2d(& endif ! Save this velocity as the starting point for the next nonlinear iteration - uvel_2d_old = uvel_2d - vvel_2d_old = vvel_2d + uvel_old = uvel + vvel_old = vvel ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 @@ -10068,20 +10068,20 @@ subroutine evaluate_accelerated_picard_2d(& assembly_is_done = .true. else - - ! The residual is larger than the previous value. - ! Switch back to the previously saved velocity and matrix with the lower residual. - uvel_2d = uvel_2d_sav - vvel_2d = vvel_2d_sav - Auu_2d = Auu_2d_sav - Auv_2d = Auv_2d_sav - Avu_2d = Avu_2d_sav - Avv_2d = Avv_2d_sav + + ! The residual is larger than the previous value. + ! Switch back to the previously saved velocity and matrix with the lower residual. + uvel = uvel_sav + vvel = vvel_sav + Auu = Auu_sav + Auv = Auv_sav + Avu = Avu_sav + Avv = Avv_sav beta_internal = beta_internal_sav ! Save this velocity as the starting point for the next nonlinear iteration - uvel_2d_old = uvel_2d - vvel_2d_old = vvel_2d + uvel_old = uvel + vvel_old = vvel if (verbose_picard .and. main_task) then write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel @@ -10097,38 +10097,38 @@ subroutine evaluate_accelerated_picard_2d(& endif ! 
L2_norm of residual has reduced - end subroutine evaluate_accelerated_picard_2d + end subroutine evaluate_accelerated_picard_3d !**************************************************************************** - subroutine compute_residual_velocity_3d(whichresid, parallel, & - uvel, vvel, & - usav, vsav, & + subroutine compute_residual_velocity_2d(whichresid, parallel, & + uvel, vvel, & + usav, vsav, & resid_velo) integer, intent(in) :: & - whichresid ! option for method to use when calculating residual + whichresid ! option for method to use when calculating residual type(parallel_type), intent(in) :: & parallel ! info for parallel communication - real(dp), dimension(:,:,:), intent(in) :: & - uvel, vvel, & ! current guess for velocity - usav, vsav ! previous guess for velocity + real(dp), dimension(:,:), intent(in) :: & + uvel, vvel, & ! current guess for velocity + usav, vsav ! previous guess for velocity real(dp), intent(out) :: & - resid_velo ! quantity related to velocity convergence + resid_velo ! quantity related to velocity convergence integer :: & - imaxdiff, jmaxdiff, kmaxdiff ! location of maximum speed difference - ! currently computed but not used + imaxdiff, jmaxdiff ! location of maximum speed difference + ! currently computed but not used - integer :: i, j, k, count + integer :: i, j, count real(dp) :: & - speed, & ! current guess for ice speed - oldspeed, & ! previous guess for ice speed - diffspeed ! abs(speed-oldspeed) + speed, & ! current guess for ice speed + oldspeed, & ! previous guess for ice speed + diffspeed ! abs(speed-oldspeed) integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -10140,7 +10140,6 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & staggered_jhi = parallel%staggered_jhi ! Compute a residual quantity based on convergence of the velocity field. - !TODO - Remove some of these velocity residual methods? They are rarely if ever used. ! 
options for residual calculation method, as specified in configuration file ! case(0): use max of abs( vel_old - vel ) / vel ) @@ -10151,7 +10150,6 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & resid_velo = 0.d0 imaxdiff = 0 jmaxdiff = 0 - kmaxdiff = 0 select case (whichresid) @@ -10161,19 +10159,16 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - do k = 1, size(uvel,1) - 1 ! ignore bed velocity - speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2) - if (speed /= 0.d0) then - oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - if (diffspeed > resid_velo) then - resid_velo = diffspeed - imaxdiff = i - jmaxdiff = j - kmaxdiff = k - endif + speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) + if (speed /= 0.d0) then + oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + if (diffspeed > resid_velo) then + resid_velo = diffspeed + imaxdiff = i + jmaxdiff = j endif - enddo + endif enddo enddo @@ -10188,15 +10183,13 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - do k = 1, size(uvel,1) - 1 ! ignore bed velocity - speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2) - if (speed /= 0.d0) then - count = count+1 - oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - resid_velo = resid_velo + diffspeed - endif - enddo + speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) + if (speed /= 0.d0) then + count = count+1 + oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + resid_velo = resid_velo + diffspeed + endif enddo enddo @@ -10207,25 +10200,22 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & call not_parallel(__FILE__, __LINE__) case default ! max speed difference, including basal speeds - ! 
(case HO_RESID_MAXU or HO_RESID_L2NORM or HO_RESID_L2NORM_RELATIVE) + ! (case HO_RESID_MAXU or HO_RESID_L2NORM) ! Loop over locally owned vertices do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - do k = 1, size(uvel,1) - speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2) - if (speed /= 0.d0) then - oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - if (diffspeed > resid_velo) then - resid_velo = diffspeed - imaxdiff = i - jmaxdiff = j - kmaxdiff = k - endif + speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) + if (speed /= 0.d0) then + oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + if (diffspeed > resid_velo) then + resid_velo = diffspeed + imaxdiff = i + jmaxdiff = j endif - enddo + endif enddo enddo @@ -10233,38 +10223,38 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & end select - end subroutine compute_residual_velocity_3d + end subroutine compute_residual_velocity_2d !**************************************************************************** - subroutine compute_residual_velocity_2d(whichresid, parallel, & - uvel, vvel, & - usav, vsav, & + subroutine compute_residual_velocity_3d(whichresid, parallel, & + uvel, vvel, & + usav, vsav, & resid_velo) integer, intent(in) :: & - whichresid ! option for method to use when calculating residual + whichresid ! option for method to use when calculating residual type(parallel_type), intent(in) :: & parallel ! info for parallel communication - real(dp), dimension(:,:), intent(in) :: & - uvel, vvel, & ! current guess for velocity - usav, vsav ! previous guess for velocity + real(dp), dimension(:,:,:), intent(in) :: & + uvel, vvel, & ! current guess for velocity + usav, vsav ! previous guess for velocity real(dp), intent(out) :: & - resid_velo ! quantity related to velocity convergence + resid_velo ! quantity related to velocity convergence integer :: & - imaxdiff, jmaxdiff ! 
location of maximum speed difference - ! currently computed but not used + imaxdiff, jmaxdiff, kmaxdiff ! location of maximum speed difference + ! currently computed but not used - integer :: i, j, count + integer :: i, j, k, count real(dp) :: & - speed, & ! current guess for ice speed - oldspeed, & ! previous guess for ice speed - diffspeed ! abs(speed-oldspeed) + speed, & ! current guess for ice speed + oldspeed, & ! previous guess for ice speed + diffspeed ! abs(speed-oldspeed) integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -10276,6 +10266,7 @@ subroutine compute_residual_velocity_2d(whichresid, parallel, & staggered_jhi = parallel%staggered_jhi ! Compute a residual quantity based on convergence of the velocity field. + !TODO - Remove some of these velocity residual methods? They are rarely if ever used. ! options for residual calculation method, as specified in configuration file ! case(0): use max of abs( vel_old - vel ) / vel ) @@ -10286,6 +10277,7 @@ subroutine compute_residual_velocity_2d(whichresid, parallel, & resid_velo = 0.d0 imaxdiff = 0 jmaxdiff = 0 + kmaxdiff = 0 select case (whichresid) @@ -10295,16 +10287,19 @@ subroutine compute_residual_velocity_2d(whichresid, parallel, & do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) - if (speed /= 0.d0) then - oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - if (diffspeed > resid_velo) then - resid_velo = diffspeed - imaxdiff = i - jmaxdiff = j + do k = 1, size(uvel,1) - 1 ! 
ignore bed velocity + speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2) + if (speed /= 0.d0) then + oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + if (diffspeed > resid_velo) then + resid_velo = diffspeed + imaxdiff = i + jmaxdiff = j + kmaxdiff = k + endif endif - endif + enddo enddo enddo @@ -10319,13 +10314,15 @@ subroutine compute_residual_velocity_2d(whichresid, parallel, & do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) - if (speed /= 0.d0) then - count = count+1 - oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - resid_velo = resid_velo + diffspeed - endif + do k = 1, size(uvel,1) - 1 ! ignore bed velocity + speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2) + if (speed /= 0.d0) then + count = count+1 + oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + resid_velo = resid_velo + diffspeed + endif + enddo enddo enddo @@ -10336,30 +10333,96 @@ subroutine compute_residual_velocity_2d(whichresid, parallel, & call not_parallel(__FILE__, __LINE__) case default ! max speed difference, including basal speeds - ! (case HO_RESID_MAXU or HO_RESID_L2NORM) + ! (case HO_RESID_MAXU or HO_RESID_L2NORM or HO_RESID_L2NORM_RELATIVE) ! 
Loop over locally owned vertices do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) - if (speed /= 0.d0) then - oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - if (diffspeed > resid_velo) then - resid_velo = diffspeed - imaxdiff = i - jmaxdiff = j + do k = 1, size(uvel,1) + speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2) + if (speed /= 0.d0) then + oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + if (diffspeed > resid_velo) then + resid_velo = diffspeed + imaxdiff = i + jmaxdiff = j + kmaxdiff = k + endif endif - endif + enddo enddo enddo - resid_velo = parallel_reduce_max(resid_velo) - - end select + resid_velo = parallel_reduce_max(resid_velo) + + end select + + end subroutine compute_residual_velocity_3d + +!**************************************************************************** + + subroutine count_nonzeros_2d(nx, ny, & + parallel, & + Auu, Auv, & + Avu, Avv, & + active_vertex, & + nNonzeros) + + !---------------------------------------------------------------- + ! Input-output arguments + !---------------------------------------------------------------- + + integer, intent(in) :: & + nx, ny ! number of grid cells in each direction + + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & + Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts + Avu, Avv + + logical, dimension(nx-1,ny-1), intent(in) :: & + active_vertex ! true for vertices of active cells + + integer, intent(out) :: & + nNonzeros ! number of nonzero matrix elements + + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- + + integer :: i, j, m + + integer :: & + staggered_ilo, staggered_ihi, & ! 
bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + + nNonzeros = 0 + + do m = 1, nNodeNeighbors_2d + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + if (Auu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + if (Auv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + if (Avu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + if (Avv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + endif ! active_vertex + enddo ! i + enddo ! j + enddo ! m + + nNonzeros = parallel_reduce_sum(nNonzeros) - end subroutine compute_residual_velocity_2d + end subroutine count_nonzeros_2d !**************************************************************************** @@ -10431,69 +10494,6 @@ subroutine count_nonzeros_3d(nx, ny, nz, & end subroutine count_nonzeros_3d -!**************************************************************************** - - subroutine count_nonzeros_2d(nx, ny, & - parallel, & - Auu, Auv, & - Avu, Avv, & - active_vertex, & - nNonzeros) - - !---------------------------------------------------------------- - ! Input-output arguments - !---------------------------------------------------------------- - - integer, intent(in) :: & - nx, ny ! number of grid cells in each direction - - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication - - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & - Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv - - logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! true for vertices of active cells - - integer, intent(out) :: & - nNonzeros ! number of nonzero matrix elements - - !---------------------------------------------------------------- - ! 
Local variables - !---------------------------------------------------------------- - - integer :: i, j, m - - integer :: & - staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid - staggered_jlo, staggered_jhi - - staggered_ilo = parallel%staggered_ilo - staggered_ihi = parallel%staggered_ihi - staggered_jlo = parallel%staggered_jlo - staggered_jhi = parallel%staggered_jhi - - nNonzeros = 0 - - do m = 1, nNodeNeighbors_2d - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - if (Auu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - if (Auv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - if (Avu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - if (Avv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - endif ! active_vertex - enddo ! i - enddo ! j - enddo ! m - - nNonzeros = parallel_reduce_sum(nNonzeros) - - end subroutine count_nonzeros_2d - !**************************************************************************** subroutine check_symmetry_element_matrix(nNodesPerElement, & @@ -10533,31 +10533,237 @@ subroutine check_symmetry_element_matrix(nNodesPerElement, & enddo enddo - ! check that Kvv = (Kvv)^T + ! check that Kvv = (Kvv)^T + + do j = 1, nNodesPerElement + do i = j, nNodesPerElement + if (abs(Kvv(i,j) - Kvv(j,i)) > eps11) then + write(iulog,*) 'Kvv is not symmetric' + write(iulog,*) 'i, j, Kvv(i,j), Kvv(j,i):', i, j, Kvv(i,j), Kvv(j,i) + stop + endif + enddo + enddo + + ! 
Check that Kuv = (Kvu)^T + + do j = 1, nNodesPerElement + do i = 1, nNodesPerElement + if (abs(Kuv(i,j) - Kvu(j,i)) > eps11) then + write(iulog,*) 'Kuv /= (Kvu)^T' + write(iulog,*) 'i, j, Kuv(i,j), Kvu(j,i):', i, j, Kuv(i,j), Kvu(j,i) + stop + endif + enddo + enddo + + end subroutine check_symmetry_element_matrix + +!**************************************************************************** + + subroutine check_symmetry_assembled_matrix_2d(nx, ny, & + parallel, & + active_vertex, & + Auu, Auv, Avu, Avv) + + !------------------------------------------------------------------ + ! Check that the assembled stiffness matrix is symmetric. + ! This is true provided that (1) Auu = (Auu)^T + ! (2) Avv = (Avv)^T + ! (3) Auv = (Avu)^T + ! The A matrices are assembled in a dense fashion to save storage + ! and preserve the i/j/k structure of the grid. + ! + ! There can be small differences from perfect symmetry due to roundoff error. + ! These differences are fixed provided they are small enough. + !------------------------------------------------------------------ + + integer, intent(in) :: & + nx, ny ! horizontal grid dimensions + + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + + logical, dimension(nx-1,ny-1), intent(in) :: & + active_vertex ! T for columns (i,j) where velocity is computed, else F + + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + Auu, Auv, Avu, Avv ! components of assembled stiffness matrix + ! + ! Auu | Auv + ! _____|____ + ! | + ! Avu | Avv + + integer :: i, j, iA, jA, m, mm, iglobal, jglobal + + real(dp) :: val1, val2 ! values of matrix coefficients + + real(dp) :: maxdiff, global_maxdiff, diag_entry, avg_val + + integer :: rmax, imax, jmax, mmax + + integer :: & + staggered_ilo, staggered_ihi, & ! 
bounds of locally owned vertices on staggered grid
+ staggered_jlo, staggered_jhi
+
+ staggered_ilo = parallel%staggered_ilo
+ staggered_ihi = parallel%staggered_ihi
+ staggered_jlo = parallel%staggered_jlo
+ staggered_jhi = parallel%staggered_jhi
+
+ ! Check matrix for symmetry
+
+ ! Here we correct for small differences from symmetry due to roundoff error.
+ ! The maximum departure from symmetry is set to be a small fraction
+ ! of the diagonal entry for the row.
+ ! If the departure from symmetry is larger than this, then the model prints a warning
+ ! and/or aborts.
+
+ maxdiff = 0.d0
+ rmax = 0; imax = 0; jmax = 0; mmax = 0
+
+ ! Loop over locally owned vertices.
+ ! Each active vertex is associated with 2 matrix rows (one for u, one for v) belonging to this processor.
+
+ do jA = -1, 1
+ do iA = -1, 1
+ m = indxA_2d( iA, jA)
+ mm = indxA_2d(-iA,-jA)
+
+ do j = staggered_jlo, staggered_jhi
+ do i = staggered_ilo, staggered_ihi
+ if (active_vertex(i,j)) then
+
+ ! Check Auu and Auv for symmetry
+ diag_entry = Auu(i,j,indxA_2d(0,0))
+
+ !WHL - debug
+ if (diag_entry /= diag_entry) then
+ write(iulog,*) 'WARNING: Diagonal NaN: i, j =', i, j
+ call parallel_globalindex(i, j, iglobal, jglobal, parallel)
+ write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal
+!! stop
+ endif
+
+ ! Check that Auu = Auu^T
+ val1 = Auu(i, j, m ) ! value of Auu(row,col)
+ val2 = Auu(i+iA, j+jA, mm) ! value of Auu(col,row)
+ if (val2 /= val1) then
+ if (abs(val2 - val1) > maxdiff) then
+ maxdiff = abs(val2 - val1)
+ rmax = this_rank; imax = i; jmax = j; mmax = m
+ endif
+ ! if difference is small, then fix the asymmetry by averaging values
+ !WHL - Here and below, I commented out the code to average asymmetric values.
+ ! The hope is that the asymmetries are too small to matter.
+ ! else print a warning and abort
+ if ( abs(val2-val1) < eps11*abs(diag_entry) ) then
+! avg_val = 0.5d0 * (val1 + val2)
+! Auu(i, j, m ) = avg_val
+! 
Auu(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Auu is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Auu(row,col), Auu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif + endif ! val2 /= val1 + + ! Check that Auv = (Avu)^T + val1 = Auv(i, j, m ) ! value of Auv(row,col) + val2 = Avu(i+iA, j+jA, mm) ! value of Avu(col,row) + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Auv(i, j, m ) = avg_val +! Avu(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Auv is not equal to (Avu)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Auv(row,col), Avu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif + endif ! val2 /= val1 + + ! Now check Avu and Avv + diag_entry = Avv(i,j,indxA_2d(0,0)) + + ! check that Avv = (Avv)^T + val1 = Avv(i, j, m ) ! value of Avv(row,col) + val2 = Avv(i+iA, j+jA, mm) ! value of Avv(col,row) + + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Avv(i, j, m ) = avg_val +! 
Avv(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Avv is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Avv(row,col), Avv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif + + endif ! val2 /= val1 - do j = 1, nNodesPerElement - do i = j, nNodesPerElement - if (abs(Kvv(i,j) - Kvv(j,i)) > eps11) then - write(iulog,*) 'Kvv is not symmetric' - write(iulog,*) 'i, j, Kvv(i,j), Kvv(j,i):', i, j, Kvv(i,j), Kvv(j,i) - stop - endif - enddo - enddo + ! Check that Avu = (Auv)^T + val1 = Avu(i, j, m ) ! value of Avu(row,col) + val2 = Auv(i+iA, j+jA, mm) ! value of Auv(col,row) - ! Check that Kuv = (Kvu)^T + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Avu(i, j, m ) = avg_val +! Auv(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif - do j = 1, nNodesPerElement - do i = 1, nNodesPerElement - if (abs(Kuv(i,j) - Kvu(j,i)) > eps11) then - write(iulog,*) 'Kuv /= (Kvu)^T' - write(iulog,*) 'i, j, Kuv(i,j), Kvu(j,i):', i, j, Kuv(i,j), Kvu(j,i) - stop - endif - enddo - enddo + endif ! val2 /= val1 - end subroutine check_symmetry_element_matrix + endif ! active_vertex + enddo ! i + enddo ! j + enddo ! iA + enddo ! 
jA + + if (verbose_matrix) then + global_maxdiff = parallel_reduce_max(maxdiff) + if (global_maxdiff > 0.0d0 .and. maxdiff == global_maxdiff) then + ! maxdiff is on this processor; compute and broadcast the global index + call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) + write(iulog,*) 'Max asymmetry =', global_maxdiff + write(iulog,*) ' ig, jg, m =', iglobal, jglobal, mmax + endif + endif + + end subroutine check_symmetry_assembled_matrix_2d !**************************************************************************** @@ -10797,228 +11003,22 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & this_rank, i, j, k, iA, jA, kA write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - - endif ! val2 /= val1 - - endif ! k+kA in bounds - - enddo ! kA - enddo ! iA - enddo ! jA - - enddo ! k - endif ! active_vertex - enddo ! i - enddo ! j - - if (verbose_matrix) then - global_maxdiff = parallel_reduce_max(maxdiff) - if (global_maxdiff > 0.0d0 .and. maxdiff == global_maxdiff) then - ! maxdiff is on this processor; compute and broadcast the global index - call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) - write(iulog,*) 'Max asymmetry =', global_maxdiff - write(iulog,*) ' i, j, ig, jg, k, m =', imax, jmax, iglobal, jglobal, kmax, mmax - endif - endif - - end subroutine check_symmetry_assembled_matrix_3d - -!**************************************************************************** - - subroutine check_symmetry_assembled_matrix_2d(nx, ny, & - parallel, & - active_vertex, & - Auu, Auv, Avu, Avv) - - !------------------------------------------------------------------ - ! Check that the assembled stiffness matrix is symmetric. - ! This is true provided that (1) Auu = (Auu)^T - ! (2) Avv = (Avv)^T - ! (3) Auv = (Avu)^T - ! 
The A matrices are assembled in a dense fashion to save storage - ! and preserve the i/j/k structure of the grid. - ! - ! There can be small differences from perfect symmetry due to roundoff error. - ! These differences are fixed provided they are small enough. - !------------------------------------------------------------------ - - integer, intent(in) :: & - nx, ny ! horizontal grid dimensions - - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication - - logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! T for columns (i,j) where velocity is computed, else F - - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & - Auu, Auv, Avu, Avv ! components of assembled stiffness matrix - ! - ! Auu | Auv - ! _____|____ - ! | - ! Avu | Avv - - integer :: i, j, iA, jA, m, mm, iglobal, jglobal - - real(dp) :: val1, val2 ! values of matrix coefficients - - real(dp) :: maxdiff, global_maxdiff, diag_entry, avg_val - - integer :: rmax, imax, jmax, mmax - - integer :: & - staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid - staggered_jlo, staggered_jhi - - staggered_ilo = parallel%staggered_ilo - staggered_ihi = parallel%staggered_ihi - staggered_jlo = parallel%staggered_jlo - staggered_jhi = parallel%staggered_jhi - - ! Check matrix for symmetry - - ! Here we correct for small differences from symmetry due to roundoff error. - ! The maximum departure from symmetry is set to be a small fraction - ! of the diagonal entry for the row. - ! If the departure from symmetry is larger than this, then the model prints a warning - ! and/or aborts. - - maxdiff = 0.d0 - rmax = 0; imax = 0; jmax = 0; mmax = 0 - - ! Loop over locally owned vertices. - ! Each active vertex is associate with 2*nz matrix rows belonging to this processor. 
- - do jA = -1, 1 - do iA = -1, 1 - m = indxA_2d( iA, jA) - mm = indxA_2d(-iA,-jA) - - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - - ! Check Auu and Auv for symmetry - diag_entry = Auu(i,j,indxA_2d(0,0)) - - !WHL - debug - if (diag_entry /= diag_entry) then - write(iulog,*) 'WARNING: Diagonal NaN: i, j =', i, j - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - - ! Check that Auu = Auu^T - val1 = Auu(i, j, m ) ! value of Auu(row,col) - val2 = Auu(i+iA, j+jA, mm) ! value of Auu(col,row) - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) then - maxdiff = abs(val2 - val1) - rmax = this_rank; imax = i; jmax = j; mmax = m - endif - ! if difference is small, then fix the asymmetry by averaging values - !WHL - Here and below, I commented out the code to average asymmetric values. - ! The hope is that the asymmetries are too small to matter. - ! else print a warning and abort - if ( abs(val2-val1) < eps11*abs(diag_entry) ) then -! avg_val = 0.5d0 * (val1 + val2) -! Auu(i, j, m ) = avg_val -! Auu(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Auu is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Auu(row,col), Auu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - endif ! val2 /= val1 - - ! Check that Auv = (Avu)^T - val1 = Auv(i, j, m ) ! value of Auv(row,col) - val2 = Avu(i+iA, j+jA, mm) ! value of Avu(col,row) - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) then - maxdiff = abs(val2 - val1) - rmax = this_rank; imax = i; jmax = j; mmax = m - endif - ! if difference is small, then fix the asymmetry by averaging values - ! else print a warning and abort - if ( abs(val2-val1) < eps11*abs(diag_entry) ) then -! 
avg_val = 0.5d0 * (val1 + val2) -! Auv(i, j, m ) = avg_val -! Avu(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Auv is not equal to (Avu)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Auv(row,col), Avu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - endif ! val2 /= val1 - - ! Now check Avu and Avv - diag_entry = Avv(i,j,indxA_2d(0,0)) - - ! check that Avv = (Avv)^T - val1 = Avv(i, j, m ) ! value of Avv(row,col) - val2 = Avv(i+iA, j+jA, mm) ! value of Avv(col,row) - - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) then - maxdiff = abs(val2 - val1) - rmax = this_rank; imax = i; jmax = j; mmax = m - endif - ! if difference is small, then fix the asymmetry by averaging values - ! else print a warning and abort - if ( abs(val2-val1) < eps11*abs(diag_entry) ) then -! avg_val = 0.5d0 * (val1 + val2) -! Avv(i, j, m ) = avg_val -! Avv(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Avv is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Avv(row,col), Avv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - - endif ! val2 /= val1 + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif - ! Check that Avu = (Auv)^T - val1 = Avu(i, j, m ) ! value of Avu(row,col) - val2 = Auv(i+iA, j+jA, mm) ! value of Auv(col,row) + endif ! val2 /= val1 - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) then - maxdiff = abs(val2 - val1) - rmax = this_rank; imax = i; jmax = j; mmax = m - endif - ! if difference is small, then fix the asymmetry by averaging values - ! else print a warning and abort - if ( abs(val2-val1) < eps11*abs(diag_entry) ) then -! avg_val = 0.5d0 * (val1 + val2) -! 
Avu(i, j, m ) = avg_val -! Auv(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif + endif ! k+kA in bounds - endif ! val2 /= val1 + enddo ! kA + enddo ! iA + enddo ! jA - endif ! active_vertex - enddo ! i - enddo ! j - enddo ! iA - enddo ! jA + enddo ! k + endif ! active_vertex + enddo ! i + enddo ! j if (verbose_matrix) then global_maxdiff = parallel_reduce_max(maxdiff) @@ -11026,53 +11026,51 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & ! maxdiff is on this processor; compute and broadcast the global index call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) write(iulog,*) 'Max asymmetry =', global_maxdiff - write(iulog,*) ' ig, jg, m =', iglobal, jglobal, mmax + write(iulog,*) ' i, j, ig, jg, k, m =', imax, jmax, iglobal, jglobal, kmax, mmax endif endif - end subroutine check_symmetry_assembled_matrix_2d + end subroutine check_symmetry_assembled_matrix_3d !**************************************************************************** - subroutine write_matrix_elements_3d(nx, ny, nz, & - nNodesSolve, nodeID, & - iNodeIndex, jNodeIndex, & - kNodeIndex, & - Auu, Auv, & - Avu, Avv, & - bu, bv) + subroutine write_matrix_elements_2d(nx, ny, & + nVerticesSolve, vertexID, & + iVertexIndex, jVertexIndex, & + Auu, Auv, & + Avu, Avv, & + bu, bv) ! Write matrix elements to text files. ! Note: Does not work when running on more than one task. integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions - nz, & ! number of vertical levels at which velocity is computed - nNodesSolve ! number of nodes where we solve for velocity + nVerticesSolve ! 
number of vertices where we solve for velocity - integer, dimension(nz,nx-1,ny-1), intent(in) :: & - nodeID ! ID for each node + integer, dimension(nx-1,ny-1), intent(in) :: & + vertexID ! ID for each vertex integer, dimension(:), intent(in) :: & - iNodeIndex, jNodeIndex, kNodeIndex ! i, j and k indices of active nodes + iVertexIndex, jVertexIndex ! i and j indices of active vertices - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv ! 1st dimension = node and its nearest neighbors in x, y and z direction - ! other dimensions = (k,i,j) indices + Avu, Avv ! 1st dimension = vertex and its nearest neighbors in x and y direction + ! other dimensions = (i,j) indices - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts ! Local variables integer :: rowA, colA - integer :: i, j, k, m, iA, jA, kA + integer :: i, j, m, iA, jA - real(dp), dimension(nNodesSolve, nNodesSolve) :: & + real(dp), dimension(nVerticesSolve, nVerticesSolve) :: & Auu_val, Auv_val, Avu_val, Avv_val ! dense matrices - real(dp), dimension(nNodesSolve) :: nonzeros + real(dp), dimension(nVerticesSolve) :: nonzeros if (tasks > 1) then call write_log('Error: Cannot write matrix elements to files when tasks > 1', GM_FATAL) @@ -11083,35 +11081,29 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & Avu_val(:,:) = 0.d0 Avv_val(:,:) = 0.d0 - do rowA = 1, nNodesSolve - - i = iNodeIndex(rowA) - j = jNodeIndex(rowA) - k = kNodeIndex(rowA) + do rowA = 1, nVerticesSolve - do kA = -1, 1 + i = iVertexIndex(rowA) + j = jVertexIndex(rowA) do jA = -1, 1 do iA = -1, 1 - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & + if ( (i+iA >= 1 .and. i+iA <= nx-1) & .and. & (j+jA >= 1 .and. 
j+jA <= ny-1) ) then - colA = nodeID(k+kA, i+iA, j+jA) ! ID for neighboring node - m = indxA_3d(iA,jA,kA) + colA = vertexID(i+iA, j+jA) ! ID for neighboring vertex + m = indxA_2d(iA,jA) if (colA > 0) then - Auu_val(rowA, colA) = Auu(m,k,i,j) - Auv_val(rowA, colA) = Auv(m,k,i,j) - Avu_val(rowA, colA) = Avu(m,k,i,j) - Avv_val(rowA, colA) = Avv(m,k,i,j) + Auu_val(rowA, colA) = Auu(i,j,m) + Auv_val(rowA, colA) = Auv(i,j,m) + Avu_val(rowA, colA) = Avu(i,j,m) + Avv_val(rowA, colA) = Avv(i,j,m) endif - endif ! i+iA, j+jA, and k+kA in bounds + endif ! i+iA and j+jA in bounds - enddo ! kA enddo ! iA enddo ! jA @@ -11120,9 +11112,9 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & !WHL - bug check write(iulog,*) ' ' write(iulog,*) 'nonzeros per row:' - do rowA = 1, nNodesSolve + do rowA = 1, nVerticesSolve nonzeros(rowA) = 0 - do colA = 1, nNodesSolve + do colA = 1, nVerticesSolve if (abs(Auu_val(rowA,colA)) > 1.d-11) then nonzeros(rowA) = nonzeros(rowA) + 1 endif @@ -11139,12 +11131,12 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & open(unit=12, file='Avu.'//matrix_label, status='unknown') open(unit=13, file='Avv.'//matrix_label, status='unknown') - do rowA = 1, nNodesSolve + do rowA = 1, nVerticesSolve write(10,'(i6)',advance='no') rowA write(11,'(i6)',advance='no') rowA write(12,'(i6)',advance='no') rowA write(13,'(i6)',advance='no') rowA - do colA = 1, nNodesSolve + do colA = 1, nVerticesSolve write(10,'(e16.8)',advance='no') Auu_val(rowA,colA) write(11,'(e16.8)',advance='no') Auv_val(rowA,colA) write(12,'(e16.8)',advance='no') Avu_val(rowA,colA) @@ -11166,57 +11158,58 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & ! 
write load vectors to file open(unit=14, file='bu.'//matrix_label, status='unknown') open(unit=15, file='bv.'//matrix_label, status='unknown') - do rowA = 1, nNodesSolve - i = iNodeIndex(rowA) - j = jNodeIndex(rowA) - k = kNodeIndex(rowA) - write(14,'(i6, e16.8)') rowA, bu(k,i,j) - write(15,'(i6, e16.8)') rowA, bv(k,i,j) + do rowA = 1, nVerticesSolve + i = iVertexIndex(rowA) + j = jVertexIndex(rowA) + write(14,'(i6, e16.8)') rowA, bu(i,j) + write(15,'(i6, e16.8)') rowA, bv(i,j) enddo close(14) close(15) - end subroutine write_matrix_elements_3d - + end subroutine write_matrix_elements_2d + !**************************************************************************** - subroutine write_matrix_elements_2d(nx, ny, & - nVerticesSolve, vertexID, & - iVertexIndex, jVertexIndex, & - Auu, Auv, & - Avu, Avv, & - bu, bv) + subroutine write_matrix_elements_3d(nx, ny, nz, & + nNodesSolve, nodeID, & + iNodeIndex, jNodeIndex, & + kNodeIndex, & + Auu, Auv, & + Avu, Avv, & + bu, bv) ! Write matrix elements to text files. ! Note: Does not work when running on more than one task. integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions - nVerticesSolve ! number of vertices where we solve for velocity + nz, & ! number of vertical levels at which velocity is computed + nNodesSolve ! number of nodes where we solve for velocity - integer, dimension(nx-1,ny-1), intent(in) :: & - vertexID ! ID for each vertex + integer, dimension(nz,nx-1,ny-1), intent(in) :: & + nodeID ! ID for each node integer, dimension(:), intent(in) :: & - iVertexIndex, jVertexIndex ! i and j indices of active vertices + iNodeIndex, jNodeIndex, kNodeIndex ! i, j and k indices of active nodes - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv ! 1st dimension = vertex and its nearest neighbors in x and y direction - ! 
other dimensions = (i,j) indices + Avu, Avv ! 1st dimension = node and its nearest neighbors in x, y and z direction + ! other dimensions = (k,i,j) indices - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts ! Local variables integer :: rowA, colA - integer :: i, j, m, iA, jA + integer :: i, j, k, m, iA, jA, kA - real(dp), dimension(nVerticesSolve, nVerticesSolve) :: & + real(dp), dimension(nNodesSolve, nNodesSolve) :: & Auu_val, Auv_val, Avu_val, Avv_val ! dense matrices - real(dp), dimension(nVerticesSolve) :: nonzeros + real(dp), dimension(nNodesSolve) :: nonzeros if (tasks > 1) then call write_log('Error: Cannot write matrix elements to files when tasks > 1', GM_FATAL) @@ -11227,29 +11220,35 @@ subroutine write_matrix_elements_2d(nx, ny, & Avu_val(:,:) = 0.d0 Avv_val(:,:) = 0.d0 - do rowA = 1, nVerticesSolve + do rowA = 1, nNodesSolve - i = iVertexIndex(rowA) - j = jVertexIndex(rowA) + i = iNodeIndex(rowA) + j = jNodeIndex(rowA) + k = kNodeIndex(rowA) + + do kA = -1, 1 do jA = -1, 1 do iA = -1, 1 - if ( (i+iA >= 1 .and. i+iA <= nx-1) & + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & .and. & (j+jA >= 1 .and. j+jA <= ny-1) ) then - colA = vertexID(i+iA, j+jA) ! ID for neighboring vertex - m = indxA_2d(iA,jA) + colA = nodeID(k+kA, i+iA, j+jA) ! ID for neighboring node + m = indxA_3d(iA,jA,kA) if (colA > 0) then - Auu_val(rowA, colA) = Auu(i,j,m) - Auv_val(rowA, colA) = Auv(i,j,m) - Avu_val(rowA, colA) = Avu(i,j,m) - Avv_val(rowA, colA) = Avv(i,j,m) + Auu_val(rowA, colA) = Auu(m,k,i,j) + Auv_val(rowA, colA) = Auv(m,k,i,j) + Avu_val(rowA, colA) = Avu(m,k,i,j) + Avv_val(rowA, colA) = Avv(m,k,i,j) endif - endif ! i+iA and j+jA in bounds + endif ! i+iA, j+jA, and k+kA in bounds + enddo ! kA enddo ! iA enddo ! 
jA @@ -11258,9 +11257,9 @@ subroutine write_matrix_elements_2d(nx, ny, & !WHL - bug check write(iulog,*) ' ' write(iulog,*) 'nonzeros per row:' - do rowA = 1, nVerticesSolve + do rowA = 1, nNodesSolve nonzeros(rowA) = 0 - do colA = 1, nVerticesSolve + do colA = 1, nNodesSolve if (abs(Auu_val(rowA,colA)) > 1.d-11) then nonzeros(rowA) = nonzeros(rowA) + 1 endif @@ -11277,12 +11276,12 @@ subroutine write_matrix_elements_2d(nx, ny, & open(unit=12, file='Avu.'//matrix_label, status='unknown') open(unit=13, file='Avv.'//matrix_label, status='unknown') - do rowA = 1, nVerticesSolve + do rowA = 1, nNodesSolve write(10,'(i6)',advance='no') rowA write(11,'(i6)',advance='no') rowA write(12,'(i6)',advance='no') rowA write(13,'(i6)',advance='no') rowA - do colA = 1, nVerticesSolve + do colA = 1, nNodesSolve write(10,'(e16.8)',advance='no') Auu_val(rowA,colA) write(11,'(e16.8)',advance='no') Auv_val(rowA,colA) write(12,'(e16.8)',advance='no') Avu_val(rowA,colA) @@ -11304,100 +11303,18 @@ subroutine write_matrix_elements_2d(nx, ny, & ! write load vectors to file open(unit=14, file='bu.'//matrix_label, status='unknown') open(unit=15, file='bv.'//matrix_label, status='unknown') - do rowA = 1, nVerticesSolve - i = iVertexIndex(rowA) - j = jVertexIndex(rowA) - write(14,'(i6, e16.8)') rowA, bu(i,j) - write(15,'(i6, e16.8)') rowA, bv(i,j) + do rowA = 1, nNodesSolve + i = iNodeIndex(rowA) + j = jNodeIndex(rowA) + k = kNodeIndex(rowA) + write(14,'(i6, e16.8)') rowA, bu(k,i,j) + write(15,'(i6, e16.8)') rowA, bv(k,i,j) enddo close(14) close(15) - end subroutine write_matrix_elements_2d - -!**************************************************************************** - !TODO - Either delete this subroutine, or switch the indices. Not currently used. - subroutine compress_3d_to_2d(nx, ny, nz, & - Auu, Auv, & - Avu, Avv, & - bu, bv, & - Auu_2d, Auv_2d, & - Avu_2d, Avv_2d, & - bu_2d, bv_2d) - - !---------------------------------------------------------------- - ! 
Form the 2D matrix and rhs by combining terms from the 3D matrix and rhs. - ! This combination is based on the assumption of no vertical shear; - ! i.e., uvel and vvel have the same value at each level in a given column. - !---------------------------------------------------------------- - - !---------------------------------------------------------------- - ! Input-output arguments - !---------------------------------------------------------------- - - integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions - nz ! number of vertical levels where velocity is computed - - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & - Auu, Auv, & ! assembled 3D stiffness matrix, divided into 4 parts - Avu, Avv - - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & - bu, bv ! assembled 3D rhs vector, divided into 2 parts - - real(dp), dimension(nNodeNeighbors_2d,nx-1,ny-1), intent(out) :: & - Auu_2d, Auv_2d, &! assembled 2D (SSA) stiffness matrix, divided into 4 parts - Avu_2d, Avv_2d - - real(dp), dimension(nx-1,ny-1), intent(out) :: & - bu_2d, bv_2d ! assembled 2D (SSA) rhs vector, divided into 2 parts - - !---------------------------------------------------------------- - ! Local variables - !---------------------------------------------------------------- - - integer :: i, j, k, iA, jA, kA, m, m2 - - ! Initialize 2D matrix and rhs - - Auu_2d(:,:,:) = 0.d0 - Auv_2d(:,:,:) = 0.d0 - Avu_2d(:,:,:) = 0.d0 - Avv_2d(:,:,:) = 0.d0 - bu_2d(:,:) = 0.d0 - bv_2d(:,:) = 0.d0 - - ! Form 2D matrix and rhs - - do j = 1, ny-1 - do i = 1, nx-1 - do k = 1, nz - - ! matrix - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 - m = indxA_3d(iA,jA,kA) - m2 = indxA_2d(iA,jA) - Auu_2d(m2,i,j) = Auu_2d(m2,i,j) + Auu(m,k,i,j) - Auv_2d(m2,i,j) = Auv_2d(m2,i,j) + Auv(m,k,i,j) - Avu_2d(m2,i,j) = Avu_2d(m2,i,j) + Avu(m,k,i,j) - Avv_2d(m2,i,j) = Avv_2d(m2,i,j) + Avv(m,k,i,j) - enddo ! iA - enddo ! jA - enddo ! kA - - ! 
rhs - bu_2d(i,j) = bu_2d(i,j) + bu(k,i,j) - bv_2d(i,j) = bv_2d(i,j) + bv(k,i,j) - - enddo ! k - enddo ! i - enddo ! j - - end subroutine compress_3d_to_2d - + end subroutine write_matrix_elements_3d + !**************************************************************************** end module glissade_velo_higher From 40718e1997fd2d4ce6ff54be98fdc8a8968b34e8 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Fri, 23 Jan 2026 18:25:27 -0700 Subject: [PATCH 18/21] Removed some verbose diagnostics --- libglissade/glissade_velo_higher.F90 | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 66074862..a4e7b86b 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -227,8 +227,8 @@ module glissade_velo_higher ! logical :: verbose_bfric = .true. logical :: verbose_trilinos = .false. ! logical :: verbose_trilinos = .true. -! logical :: verbose_beta = .false. - logical :: verbose_beta = .true. + logical :: verbose_beta = .false. +! logical :: verbose_beta = .true. logical :: verbose_efvs = .false. ! logical :: verbose_efvs = .true. logical :: verbose_tau = .false. @@ -6023,11 +6023,6 @@ subroutine compute_3d_velocity_L1L2(nx, ny, & uvel(1:nz-1,:,:) = 0.d0 vvel(1:nz-1,:,:) = 0.d0 - !WHL - debug - call point_diag(xVertex, 'xVertex', itest, jtest, rtest, 7, 7) - call point_diag(yVertex, 'yVertex', itest, jtest, rtest, 7, 7) - call point_diag(active_cell, 'active_cell', itest, jtest, rtest, 7, 7) - ! Compute viscosity integral and strain rates in elements. ! Loop over all cells that border locally owned vertices. From 69a919998759a116b8589e518832d17643f0247a Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Mon, 2 Feb 2026 17:01:27 -0700 Subject: [PATCH 19/21] Fixed a bug in the Picard acceleration Several years also I added the option which_ho_nonlinear for Picard acceleration. 
The basic idea is that after each velocity solve, CISM computes a vector equal to the difference between the new solution and the old solution. The model then tests whether extending that vector (i.e., multiplying it by a scalar > 1) reduces the residual A*x - B. If so, the vector is extended. The logic is such that if an extended solution vector fails to reduce the residual, we back off to the latest attempt that succeeded in reducing the residual. In other words, we discard the trial matrix arrays Auu, Avv, etc., and go back to the saved arrays Auu_sav, Avv_sav. In addition to the matrix arrays, several other arrays must be saved and restored. When investigating the slow convergence for certain grid cells, I realized that for a DIVA solve, some additional arrays need to be saved: beta_eff_x, beta_eff_y, and omega_k. Using the (bad) trial values instead of reverting to the (good) saved values means that the subsequent DIVA 3d velocity integration is not quite correct. This commit saves the additional arrays, which significantly improves convergence of the nonlinear solver for Antarctic runs. I tested the changes for a new modified Schoof sliding law. Before this bug fix, the solver often failed to converge in 50 or more iterations, even after spinning up for several thousand years. With the bug fix, the solver now converges in ~20 nonlinear iterations. Note: This commit is answer-changing for DIVA runs with Picard acceleration. --- libglissade/glissade_velo_higher.F90 | 240 +++++++++++++++------------ 1 file changed, 132 insertions(+), 108 deletions(-) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index a4e7b86b..1ea5b2eb 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -227,8 +227,8 @@ module glissade_velo_higher ! logical :: verbose_bfric = .true. logical :: verbose_trilinos = .false. ! logical :: verbose_trilinos = .true. - logical :: verbose_beta = .false. -
logical :: verbose_beta = .true. +! logical :: verbose_beta = .false. + logical :: verbose_beta = .true. logical :: verbose_efvs = .false. ! logical :: verbose_efvs = .true. logical :: verbose_tau = .false. @@ -1058,11 +1058,13 @@ subroutine glissade_velo_higher_solve(model, & uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration duvel_2d, dvvel_2d, & ! difference between current and previous velocity solutions uvel_2d_sav, vvel_2d_sav, & ! current best value for velocity solution (smallest residual) - beta_internal_sav ! beta_internal associated with saved velocity + beta_internal_sav, & ! beta_internal associated with saved velocity + beta_eff_x_sav, beta_eff_y_sav real(dp), dimension(:,:,:), allocatable :: & Auu_2d_sav, Auv_2d_sav, & ! assembled matrices associated with (uvel_2d_sav, vvel_2d_sav) - Avu_2d_sav, Avv_2d_sav + Avu_2d_sav, Avv_2d_sav, & + omega_k_sav ! for an accelerated 3D solve: real(dp), dimension(:,:,:), allocatable :: & @@ -1288,7 +1290,15 @@ subroutine glissade_velo_higher_solve(model, & allocate(vsav_2d(nx-1,ny-1)) allocate(resid_u_2d(nx-1,ny-1)) allocate(resid_v_2d(nx-1,ny-1)) - if (accel_picard) then + else ! 
3d + allocate(Auu(nNodeNeighbors_3d,nz,nx-1,ny-1)) + allocate(Auv(nNodeNeighbors_3d,nz,nx-1,ny-1)) + allocate(Avu(nNodeNeighbors_3d,nz,nx-1,ny-1)) + allocate(Avv(nNodeNeighbors_3d,nz,nx-1,ny-1)) + endif + + if (accel_picard) then + if (solve_2d) then allocate(uvel_2d_old(nx-1,ny-1)) allocate(vvel_2d_old(nx-1,ny-1)) allocate(duvel_2d(nx-1,ny-1)) @@ -1299,14 +1309,7 @@ subroutine glissade_velo_higher_solve(model, & allocate(Auv_2d_sav(nx-1,ny-1,nNodeNeighbors_2d)) allocate(Avu_2d_sav(nx-1,ny-1,nNodeNeighbors_2d)) allocate(Avv_2d_sav(nx-1,ny-1,nNodeNeighbors_2d)) - allocate(beta_internal_sav(nx-1,ny-1)) - endif - else - allocate(Auu(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(Auv(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(Avu(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(Avv(nNodeNeighbors_3d,nz,nx-1,ny-1)) - if (accel_picard) then + else ! 3d allocate(uvel_old(nz,nx-1,ny-1)) allocate(vvel_old(nz,nx-1,ny-1)) allocate(duvel(nz,nx-1,ny-1)) @@ -1317,9 +1320,13 @@ subroutine glissade_velo_higher_solve(model, & allocate(Auv_sav(nNodeNeighbors_3d,nz,nx-1,ny-1)) allocate(Avu_sav(nNodeNeighbors_3d,nz,nx-1,ny-1)) allocate(Avv_sav(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(beta_internal_sav(nx-1,ny-1)) endif - endif + allocate(beta_internal_sav(nx-1,ny-1)) + !Note: The next three are used only for DIVA, but it's simpler to allocate them regardless + allocate(beta_eff_x_sav(nx-1,ny-1)) + allocate(beta_eff_y_sav(nx-1,ny-1)) + allocate(omega_k_sav(nz,nx,ny)) + endif ! accel_picard if (whichapprox == HO_APPROX_DIVA) then !! call parallel_halo(efvs, parallel) ! 
efvs halo update is in glissade_diagnostic_variable_solve @@ -2203,8 +2210,7 @@ subroutine glissade_velo_higher_solve(model, & itest, jtest, rtest, & active_vertex, diva_level_index, & ice_plus_land_mask, & - stag_omega, omega_k, & - beta_internal, & + omega_k, beta_internal, & beta_eff_x, beta_eff_y, & stag_theta_slope_x, stag_theta_slope_y, & stag_diva_slope_factor_x, & @@ -3016,6 +3022,7 @@ subroutine glissade_velo_higher_solve(model, & if (solve_2d) then + call t_startf('glissade_resid_vec') call compute_residual_vector_2d(nx, ny, & parallel, & @@ -3032,41 +3039,40 @@ subroutine glissade_velo_higher_solve(model, & call t_startf('glissade_accel_picard') if (accel_picard) then - if (verbose_picard) then - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'Saved L2 norm, new L2 norm:', L2_norm_alpha_sav, L2_norm - endif - call point_diag(resid_u_2d, 'resid_u_2d', itest, jtest, rtest, 7, 7, '(e10.3)') - call point_diag(uvel_2d, 'uvel_2d', itest, jtest, rtest, 7, 7, '(f10.3)') + if (verbose_picard .and. 
this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'Saved L2 norm, new L2 norm:', L2_norm_alpha_sav, L2_norm endif if (counter >= 2) then - call evaluate_accelerated_picard_2d(nx, ny, & - L2_norm, L2_norm_large, & - L2_norm_alpha_sav, & - alpha_accel, alpha_accel_max, & - gamma_accel, resid_reduction_threshold, & - uvel_2d, vvel_2d, & - Auu_2d, Auv_2d, & - Avu_2d, Avv_2d, & - uvel_2d_old, vvel_2d_old, & - duvel_2d, dvvel_2d, & - uvel_2d_sav, vvel_2d_sav, & - Auu_2d_sav, Auv_2d_sav, & - Avu_2d_sav, Avv_2d_sav, & - beta_internal, beta_internal_sav, & - assembly_is_done) + call evaluate_accelerated_picard_2d(& + whichapprox, rtest, & + L2_norm, L2_norm_large, & + L2_norm_alpha_sav, & + alpha_accel, alpha_accel_max, & + gamma_accel, resid_reduction_threshold, & + uvel_2d, vvel_2d, & + Auu_2d, Auv_2d, & + Avu_2d, Avv_2d, & + uvel_2d_old, vvel_2d_old, & + duvel_2d, dvvel_2d, & + uvel_2d_sav, vvel_2d_sav, & + Auu_2d_sav, Auv_2d_sav, & + Avu_2d_sav, Avv_2d_sav, & + beta_internal, beta_internal_sav, & + beta_eff_x, beta_eff_x_sav, & + beta_eff_y, beta_eff_y_sav, & + omega_k, omega_k_sav, & + assembly_is_done) else ! counter = 1 ! proceed to the matrix solution assembly_is_done = .true. - if (verbose_picard .and. main_task) then - write(iulog,*) 'nonlinear counter = 1; continue to matrix solver' - endif + if (verbose_picard .and. this_rank == rtest) & + write(iulog,*) 'nonlinear counter = 1; continue to matrix solver' endif ! 
counter >= 2 @@ -3107,20 +3113,22 @@ subroutine glissade_velo_higher_solve(model, & if (counter >= 2) then - call evaluate_accelerated_picard_3d(L2_norm, L2_norm_large, & - L2_norm_alpha_sav, & - alpha_accel, alpha_accel_max, & - gamma_accel, resid_reduction_threshold, & - uvel, vvel, & - Auu, Auv, & - Avu, Avv, & - uvel_old, vvel_old, & - duvel, dvvel, & - uvel_sav, vvel_sav, & - Auu_sav, Auv_sav, & - Avu_sav, Avv_sav, & - beta_internal, beta_internal_sav, & - assembly_is_done) + call evaluate_accelerated_picard_3d(& + rtest, & + L2_norm, L2_norm_large, & + L2_norm_alpha_sav, & + alpha_accel, alpha_accel_max, & + gamma_accel, resid_reduction_threshold, & + uvel, vvel, & + Auu, Auv, & + Avu, Avv, & + uvel_old, vvel_old, & + duvel, dvvel, & + uvel_sav, vvel_sav, & + Auu_sav, Auv_sav, & + Avu_sav, Avv_sav, & + beta_internal, beta_internal_sav, & + assembly_is_done) else ! counter = 1 @@ -3545,8 +3553,7 @@ subroutine glissade_velo_higher_solve(model, & itest, jtest, rtest, & active_vertex, diva_level_index, & ice_plus_land_mask, & - stag_omega, omega_k, & - beta_internal, & + omega_k, beta_internal, & beta_eff_x, beta_eff_y, & stag_theta_slope_x, stag_theta_slope_y, & stag_diva_slope_factor_x, & @@ -3840,6 +3847,8 @@ subroutine glissade_velo_higher_solve(model, & deallocate(uvel_2d_sav, vvel_2d_sav) deallocate(Auu_2d_sav, Auv_2d_sav, Avu_2d_sav, Avv_2d_sav) deallocate(beta_internal_sav) + deallocate(beta_eff_x_sav, beta_eff_y_sav) + deallocate(omega_k_sav) endif else deallocate(Auu, Auv, Avu, Avv) @@ -5717,8 +5726,7 @@ subroutine compute_3d_velocity_diva(& itest, jtest, rtest, & active_vertex, diva_level_index, & ice_plus_land_mask, & - stag_omega, omega_k, & - beta, & + omega_k, beta, & beta_eff_x, beta_eff_y, & stag_theta_slope_x, stag_theta_slope_y, & stag_diva_slope_factor_x, & @@ -5768,9 +5776,6 @@ subroutine compute_3d_velocity_diva(& stag_theta_slope_y, & ! slope angle (radians) in y direction at vertices stag_diva_slope_factor_x, & ! 
slope correction factor in x direction stag_diva_slope_factor_y, & ! slope correction factor in y direction - stag_omega, & ! double integral, defined by Goldberg eq. 35 (m^2/(Pa yr)) - ! already interpolated to staggered grid - ! Note: omega here = Goldberg's omega/H uvel_2d, vvel_2d ! depth-integrated mean velocity; solution of 2D velocity solve (m/yr) real(dp), dimension(nx-1,ny-1), intent(out) :: & @@ -5787,12 +5792,6 @@ subroutine compute_3d_velocity_diva(& stag_omega_k ! single integral, defined by Goldberg eq. 32 (m^2/(Pa yr)) ! interpolated to staggered grid - real(dp), dimension(nx-1,ny-1) :: & - stag_integral ! integral that relates bed velocity to uvel_2d and vvel_2d - ! = stag_omega for diva_level_index = 0 - ! = stag_omega_k(k,:,:) for other values of diva_level_index - - real(dp) :: & slope_correction_x, & ! slope-based correction for vertical shear in x direction slope_correction_y ! slope-based correction for vertical shear in y direction @@ -5812,15 +5811,6 @@ subroutine compute_3d_velocity_diva(& stagger_margin_in = 1) enddo - ! Identify the appropriate integral for relating uvel_2d/vvel_2d to the bed velocity - - if (diva_level_index == 0) then ! solved for mean velocity - stag_integral(:,:) = stag_omega(:,:) - else - k = diva_level_index - stag_integral(:,:) = stag_omega_k(k,:,:) - endif - !---------------------------------------------------------------- ! Compute the 3D velocity field !---------------------------------------------------------------- @@ -7967,7 +7957,7 @@ subroutine compute_element_matrix(whichapprox, nNodesPerElement, & !---------------------------------------------------------------- integer, intent(in) :: & - whichapprox ! which Stokes approximation to use (BP, SIA, SSA) + whichapprox ! which Stokes approximation to use (SIA, SSA, DIVA, L1L2, BP) integer, intent(in) :: nNodesPerElement ! 
number of nodes per element @@ -9576,9 +9566,11 @@ subroutine compute_residual_vector_2d(nx, ny, & if (this_rank==rtest) then i = itest j = jtest - write(iulog,*) 'In compute_residual_vector_2d: task, i, j =', this_rank, i, j - write(iulog, '(a16, 2f13.7, 2e13.5)') & - ' u, v, ru, rv: ', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j) + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' ' + write(iulog,*) 'In compute_residual_vector_2d: test ig, jg =', i, j + write(iulog, '(a15, 2f12.5, 2e13.5)') & + ' u, v, ru, rv:', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j) endif ! Compute max value of (squared) residual on this task. @@ -9592,7 +9584,7 @@ subroutine compute_residual_vector_2d(nx, ny, & do i = staggered_ilo, staggered_ihi if (abs((resid_sq(i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, rmax:', & + write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, global rmax:', & iglobal, jglobal, resid_u(i,j), resid_v(i,j), sqrt(global_max_resid) write(iulog,*) ' ' endif @@ -9839,7 +9831,7 @@ end subroutine compute_residual_vector_3d !**************************************************************************** subroutine evaluate_accelerated_picard_2d(& - nx, ny, & + whichapprox, rtest, & L2_norm, L2_norm_large, & L2_norm_alpha_sav, & alpha_accel, alpha_accel_max, & @@ -9853,10 +9845,14 @@ subroutine evaluate_accelerated_picard_2d(& Auu_2d_sav, Auv_2d_sav, & Avu_2d_sav, Avv_2d_sav, & beta_internal, beta_internal_sav, & + beta_eff_x, beta_eff_x_sav, & + beta_eff_y, beta_eff_y_sav, & + omega_k, omega_k_sav, & assembly_is_done) integer, intent(in) :: & - nx, ny ! number of grid cells in each direction + rtest, & ! rank for diagnostic point + whichapprox ! which Stokes approximation to use (SIA, SSA, DIVA, L1L2, BP) real(dp), intent(in) :: & L2_norm, & ! 
latest value of L2 norm of residual @@ -9869,7 +9865,7 @@ subroutine evaluate_accelerated_picard_2d(& alpha_accel, & ! factor for extending the vector (duvel, dvvel) to reduce the residual L2_norm_alpha_sav ! value of L2 norm of residual, given the previous alpha_accel - real(dp), dimension(nx-1,ny-1), intent(inout) :: & + real(dp), dimension(:,:), intent(inout) :: & uvel_2d, vvel_2d, & ! latest guess for the velocity solution uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration duvel_2d, dvvel_2d, & ! difference between old velocity solution and latest solution @@ -9877,7 +9873,14 @@ subroutine evaluate_accelerated_picard_2d(& beta_internal, & ! beta_internal as a function of uvel_2d and vvel_2d beta_internal_sav ! beta_internal as a function of uvel_2d_sav and vvel_2d_sav - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + real(dp), dimension(:,:), intent(inout) :: & + beta_eff_x, beta_eff_x_sav, & + beta_eff_y, beta_eff_y_sav + + real(dp), dimension(:,:,:), intent(inout) :: & + omega_k, omega_k_sav + + real(dp), dimension(:,:,:), intent(inout) :: & Auu_2d, Auv_2d, & ! latest assembled matrices as a function of uvel_2d and vvel_2d Avu_2d, Avv_2d, & Auu_2d_sav, Auv_2d_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav @@ -9886,6 +9889,10 @@ subroutine evaluate_accelerated_picard_2d(& logical, intent(inout) :: & assembly_is_done ! if true, then accept the current assembled matrices and proceed to solution + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Evaluate Picard acceleration: L2_norm_sav, L2_norm:', L2_norm_alpha_sav, L2_norm + endif + if (L2_norm < resid_reduction_threshold*L2_norm_alpha_sav .and. & alpha_accel + gamma_accel <= alpha_accel_max) then @@ -9893,7 +9900,6 @@ subroutine evaluate_accelerated_picard_2d(& ! ("Substantially" is defined by the factor resid_reduction_threshold < 1.) ! 
Save the latest values of the solver inputs - uvel_2d_sav = uvel_2d vvel_2d_sav = vvel_2d Auu_2d_sav = Auu_2d @@ -9901,28 +9907,29 @@ subroutine evaluate_accelerated_picard_2d(& Avu_2d_sav = Avu_2d Avv_2d_sav = Avv_2d beta_internal_sav = beta_internal + if (whichapprox == HO_APPROX_DIVA) then + beta_eff_x_sav = beta_eff_x + beta_eff_y_sav = beta_eff_y + omega_k_sav = omega_k + endif ! Increase alpha_accel and see if the residual keeps getting smaller. ! If not, we will back off to the saved values above. alpha_accel = alpha_accel + gamma_accel L2_norm_alpha_sav = L2_norm - if (verbose_picard .and. main_task) then - write(iulog,*) 'Keep going, alpha =', alpha_accel + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Increase alpha to', alpha_accel endif ! Since assembly_is_done = F, we now return to the start of the loop: - ! do while (.not.assembly_is_done) + ! do while (.not.assembly_is_done) elseif (L2_norm < L2_norm_alpha_sav) then ! The residual norm decreased only a little (or we have reached alpha_accel_max). ! Call it good and move on to the solver. - if (verbose_picard .and. main_task) then - write(iulog,*) 'Hold, alpha =', alpha_accel - endif - ! Save this velocity as the starting point for the next nonlinear iteration uvel_2d_old = uvel_2d vvel_2d_old = vvel_2d @@ -9934,8 +9941,12 @@ subroutine evaluate_accelerated_picard_2d(& ! proceed to the matrix solution assembly_is_done = .true. - else + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Hold alpha at', alpha_accel + write(iulog,*) ' Continue to matrix solver' + endif + else ! The residual is larger than the previous value. ! Switch back to the previously saved velocity and matrix with the lower residual. 
uvel_2d = uvel_2d_sav @@ -9945,16 +9956,16 @@ subroutine evaluate_accelerated_picard_2d(& Avu_2d = Avu_2d_sav Avv_2d = Avv_2d_sav beta_internal = beta_internal_sav + if (whichapprox == HO_APPROX_DIVA) then + beta_eff_x = beta_eff_x_sav + beta_eff_y = beta_eff_y_sav + omega_k = omega_k_sav + endif ! Save this velocity as the starting point for the next nonlinear iteration uvel_2d_old = uvel_2d vvel_2d_old = vvel_2d - if (verbose_picard .and. main_task) then - write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel - write(iulog,*) 'Continue to matrix solver' - endif - ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 L2_norm_alpha_sav = L2_norm_large @@ -9962,6 +9973,11 @@ subroutine evaluate_accelerated_picard_2d(& ! proceed to the matrix solution assembly_is_done = .true. + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Reduce alpha to', alpha_accel - gamma_accel + write(iulog,*) 'Continue to matrix solver' + endif + endif ! L2_norm of residual has reduced end subroutine evaluate_accelerated_picard_2d @@ -9969,6 +9985,7 @@ end subroutine evaluate_accelerated_picard_2d !**************************************************************************** subroutine evaluate_accelerated_picard_3d(& + rtest, & L2_norm, L2_norm_large, & L2_norm_alpha_sav, & alpha_accel, alpha_accel_max, & @@ -9984,6 +10001,8 @@ subroutine evaluate_accelerated_picard_3d(& beta_internal, beta_internal_sav, & assembly_is_done) + integer, intent(in) :: rtest ! rank for diagnostic point + real(dp), intent(in) :: & L2_norm, & ! latest value of L2 norm of residual L2_norm_large, & ! large value for re-initializing the L2 norm @@ -10035,8 +10054,8 @@ subroutine evaluate_accelerated_picard_3d(& alpha_accel = alpha_accel + gamma_accel L2_norm_alpha_sav = L2_norm - if (verbose_picard .and. main_task) then - write(iulog,*) 'Keep going, alpha =', alpha_accel + if (verbose_picard .and. 
this_rank == rtest) then + write(iulog,*) 'Increase alpha to', alpha_accel endif ! Since assembly_is_done = F, we now return to the start of the loop: @@ -10062,6 +10081,11 @@ ! proceed to the matrix solution assembly_is_done = .true. + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Hold alpha at', alpha_accel + write(iulog,*) ' Continue to matrix solver' + endif + else ! The residual is larger than the previous value. @@ -10078,11 +10102,6 @@ uvel_old = uvel vvel_old = vvel - if (verbose_picard .and. main_task) then - write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel - write(iulog,*) 'Continue to matrix solver' - endif - ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 L2_norm_alpha_sav = L2_norm_large @@ -10090,6 +10109,11 @@ ! proceed to the matrix solution assembly_is_done = .true. + if (verbose_picard .and. main_task) then + write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel + write(iulog,*) 'Continue to matrix solver' + endif + endif ! L2_norm of residual has reduced end subroutine evaluate_accelerated_picard_3d From e5d65807c2063d4cc9cbfaea1e1b438d90e1656d Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Mon, 2 Feb 2026 17:04:40 -0700 Subject: [PATCH 20/21] Added a modified Schoof basal sliding law This commit adds a new sliding law similar to the Schoof law but with a simpler algebraic form: 1/taub = 1/taub_p + 1/taub_c, where taub_p and taub_c denote the basal shear stress for a power law (taub ~ Cp * ub^(1/m)), a Coulomb law (taub ~ Cc * N). This is similar to the equation for the harmonic mean, which is weighted toward smaller values while ignoring large outliers. When taub_p is very large, taub asymptotes to taub_c, and when taub_c is very large, taub asymptotes to taub_p.
In this way, the new sliding law resembles the Schoof law. Unlike the Schoof law, the new law supports simultaneous inversion for both Cp and Cc. Where the ice is too thin (thick), both Cp and Cc are driven to higher (lower) values. For Antarctic tests, I chose max and min values of Cc and Cp such that taub_p will largely determine the net taub in the interior where N is very large, and taub_c is likely to determine taub in coastal regions where N is small. In these tests, the pattern of Cp is very similar to the patterns obtained with the Schoof law. But for a given value of Cp_min, the modified Schoof law has smaller thick biases (e.g., for the Siple coast), because it has the freedom to lower Cc as well as Cp where the bed needs to be slippery. Also, the modified Schoof law seems to have better convergence properties than the standard Schoof law, significantly reducing the cost of a typical AIS run. I suspect this is true because the Schoof law struggles with a few grid cells where taub is insensitive to velocity, whereas the modified law always has some velocity dependence. I also ran some Greenland tests where the modified Schoof law doesn't converge as well. I'll look at this later. To run the new law, the user should set which_ho_babc = 12 in the config file. I moved the old option 12 (which is not often used) to option 14, and removed the previous option 14 (which wasn't parallelized and hasn't been used in years). To invert for both Cp and Cc, simply set which_ho_coulomb_c = which_ho_powerlaw_c = 1. Note: Others have used this law, although I'm not aware that it's widely used. For example, Barnes and Gudmundsson (2022) tested this law in WAIS simulations; see their Eq. (6). They call it "regularised Coulomb sliding", citing the MISMIP+ paper by Cornford et al. (2020), but I didn't find this particular formulation in Cornford et al.
--- libglide/glide_setup.F90 | 50 +++++++++++----- libglide/glide_types.F90 | 22 +++---- libglissade/glissade_basal_traction.F90 | 77 ++++++++++++++++--------- libglissade/glissade_inversion.F90 | 49 +++++++++------- 4 files changed, 123 insertions(+), 75 deletions(-) diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90 index 28f73706..e25fcc3e 100644 --- a/libglide/glide_setup.F90 +++ b/libglide/glide_setup.F90 @@ -1129,9 +1129,9 @@ subroutine print_options(model) 'power law ', & 'Coulomb friction law w/ effec press ', & 'Schoof friction law ', & + 'modified Schoof friction law ', & 'min of Coulomb stress and power-law stress (Tsai)', & 'power law using effective pressure ', & - 'simple pattern of beta ', & 'till yield stress (Picard) ' /) character(len=*), dimension(0:1), parameter :: ho_whichbeta_limit = (/ & @@ -1829,8 +1829,8 @@ subroutine print_options(model) if (model%options%use_c_space_factor) then if (model%options%which_ho_babc == HO_BABC_COULOMB_FRICTION .or. & - model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_SCHOOF .or. & - model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_TSAI) then + model%options%which_ho_babc == HO_BABC_SCHOOF .or. & + model%options%which_ho_babc == HO_BABC_TSAI) then write(message,*) 'Multiplying beta by C_space_factor' call write_log(message) else @@ -1879,18 +1879,19 @@ subroutine print_options(model) ! Inversion options - ! Note: Inversion for Cp is currently supported for the Schoof sliding law, Tsai law, and basic power law + ! Note: Inversion for Cp is supported for the basic power law plus the Schoof and Tsai laws if (model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION .or. & model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION_BASIN) then - if (model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_SCHOOF .or. & - model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_TSAI .or. 
& - model%options%which_ho_babc == HO_BABC_POWERLAW) then + if (model%options%which_ho_babc == HO_BABC_POWERLAW .or. & + model%options%which_ho_babc == HO_BABC_SCHOOF .or. & + model%options%which_ho_babc == HO_BABC_MODIFIED_SCHOOF .or. & + model%options%which_ho_babc == HO_BABC_TSAI) then ! inversion for Cp is supported else call write_log('Error, Cp inversion is not supported for this basal BC option') write(message,*) 'Cp inversion is supported for these options: ', & - HO_BABC_COULOMB_POWERLAW_SCHOOF, HO_BABC_COULOMB_POWERLAW_TSAI, HO_BABC_POWERLAW + HO_BABC_POWERLAW, HO_BABC_SCHOOF, HO_BABC_MODIFIED_SCHOOF, HO_BABC_TSAI call write_log(message, GM_FATAL) endif endif @@ -1900,12 +1901,13 @@ subroutine print_options(model) model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION_BASIN) then if (model%options%which_ho_babc == HO_BABC_ZOET_IVERSON .or. & - model%options%which_ho_babc == HO_BABC_PSEUDO_PLASTIC) then + model%options%which_ho_babc == HO_BABC_PSEUDO_PLASTIC .or. & + model%options%which_ho_babc == HO_BABC_MODIFIED_SCHOOF) then ! inversion for Cc is supported else call write_log('Error, Cc inversion is not supported for this basal BC option') write(message,*) 'Cc inversion is supported for these options: ', & - HO_BABC_ZOET_IVERSON, HO_BABC_PSEUDO_PLASTIC + HO_BABC_ZOET_IVERSON, HO_BABC_PSEUDO_PLASTIC, HO_BABC_MODIFIED_SCHOOF call write_log(message, GM_FATAL) endif endif @@ -2774,7 +2776,7 @@ subroutine print_parameters(model) call write_log(message) write(message,*) 'bed bump wavelength for Coulomb friction law : ', model%basal_physics%coulomb_bump_wavelength call write_log(message) - elseif (model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_SCHOOF) then + elseif (model%options%which_ho_babc == HO_BABC_SCHOOF) then ! Note: The Schoof law typically uses a spatially variable powerlaw_c. ! If so, the value written here is just the initial value. 
write(message,*) 'Cc for Schoof Coulomb law : ', model%basal_physics%coulomb_c_const @@ -2787,7 +2789,24 @@ subroutine print_parameters(model) call write_log(message) write(message,*) 'm exponent for Schoof power law : ', model%basal_physics%powerlaw_m call write_log(message) - elseif (model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_TSAI) then + elseif (model%options%which_ho_babc == HO_BABC_MODIFIED_SCHOOF) then + ! Note: This law supports inversion for both Cc and Cp. + ! When inverting, the values here are just the initial values. + write(message,*) 'Cc for modified Schoof law : ', model%basal_physics%coulomb_c_const + call write_log(message) + write(message,*) 'Max Cc : ', model%basal_physics%coulomb_c_max + call write_log(message) + write(message,*) 'Min Cc : ', model%basal_physics%coulomb_c_min + call write_log(message) + write(message,*) 'Cp for modified Schoof law, Pa (m/yr)^(-1/3) : ', model%basal_physics%powerlaw_c_const + call write_log(message) + write(message,*) 'Max Cp : ', model%basal_physics%powerlaw_c_max + call write_log(message) + write(message,*) 'Min Cp : ', model%basal_physics%powerlaw_c_min + call write_log(message) + write(message,*) 'm exponent for power law : ', model%basal_physics%powerlaw_m + call write_log(message) + elseif (model%options%which_ho_babc == HO_BABC_TSAI) then ! Note: The Tsai law typically uses a spatially variable powerlaw_c. ! If so, the value written here is just the initial value. 
write(message,*) 'Cc for Tsai Coulomb law : ', model%basal_physics%coulomb_c_const @@ -2889,6 +2908,9 @@ subroutine print_parameters(model) write(message,*) 'coulomb_c min : ', & model%basal_physics%coulomb_c_min call write_log(message) + write(message,*) 'coulomb_c const : ', & + model%basal_physics%coulomb_c_const + call write_log(message) write(message,*) 'thickness scale (m) for C_c inversion : ', & model%inversion%babc_thck_scale call write_log(message) @@ -4027,12 +4049,12 @@ subroutine define_glide_restart_variables(model, model_id) ! basal sliding option select case (options%which_ho_babc) !WHL - Removed effecpress as a restart variable; it is recomputed with each velocity solve. -!! case (HO_BABC_POWERLAW, HO_BABC_COULOMB_FRICTION, HO_BABC_COULOMB_POWERLAW_SCHOOF) +!! case (HO_BABC_POWERLAW, HO_BABC_COULOMB_FRICTION, HO_BABC_SCHOOF) !! ! These friction laws need effective pressure !! call glide_add_to_restart_variable_list('effecpress', model_id) !! case(HO_BABC_COULOMB_POWERLAW_TSAI) !! call glide_add_to_restart_variable_list('effecpress', model_id) - case (HO_BABC_COULOMB_FRICTION, HO_BABC_COULOMB_POWERLAW_SCHOOF, HO_BABC_COULOMB_POWERLAW_TSAI) + case (HO_BABC_COULOMB_FRICTION, HO_BABC_SCHOOF, HO_BABC_TSAI) ! Note: These options compute beta internally, so it does not need to be in the restart file. if (options%use_c_space_factor) then ! c_space_factor needs to be in the restart file diff --git a/libglide/glide_types.F90 b/libglide/glide_types.F90 index 0731616b..e52641f1 100644 --- a/libglide/glide_types.F90 +++ b/libglide/glide_types.F90 @@ -249,7 +249,7 @@ module glide_types integer, parameter :: HO_THERMAL_AFTER_TRANSPORT = 1 integer, parameter :: HO_THERMAL_SPLIT_TIMESTEP = 2 - !TODO - Deprecate the last two options? Rarely if ever used. + !TODO - Deprecate some little-used options? 
integer, parameter :: HO_BABC_BETA_CONSTANT = 0 integer, parameter :: HO_BABC_BETA_BPMP = 1 integer, parameter :: HO_BABC_PSEUDO_PLASTIC = 2 @@ -261,10 +261,10 @@ module glide_types integer, parameter :: HO_BABC_ISHOMC = 8 integer, parameter :: HO_BABC_POWERLAW = 9 integer, parameter :: HO_BABC_COULOMB_FRICTION = 10 - integer, parameter :: HO_BABC_COULOMB_POWERLAW_SCHOOF = 11 - integer, parameter :: HO_BABC_COULOMB_POWERLAW_TSAI = 12 - integer, parameter :: HO_BABC_POWERLAW_EFFECPRESS = 13 - integer, parameter :: HO_BABC_SIMPLE = 14 + integer, parameter :: HO_BABC_SCHOOF = 11 + integer, parameter :: HO_BABC_MODIFIED_SCHOOF = 12 + integer, parameter :: HO_BABC_TSAI = 13 + integer, parameter :: HO_BABC_POWERLAW_EFFECPRESS = 14 integer, parameter :: HO_BABC_YIELD_PICARD = 15 integer, parameter :: HO_BETA_LIMIT_ABSOLUTE = 0 @@ -851,10 +851,10 @@ module glide_types !> \item[8] beta field as prescribed for ISMIP-HOM test C (serial only) !> \item[9] power law !> \item[10] Coulomb friction law using effective pressure, with flwa from lowest ice layer - !> \item[11] Coulomb friction law using effective pressure, with constant basal flwa - !> \item[12] basal stress is the minimum of Coulomb and power-law values, as in Tsai et al. (2015) - !> \item[13] power law using effective pressure - !> \item[14] simple hard-coded pattern (useful for debugging) + !> \item[11] Schoof law that blends powerlaw and Coulomb behavior + !> \item[12] modified version of the Schoof law + !> \item[13] basal stress is the minimum of Coulomb and power-law values, as in Tsai et al. (2015) + !> \item[14] power law using effective pressure !> \item[15] treat beta value as a till yield stress (in Pa) using Picard iteration !> \end{description} @@ -2204,7 +2204,7 @@ module glide_types !> Note: Defined on velocity grid, whereas temp and bpmp are on ice grid - ! Note: c_space_factor supported for which_ho_babc = HO_BABC_COULOMB_FRICTION, *COULOMB_POWERLAW_SCHOOF AND *COULOMB_POWERLAW_TSAI + ! 
Note: c_space_factor supported for which_ho_babc = HO_BABC_COULOMB_FRICTION, *SCHOOF AND *TSAI real(dp), dimension(:,:), pointer :: c_space_factor => null() !> spatial factor for basal shear stress (no dimension) real(dp), dimension(:,:), pointer :: c_space_factor_stag => null() !> spatial factor for basal shear stress on staggered grid @@ -2246,7 +2246,7 @@ module glide_types coulomb_c_hi => null(), & !> coulomb_c value at high bed elevation, topg >= bed_hi coulomb_c_lo => null() !> coulomb_c value at low bed elevation, topg <= bed_lo - ! parameters for power law, taub_b = C * u_b^(1/m); used for HO_BABC_COULOMB_POWERLAW_TSAI/SCHOOF + ! parameters for power law, taub_b = C * u_b^(1/m); used for HO_BABC_SCHOOF AND *_TSAI ! The default values are from Asay-Davis et al. (2016). ! The value of powerlaw_c suggested by Tsai et al. (2015) is 7.624d6 Pa m^(-1/3) s^(1/3). ! This value can be converted to CISM units by dividing by scyr^(1/3), to obtain 2.413d4 Pa m^(-1/3) yr^(1/3). diff --git a/libglissade/glissade_basal_traction.F90 b/libglissade/glissade_basal_traction.F90 index 3ffcca40..91ec33a1 100644 --- a/libglissade/glissade_basal_traction.F90 +++ b/libglissade/glissade_basal_traction.F90 @@ -157,9 +157,10 @@ subroutine glissade_calcbeta (& big_lambda, & ! bedrock characteristics flwa_basal_stag ! basal flwa interpolated to the staggered grid (Pa^{-n} yr^{-1}) - ! variables for Tsai et al. parameterization - real(dp) :: taub_powerlaw ! basal shear stress given by a power law as in Tsai et al. (2015) - real(dp) :: taub_coulomb ! basal shear stress given by Coulomb friction as in Tsai et al. (2015) + ! variables for mixed power/Coulomb laws + real(dp) :: taub ! basal shear stress + real(dp) :: taub_powerlaw ! basal shear stress given by a power law + real(dp) :: taub_coulomb ! basal shear stress given by Coulomb friction ! variables for pseudo-plastic law real(dp) :: q ! exponent for pseudo-plastic law (unitless) @@ -185,6 +186,7 @@ subroutine glissade_calcbeta (& ! 
Enforce a minimum speed to prevent beta from become very large when velocity is small. speed(:,:) = dsqrt(thisvel(:,:)**2 + othervel(:,:)**2 + smallnum**2) + !TODO - Should this be done only for powerlaw sliding, and not for ZI or PP? ! If beta_powerlaw_umax is set to a nonzero value, then limit the speed to this value. ! Note: The actual ice speed can be greater than umax. This is just a way of shutting off the feedback ! between beta and ice speed (beta down as speed up) when the ice speed is large. @@ -504,9 +506,9 @@ subroutine glissade_calcbeta (& beta = 1.0d8 end where - case(HO_BABC_COULOMB_POWERLAW_SCHOOF) + case(HO_BABC_SCHOOF) - ! Use the basal friction formulation of Schoof (2005), modified following Asay-Davis et al. (2016). + ! Use the basal friction formulation of Schoof (2005), formulated following Asay-Davis et al. (2016). ! This formulation uses a constant value of basal flwa, which allows several Coulomb parameters ! (lambda_max, m_max and flwa_basal) to be combined into a single parameter powerlaw_c, ! as in the Tsai power law below. @@ -576,16 +578,48 @@ subroutine glissade_calcbeta (& ! write(iulog,*) ew, ns, speed(ew,ns), basal_physics%effecpress_stag(ew,ns), beta(ew,ns), beta(ew,ns)*speed(ew,ns) ! enddo - case(HO_BABC_COULOMB_POWERLAW_TSAI) + case(HO_BABC_MODIFIED_SCHOOF) - ! Basal stress representation based on Tsai et al. (2015) - ! The basal stress is the minimum of two values: - ! (1) power law: tau_b = powerlaw_c * |u_b|^(1/powerlaw_m) - ! (2) Coulomb friction: tau_b = coulomb_c * N - ! N = effective pressure = rhoi*g*(H - H_f) - ! H_f = flotation thickness = (rhow/rhoi)*(eus-topg) - ! This value of N is obtained by setting p_ocean_penetration = 1.0 in the config file. - ! The other parameters (powerlaw_c, powerlaw_m and coulomb_c) can also be set in the config file. + ! Modified version of the Schoof law, with a simpler algebraic form + ! The basal stress is given by + ! 1/tau_b = 1/tau_p + 1/tau_c + !
where tau_p = powerlaw_c * |u_b|^(1/powerlaw_m) + ! tau_c = coulomb_c * N + ! N = effective pressure + ! Note: taub = 1/2 of the harmonic mean of tau_p and tau_c + + do ns = 1, nsn-1 + do ew = 1, ewn-1 + + taub_powerlaw = basal_physics%powerlaw_c(ew,ns) * speed(ew,ns)**(1.d0/basal_physics%powerlaw_m) + taub_coulomb = basal_physics%coulomb_c(ew,ns) * basal_physics%effecpress_stag(ew,ns) + + if (taub_coulomb > 0.0d0 .and. taub_powerlaw > 0.0d0) then + taub = 1.0d0 / (1.0d0/taub_powerlaw + 1.0d0/taub_coulomb) + elseif (taub_powerlaw > 0.0d0) then + taub = taub_powerlaw + elseif (taub_coulomb > 0.0d0) then + taub = taub_coulomb + endif + beta(ew,ns) = taub / speed(ew,ns) + + if (verbose_beta .and. present(rtest) .and. present(itest) .and. present(jtest)) then + if (this_rank == rtest .and. ew == itest .and. ns == jtest) then + write(iulog,*) ' ' + write(iulog,'(a38,3i4,4f12.3)') 'rank, i, j, tau_p, tau_c, tau_b, beta:', & + this_rank, ew, ns, taub_powerlaw, taub_coulomb, taub, beta(ew,ns) + endif + endif + enddo ! ew + enddo ! ns + + case(HO_BABC_TSAI) + + ! Basal stress representation based on Tsai et al. (2015) + ! The basal stress is the minimum of two values: + ! (1) power law: tau_p = powerlaw_c * |u_b|^(1/powerlaw_m) + ! (2) Coulomb friction: tau_c = coulomb_c * N + ! N = effective pressure do ns = 1, nsn-1 do ew = 1, ewn-1 @@ -607,18 +641,6 @@ subroutine glissade_calcbeta (& beta(:,:) = beta(:,:) * basal_physics%c_space_factor_stag(:,:) endif - case(HO_BABC_SIMPLE) ! simple pattern; also useful for debugging and test cases - ! (here, a strip of weak bed surrounded by stronger bed to simulate an ice stream) - - beta(:,:) = 1.d4 ! Pa yr/m - - !TODO - Change this loop to work in parallel (set beta on the global grid and scatter to local) - do ns = 5, nsn-5 - do ew = 1, ewn-1 - beta(ew,ns) = 100.d0 ! Pa yr/m - end do - end do - case default ! do nothing @@ -686,11 +708,10 @@ subroutine glissade_calcbeta (& !TODO - Move this halo update to a higher level? 
call staggered_parallel_halo(beta, parallel) - !WHL - debug if (verbose_beta .and. present(rtest) .and. present(itest) .and. present(jtest)) then if (this_rank == rtest) then ew = itest; ns = jtest - write(iulog,*) 'End of calcbeta, r, i, j, speed, f_ground, beta:', & + write(iulog,'(a48,3i4,3f12.5)') 'End of calcbeta, r, i, j, speed, f_ground, beta:', & rtest, ew, ns, speed(ew,ns), f_ground(ew,ns), beta(ew,ns) endif endif diff --git a/libglissade/glissade_inversion.F90 b/libglissade/glissade_inversion.F90 index d12f25ca..54c688b3 100644 --- a/libglissade/glissade_inversion.F90 +++ b/libglissade/glissade_inversion.F90 @@ -89,9 +89,6 @@ subroutine glissade_inversion_init(model) f_flotation, & ! flotation function (m) thck_obs ! observed ice thickness, derived from usrf_obs and topg - real(dp), dimension(model%general%ewn, model%general%nsn) :: & - coulomb_c_icegrid ! initial coulomb_c at cell centers based on masks - real(dp) :: h_obs, h_flotation, h_buff ! thck_obs, flotation thickness, and thck_flotation_buffer scaled to m real(dp) :: dh ! h_obs - h_flotation real(dp) :: dh_decimal ! decimal part remaining after subtracting the truncation of dh @@ -226,7 +223,6 @@ subroutine glissade_inversion_init(model) endif endif ! inversion for Cp, Cc or deltaT_ocn - !---------------------------------------------------------------------- ! If inverting for E, then make sure there is a target surface speed, velo_sfc_obs. !---------------------------------------------------------------------- @@ -254,6 +250,9 @@ subroutine glissade_inversion_init(model) !---------------------------------------------------------------------- ! computations specific to powerlaw_c (Cp) and coulomb_c (Cc) inversion + ! Note: Most sliding laws have inversion for Cp or Cc, but not both. + ! The modified Schoof law, however, supports inversion for both. + ! (This could be extended to the Schoof and Tsai laws.)
!---------------------------------------------------------------------- if (model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION .or. & @@ -268,7 +267,11 @@ subroutine glissade_inversion_init(model) call point_diag(model%basal_physics%powerlaw_c, 'init_inversion for powerlaw_c', itest, jtest, rtest, 7, 7) endif - elseif (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then + endif ! invert for powerlaw_c + + !TODO - Add distinct logic for powerlaw_c_inversion_basin? + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then if (parallel_is_zero(model%basal_physics%coulomb_c)) then ! initialize coulomb_c (for which we will invert) @@ -280,7 +283,9 @@ subroutine glissade_inversion_init(model) 'init_inversion for coulomb_c', itest, jtest, rtest, 7, 7) endif - elseif (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION_BASIN) then + endif ! invert for coulomb_c + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION_BASIN) then !TODO - Should this calculation be done in glissade_initialise? if (parallel_is_zero(model%basal_physics%coulomb_c_lo)) then @@ -311,10 +316,11 @@ subroutine glissade_inversion_init(model) 'init_inversion for basin-scale coulomb_c', itest, jtest, rtest, 7, 7) endif - endif + endif ! invert for coulomb_c_basin !---------------------------------------------------------------------- ! computations specific to flow_enhancement_factor inversion + ! TODO: Remove this inversion option? !---------------------------------------------------------------------- if (model%options%which_ho_flow_enhancement_factor == HO_FLOW_ENHANCEMENT_FACTOR_INVERSION) then @@ -557,7 +563,7 @@ subroutine glissade_inversion_solve(model) call staggered_parallel_halo(stag_thck, parallel) call staggered_parallel_halo(stag_dthck_dt, parallel) - ! Invert for powerlaw_c or coulomb_c + ! Invert for powerlaw_c and/or coulomb_c ! The logic is the same for each; only the max and min values and the in/out field are different. 
if ( model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION) then @@ -593,7 +599,9 @@ subroutine glissade_inversion_solve(model) call point_diag(model%basal_physics%powerlaw_c, 'New powerlaw_c', itest, jtest, rtest, 7, 7) endif - elseif ( model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then + endif ! invert for powerlaw_c + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then if (verbose_inversion .and. this_rank == rtest) then write(iulog,*) ' ' @@ -624,23 +632,18 @@ subroutine glissade_inversion_solve(model) if (verbose_inversion) then call point_diag(model%basal_physics%effecpress_stag, 'effecpress_stag', itest, jtest, rtest, 7, 7, '(f10.1)') - call point_diag(rhoi*grav*stag_thck, 'overburden', itest, jtest, rtest, 7, 7, '(f10.1)') - call point_diag((model%geometry%thck - thck_obs), 'thck - thck_obs (m)', itest, jtest, rtest, 7, 7) call point_diag(model%basal_physics%coulomb_c, 'New coulomb_c', itest, jtest, rtest, 7, 7, '(f10.5)') endif ! verbose_inversion - endif ! invert for powerlaw_c or coulomb_c + endif ! invert for coulomb_c - else ! do not invert for powerlaw_c or coulomb_c; just print optional diagnostics + elseif (verbose_inversion) then ! not inverting, but print some diagnostic values - if (verbose_inversion) then - call point_diag(model%geometry%f_ground, 'f_ground at vertices', itest, jtest, rtest, 7, 7, '(f10.4)') - call point_diag(model%basal_physics%powerlaw_c, 'powerlaw_c', itest, jtest, rtest, 7, 7, '(f10.2)') - call point_diag(model%basal_physics%coulomb_c, 'coulomb_c', itest, jtest, rtest, 7, 7, '(f10.4)') - endif - - endif ! invert for powerlaw_c or coulomb_c + call point_diag(model%geometry%f_ground, 'f_ground at vertices', itest, jtest, rtest, 7, 7, '(f10.4)') + call point_diag(model%basal_physics%powerlaw_c, 'powerlaw_c', itest, jtest, rtest, 7, 7, '(f10.2)') + call point_diag(model%basal_physics%coulomb_c, 'coulomb_c', itest, jtest, rtest, 7, 7, '(f10.4)') + endif ! 
If inverting for powerlaw_c or coulomb_c at the basin scale, then update it here @@ -1261,6 +1264,8 @@ subroutine invert_basal_friction(& if (f_ground(i,j) > 0.0d0) then ! ice is at least partly grounded ! Compute tendency terms based on the thickness target + !TODO: Try putting max(babc_thck_scale, stag_dthck_obs) in the denominator + ! Alex Robinson says this might improve convergence term_thck = -stag_dthck(i,j) / (babc_thck_scale*babc_timescale) term_dHdt = -stag_dthck_dt(i,j) * 2.0d0 / babc_thck_scale @@ -1330,8 +1335,8 @@ subroutine invert_basal_friction(& call point_diag(stag_dthck, 'stag_thck - stag_thck_obs', itest, jtest, rtest, 7, 7) call point_diag(stag_dthck_dt*scyr, 'stag_dthck_dt (m/yr)', itest, jtest, rtest, 7, 7) call point_diag(f_ground, 'f_ground', itest, jtest, rtest, 7, 7) - call point_diag(del2_logc, 'del2(logC)', itest, jtest, rtest, 7, 7, '(e12.3)') - call point_diag(logC, 'logC', itest, jtest, rtest, 7, 7) +!! call point_diag(del2_logc, 'del2(logC)', itest, jtest, rtest, 7, 7, '(e12.3)') +!! call point_diag(logC, 'logC', itest, jtest, rtest, 7, 7) call point_diag(dlogc, 'dlogC', itest, jtest, rtest, 7, 7, '(e12.3)') endif From 2edf195ea52b485a847fde65e556686c99d2b601 Mon Sep 17 00:00:00 2001 From: Katetc Date: Thu, 19 Feb 2026 17:34:25 -0700 Subject: [PATCH 21/21] Changes needed to compile with the GNU compiler --- libglimmer/glimmer_utils.F90 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libglimmer/glimmer_utils.F90 b/libglimmer/glimmer_utils.F90 index 00a3859e..45038da6 100644 --- a/libglimmer/glimmer_utils.F90 +++ b/libglimmer/glimmer_utils.F90 @@ -621,13 +621,13 @@ subroutine double_to_binary(& binary_number = transfer(x, binary_number) ! Get the sign bit (bit 64) - sign_bit = ishft(binary_number, -63) .and. 1 + sign_bit = IAND(ishft(binary_number, -63), INT(1, 8)) ! Get the exponent bits (bits 63–53) - exponent_bits = ishft(binary_number, -52) .and. 
Z'7FF' + exponent_bits = IAND(ishft(binary_number, -52), INT(Z'7FF', 8)) ! Extract mantissa (fraction) bits (bits 52–1) - mantissa_bits = binary_number .and. Z'FFFFFFFFFFFFF' + mantissa_bits = IAND(binary_number, INT(Z'FFFFFFFFFFFFF', 8)) if (present(binary_full)) binary_full = binary_number if (present(binary_sign)) binary_sign = sign_bit @@ -655,7 +655,7 @@ subroutine double_to_binary(& binary_str = concat(bin) if (verbose_binary) then write(iulog,*) 'Full 64-bit binary:' - write(iulog,*), ' ', binary_str + write(iulog,*) ' ', binary_str endif end subroutine double_to_binary