From 6a259ea92af78e8001323ea8b5a5546ac41989a9 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Thu, 13 Nov 2025 08:21:47 -0700 Subject: [PATCH 01/21] Fixed a temperature initialization bug For temp_init = 1, 2 or 3, the initial ice temperature profile depends on artm, which therefore needs to be set correctly before initializing the temperature. For artm_input_function = ARTM_INPUT_FUNCTION_XY = 0 (the default), artm is read from the input file at startup with the correct value. But for the other artm_input_function options (XY_GRADZ, XYZ, XY_LAPSE), artm is derived from input fields such as artm_ref, artm_gradz and artm_3d. With this commit, CISM computes artm (if needed) at initialization, before computing the ice temperature profile. Without this computation, the temperature was initialized based on an erroneous artm = 0 and was too warm. I also added an SMB computation at initialization. This is not strictly needed but can be a helpful diagnostic. I moved the downscaling computations to new subroutines called downscale_artm and downscale_smb to avoid repeating code. --- libglissade/glissade_mass_balance.F90 | 567 +++++++++++++++++--------- 1 file changed, 366 insertions(+), 201 deletions(-) diff --git a/libglissade/glissade_mass_balance.F90 b/libglissade/glissade_mass_balance.F90 index 09c1b3ce..ea4c5fac 100644 --- a/libglissade/glissade_mass_balance.F90 +++ b/libglissade/glissade_mass_balance.F90 @@ -68,8 +68,8 @@ subroutine glissade_mass_balance_init(model) ! Initialize some fields related to the surface mass balance - use glimmer_paramets, only: eps11 use glimmer_physcon, only: rhow, rhoi, scyr + use cism_parallel, only: parallel_is_zero ! input/output arguments @@ -77,9 +77,23 @@ subroutine glissade_mass_balance_init(model) ! local variables - real(dp) :: local_maxval, global_maxval character(len=100) :: message - + + ! Initialize artm for the case that we are reading in artm_ref or artm_3d. + ! 
For some temp_init options, this is needed for correct interior temperatures. + if (model%options%artm_input_function /= ARTM_INPUT_FUNCTION_XY) then + call downscale_artm(model) + endif + + ! Initialize smb for the case that we are reading in smb_ref or smb_3d. + ! This is not strictly needed, since the SMB will be recomputed before it is used, + ! but can be a helpful diagnostic. + !TODO - Do this also for the PDD option? + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ .or. & + model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + call downscale_smb(model) + endif + ! Initialize acab, if SMB (with different units) was read in if (model%options%smb_input == SMB_INPUT_MMYR_WE) then ! Convert units from mm/yr w.e. to m/s ice @@ -87,6 +101,7 @@ subroutine glissade_mass_balance_init(model) endif ! Initialize artm_corrected. This is equal to artm, plus any prescribed temperature anomaly. + !TODO - Not sure this is needed model%climate%artm_corrected(:,:) = model%climate%artm(:,:) if (model%options%enable_artm_anomaly) then @@ -96,9 +111,7 @@ subroutine glissade_mass_balance_init(model) ! Note: The artm_anomaly field does not change during the run, ! but it is possible to ramp up the anomaly using artm_anomaly_timescale. - local_maxval = maxval(abs(model%climate%artm_anomaly)) - global_maxval = parallel_reduce_max(local_maxval) - if (global_maxval < eps11) then + if (parallel_is_zero(model%climate%artm_anomaly)) then model%climate%artm_anomaly = model%climate%artm_anomaly_const write(message,*) & 'Setting artm_anomaly = constant value (degC):', model%climate%artm_anomaly_const @@ -109,7 +122,6 @@ subroutine glissade_mass_balance_init(model) endif endif endif - !TODO - Write a short utility function to compute global_maxval of any field. !TODO - Repeat for snow and precip anomalies ! If acab is to be overwritten for some cells, then set overwrite_acab_mask = 1 for these cells. 
@@ -206,52 +218,9 @@ subroutine glissade_prepare_climate_forcing(model) ! Downscaling of artm_ref to artm (at the ice surface) happens below, followed by the SMB calculation. !------------------------------------------------------- - ! Downscale artm to the current surface elevation if needed. - ! The downscaling options are: - ! (0) artm(x,y); no dependence on surface elevation - ! (1) artm(x,y) + d(artm)/dz(x,y) * dz; artm depends on input field at reference elevation, plus vertical correction - ! (2) artm(x,y,z); artm obtained by linear interpolation between values prescribed at adjacent vertical levels - ! (3) artm(x,y) adjusted with a uniform lapse rate - ! For options (1) - (3), the elevation-dependent artm is computed here. - - if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then - - ! compute artm by a lapse-rate correction to the reference value - model%climate%artm(:,:) = model%climate%artm_ref(:,:) + & - (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%artm_gradz(:,:) - - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then - - if (parallel_is_zero(model%climate%artm_3d)) then - write(message,*) 'Error: artm_3d = 0 everywhere with artm_input_function =', model%options%artm_input_function - call write_log(trim(message), GM_FATAL) - endif - - ! Note: With linear_extrapolate_in = T, the values outside the range are obtained by linear extrapolation - ! from the top two or bottom two values. - ! For temperature, which varies roughly linearly with elevation, this is more accurate - ! than simply extending the top and bottom values. - ! This call includes a halo update. - - call glissade_vertical_interpolate(& - ewn, nsn, & - nzatm, model%climate%zatm, & - model%geometry%usrf, & - model%climate%artm_3d, & - model%climate%artm, & - linear_extrapolate_in = .true.) - - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then - - ! 
compute artm by a lapse-rate correction to artm_ref - ! T_lapse is defined as positive for T decreasing with height - - model%climate%artm(:,:) = model%climate%artm_ref(:,:) - & - (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%t_lapse - - endif ! artm_input_function - - call parallel_halo(model%climate%artm, parallel) + if (model%options%artm_input_function /= ARTM_INPUT_FUNCTION_XY) then + call downscale_artm(model) + endif ! Optionally, add an anomaly to the surface air temperature ! Typically, artm_corrected = artm, but sometimes (e.g., for ISMIP6 forcing experiments), @@ -348,155 +317,10 @@ subroutine glissade_prepare_climate_forcing(model) ! which is passed to the main mass balance driver. !------------------------------------------------------------------------- - ! ------------------------------------------------------------------------ - ! Depending on the SMB input options, compute model%climate%acab at the ice surface. - ! The options are: - ! (0) SMB(x,y); no dependence on surface elevation - ! (1) SMB(x,y) + dSMB/dz(x,y) * dz; SMB depends on input field at reference elevation, plus vertical correction - ! (2) SMB(x,y,z); SMB obtained by linear interpolation between values prescribed at adjacent vertical levels - ! (3) SMB obtained from precip and artm using a positive-degree scheme - ! - ! Options (1) and (2) require input fields with SMB units of mm/yr w.e. (SMB_INPUT_MMYR_WE) - ! For these options, the elevation-dependent SMB is computed here. - ! ------------------------------------------------------------------------ - - if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then - - ! downscale SMB to the local surface elevation - model%climate%smb(:,:) = model%climate%smb_ref(:,:) + & - (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%smb_gradz(:,:) - - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then - - ! downscale SMB to the local surface elevation - ! 
Note: With linear_extrapolate_in = F, the values at top and bottom levels are simply extended upward and downward. - ! For SMB, this is safer than linear extrapolation (especially when extrapolating upward). - - if (parallel_is_zero(model%climate%smb_3d)) then - write(message,*) 'Error: smb_3d = 0 everywhere with smb_input_function =', model%options%smb_input_function - call write_log(trim(message), GM_FATAL) - endif - - call glissade_vertical_interpolate(& - ewn, nsn, & - nzatm, model%climate%zatm, & - model%geometry%usrf, & - model%climate%smb_3d, & - model%climate%smb, & - linear_extrapolate_in = .false.) - - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then - - ! Compute SMB using a simple PDD scheme: - ! (1) Partition precip as rain or snow based on the downscaled artm - ! (2) Compute ablation based on artm and a degree factor - ! Assume that artm has already been downscaled, if needed, based on artm_input_function. - - ! Note: This is similar to the SMB calculation for glaciers, but that calculation is done in the glacier module. - ! TODO: Put the glacier values of snow_threshold_min and snow_threshold_max in the climate derived type. - - ! compute snow accumulation (mm/yr w.e.) - where (model%climate%artm > model%climate%snow_threshold_max) - model%climate%snow = 0.0d0 ! all precip falls as rain - elsewhere (model%climate%artm < model%climate%snow_threshold_min) - model%climate%snow = model%climate%precip ! all precip falls as snow - elsewhere (model%climate%artm > model%climate%snow_threshold_min) - model%climate%snow = model%climate%precip * (model%climate%snow_threshold_max - model%climate%artm) & - / (model%climate%snow_threshold_max - model%climate%snow_threshold_min) - endwhere - - ! compute ablation (mm/yr w.e.) - ! Note: degree_factor has units of mm/yr w.e./degC to be consistent with other mass-balance variables. - ! It is like mu_star for glaciers. 
- model%climate%ablation = model%climate%degree_factor * max(model%climate%artm - model%climate%tmlt, 0.0d0) - - ! compute smb (mm/yr w.e.) - model%climate%smb = model%climate%snow - model%climate%ablation - - ! set smb = 0 for open ocean - where (model%geometry%thck == 0.0d0 .and. (model%geometry%topg - model%climate%eus) < 0.0d0) - model%climate%smb = 0.0d0 - endwhere - - endif ! smb_input_function - - ! For the non-default smb_input_function options, make sure that model%climate%smb is nonzero somewhere; else abort. - ! For the default option, do not abort, since idealized tests often have a zero SMB. - - call parallel_halo(model%climate%smb, parallel) - - if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ .or. & - model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ .or. & - model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then - if (parallel_is_zero(model%climate%smb)) then - write(message,*) 'Error: smb = 0 everywhere with smb_input_function =', model%options%smb_input_function - call write_log(trim(message), GM_FATAL) - endif + if (model%options%smb_input_function /= SMB_INPUT_FUNCTION_XY) then + call downscale_smb(model) endif - ! 
optional diagnostics - if (verbose_smb) then - - if (this_rank == rtest) then - write(iulog,*) 'Computing runtime smb with smb_input_function =', model%options%smb_input_function - endif - call point_diag(model%geometry%usrf, 'usrf (m)', itest, jtest, rtest, 7, 7) - - if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY) then - call point_diag(model%climate%smb, 'smb (mm/yr)', itest, jtest, rtest, 7, 7) - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then - call point_diag(model%geometry%usrf - model%climate%usrf_ref, 'usrf - usrf_ref (m)', & - itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb_ref, 'reference smb (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb_gradz, 'smb_gradz (mm/yr per m)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'smb_3d at each level:' - endif - do k = 1, nzatm - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'k =', k - endif - call point_diag(model%climate%smb_3d(k,:,:), 'smb_3d (mm/yr)', itest, jtest, rtest, 7, 7) - enddo - call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then - call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%precip, 'precip (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%snow, 'snow (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%ablation, 'ablation (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb,'smb (mm/yr)', itest, jtest, rtest, 7, 7) - endif ! 
smb_input_function - - if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY) then - call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then - call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%artm_gradz*1000.d0, 'artm_gradz (deg C per km)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'artm_3d at each level:' - endif - do k = 1, nzatm - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'k =', k - endif - call point_diag(model%climate%artm_3d(k,:,:), 'artm_3d (deg C)', itest, jtest, rtest, 7, 7) - enddo - call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then - call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) - endif ! artm_input_function - - endif ! verbose_smb - ! Compute a corrected smb field that includes any anomalies or correction factors. ! initialize @@ -614,13 +438,354 @@ subroutine glissade_prepare_climate_forcing(model) end subroutine glissade_prepare_climate_forcing +!======================================================================= + + subroutine downscale_artm(model) + + use glissade_grid_operators, only: glissade_vertical_interpolate + use cism_parallel, only: parallel_is_zero + + ! input/output arguments + + type(glide_global_type), intent(inout) :: model ! model instance + + ! local variables + + integer :: itest, jtest, rtest ! 
coordinates of diagnostic cell + integer :: i, j, k + integer :: ewn, nsn + integer :: nzatm ! number of atmosphere levels at which smb_3d and artm_3d are provided + + type(parallel_type) :: parallel ! info for parallel communication + + character(len=100) :: message + + ! initialize + + rtest = -999 + itest = 1 + jtest = 1 + if (this_rank == model%numerics%rdiag_local) then + rtest = model%numerics%rdiag_local + itest = model%numerics%idiag_local + jtest = model%numerics%jdiag_local + endif + + ewn = model%general%ewn + nsn = model%general%nsn + nzatm = model%climate%nzatm + parallel = model%parallel + + ! Downscale artm to the current surface elevation if needed. + ! The downscaling options are: + ! (0) artm(x,y); no dependence on surface elevation + ! (1) artm(x,y) + d(artm)/dz(x,y) * dz; artm depends on input field at reference elevation, plus vertical correction + ! (2) artm(x,y,z); artm obtained by linear interpolation between values prescribed at adjacent vertical levels + ! (3) artm(x,y) adjusted with a uniform lapse rate + ! For options (1) - (3), the elevation-dependent artm is computed here. + + ! Make sure the required input fields are present with nonzero values + + if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then + + if (parallel_is_zero(model%climate%artm_ref) .or. & + parallel_is_zero(model%climate%usrf_ref) .or. 
& + parallel_is_zero(model%climate%artm_gradz)) then + write(message,*) & + 'Error: Must have nonzero artm_ref, artm_gradz and usrf_ref with artm_input_function =', & + model%options%artm_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then + + if (parallel_is_zero(model%climate%artm_3d)) then + write(message,*) & + 'Error: Must have nonzero artm_3d with artm_input_function =', & + model%options%artm_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then + + if (parallel_is_zero(model%climate%artm_ref) .or. & + parallel_is_zero(model%climate%usrf_ref) .or. & + model%climate%t_lapse <= 0.0d0) then + write(message,*) & + 'Error: Must have t_lapse > 0 and nonzero artm_ref, usrf_ref with artm_input_function =', & + model%options%artm_input_function + call write_log(trim(message), GM_FATAL) + endif + + endif + + ! Do the downscaling + + if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then + + ! compute artm by a lapse-rate correction to the reference value + model%climate%artm(:,:) = model%climate%artm_ref(:,:) + & + (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%artm_gradz(:,:) + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then + + ! Note: With linear_extrapolate_in = T, the values outside the range are obtained + ! by linear extrapolation from the top two or bottom two values. + ! For temperature, which varies roughly linearly with elevation, this is more accurate + ! than simply extending the top and bottom values. + + call glissade_vertical_interpolate(& + ewn, nsn, & + nzatm, model%climate%zatm, & + model%geometry%usrf, & + model%climate%artm_3d, & + model%climate%artm, & + linear_extrapolate_in = .true.) + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then + + ! 
compute artm by a lapse-rate correction to artm_ref + ! T_lapse is defined as positive for T decreasing with height + + model%climate%artm(:,:) = model%climate%artm_ref(:,:) - & + (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%t_lapse + + endif ! artm_input_function + + call parallel_halo(model%climate%artm, parallel) + + ! optional diagnostics + + if (verbose_smb) then + if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY) then + call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then + call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%artm_gradz*1000.d0, 'artm_gradz (deg C per km)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'artm_3d at each level:' + endif + do k = 1, nzatm + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'k =', k + endif + call point_diag(model%climate%artm_3d(k,:,:), 'artm_3d (deg C)', itest, jtest, rtest, 7, 7) + enddo + call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then + call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) + endif ! artm_input_function + endif + + end subroutine downscale_artm + +!======================================================================= + + subroutine downscale_smb(model) + + use glissade_grid_operators, only: glissade_vertical_interpolate + use cism_parallel, only: parallel_is_zero + + ! 
input/output arguments + + type(glide_global_type), intent(inout) :: model ! model instance + + ! local variables + + integer :: itest, jtest, rtest ! coordinates of diagnostic cell + integer :: i, j, k + integer :: ewn, nsn + integer :: nzatm ! number of atmosphere levels at which smb_3d and artm_3d are provided + + type(parallel_type) :: parallel ! info for parallel communication + + character(len=100) :: message + + ! initialize + + rtest = -999 + itest = 1 + jtest = 1 + if (this_rank == model%numerics%rdiag_local) then + rtest = model%numerics%rdiag_local + itest = model%numerics%idiag_local + jtest = model%numerics%jdiag_local + endif + + ewn = model%general%ewn + nsn = model%general%nsn + nzatm = model%climate%nzatm + parallel = model%parallel + + ! ------------------------------------------------------------------------ + ! Depending on the SMB input options, compute model%climate%acab at the ice surface. + ! The options are: + ! (0) SMB(x,y); no dependence on surface elevation + ! (1) SMB(x,y) + dSMB/dz(x,y) * dz; SMB depends on input field at reference elevation, plus vertical correction + ! (2) SMB(x,y,z); SMB obtained by linear interpolation between values prescribed at adjacent vertical levels + ! (3) SMB obtained from precip and artm using a positive-degree scheme + ! + ! Options (1) and (2) require input fields with SMB units of mm/yr w.e. (SMB_INPUT_MMYR_WE) + ! For these options, the elevation-dependent SMB is computed here. + ! ------------------------------------------------------------------------ + + ! Make sure the required input fields are present with nonzero values + + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then + + if (parallel_is_zero(model%climate%smb_ref) .or. & + parallel_is_zero(model%climate%usrf_ref) .or. 
& + parallel_is_zero(model%climate%smb_gradz)) then + write(message,*) & + 'Error: Must have nonzero smb_ref, smb_gradz and usrf_ref with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + + if (parallel_is_zero(model%climate%smb_3d)) then + write(message,*) & + 'Error: Must have nonzero smb_3d with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then + + if (parallel_is_zero(model%climate%artm) .or. & + parallel_is_zero(model%climate%precip)) then + write(message,*) & + 'Error: Must have nonzero artm and precip with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + + endif + + ! Do the downscaling + + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then + + ! downscale SMB to the local surface elevation + model%climate%smb(:,:) = model%climate%smb_ref(:,:) + & + (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%smb_gradz(:,:) + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + + ! downscale SMB to the local surface elevation + ! Note: With linear_extrapolate_in = F, the values at top and bottom levels are simply extended upward and downward. + ! For SMB, this is safer than linear extrapolation (especially when extrapolating upward). + + call glissade_vertical_interpolate(& + ewn, nsn, & + nzatm, model%climate%zatm, & + model%geometry%usrf, & + model%climate%smb_3d, & + model%climate%smb, & + linear_extrapolate_in = .false.) + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then + + ! Compute SMB using a simple PDD scheme: + ! (1) Partition precip as rain or snow based on the downscaled artm + ! 
(2) Compute ablation based on artm and a degree factor + ! Assume that artm has already been downscaled, if needed, based on artm_input_function. + + ! Note: This is similar to the SMB calculation for glaciers, but that calculation is done in the glacier module. + ! TODO: Put the glacier values of snow_threshold_min and snow_threshold_max in the climate derived type. + + ! compute snow accumulation (mm/yr w.e.) + where (model%climate%artm > model%climate%snow_threshold_max) + model%climate%snow = 0.0d0 ! all precip falls as rain + elsewhere (model%climate%artm < model%climate%snow_threshold_min) + model%climate%snow = model%climate%precip ! all precip falls as snow + elsewhere (model%climate%artm > model%climate%snow_threshold_min) + model%climate%snow = model%climate%precip * (model%climate%snow_threshold_max - model%climate%artm) & + / (model%climate%snow_threshold_max - model%climate%snow_threshold_min) + endwhere + + ! compute ablation (mm/yr w.e.) + ! Note: degree_factor has units of mm/yr w.e./degC to be consistent with other mass-balance variables. + ! It is like mu_star for glaciers. + model%climate%ablation = model%climate%degree_factor * max(model%climate%artm - model%climate%tmlt, 0.0d0) + + ! compute smb (mm/yr w.e.) + model%climate%smb = model%climate%snow - model%climate%ablation + + ! set smb = 0 for open ocean + where (model%geometry%thck == 0.0d0 .and. (model%geometry%topg - model%climate%eus) < 0.0d0) + model%climate%smb = 0.0d0 + endwhere + + endif ! smb_input_function + + call parallel_halo(model%climate%smb, parallel) + + ! For the non-default smb_input_function options, make sure the SMB is nonzero somewhere. + ! For the default option, do not abort, since idealized tests often have a zero SMB. 
+ + if (model%options%smb_input_function /= SMB_INPUT_FUNCTION_XY) then + if (parallel_is_zero(model%climate%smb)) then + write(message,*) 'Error: smb = 0 everywhere with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + endif + + ! optional diagnostics + + if (verbose_smb) then + + if (this_rank == rtest) then + write(iulog,*) 'Computing runtime smb with smb_input_function =', model%options%smb_input_function + endif + call point_diag(model%geometry%usrf, 'usrf (m)', itest, jtest, rtest, 7, 7) + + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY) then + call point_diag(model%climate%smb, 'smb (mm/yr)', itest, jtest, rtest, 7, 7) + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then + call point_diag(model%geometry%usrf - model%climate%usrf_ref, 'usrf - usrf_ref (m)', & + itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb_ref, 'reference smb (mm/yr)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb_gradz, 'smb_gradz (mm/yr per m)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'smb_3d at each level:' + endif + do k = 1, nzatm + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'k =', k + endif + call point_diag(model%climate%smb_3d(k,:,:), 'smb_3d (mm/yr)', itest, jtest, rtest, 7, 7) + enddo + call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then + call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%precip, 'precip (mm/yr)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%snow, 'snow (mm/yr)', itest, jtest, rtest, 7, 7) + call 
point_diag(model%climate%ablation, 'ablation (mm/yr)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb,'smb (mm/yr)', itest, jtest, rtest, 7, 7) + endif ! smb_input_function + + endif ! verbose_smb + + end subroutine downscale_smb + !======================================================================= subroutine glissade_apply_smb(model) ! Apply the SMB at the upper and lower surfaces, and recompute tracer values. - use glimmer_paramets, only: eps11 use glimmer_physcon, only: rhow, rhoi, scyr use glissade_masks, only: glissade_get_masks use glissade_calving, only: verbose_calving From 6db094c75ef2119c50a0efd7fe3e44ee151dc33a Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Sun, 30 Nov 2025 10:34:18 -0700 Subject: [PATCH 02/21] Work toward reproducible global sums Until now, CISM results have varied with the number of processors. This is because CISM computes many global sums, not just for diagnostics but also for prognostic calculations (e.g., in the PCG velocity solver). Two runs with different processor counts will have roundoff-level differences in the results of global sums, which propagate to the rest of the code. We want CISM to be able to compute global sums in a reproducible way, independent of processor count, like other CESM components. Reproducible sums are not yet operational, but this commit adds most of the infrastructure: * There is a new config option called 'reproducible_sums', which is false by default. To activate reproducible sums, the user simply sets this to true. * The parallel derived type now includes a logical variable called reprosum, whose value is set at initialization based on the config option 'reproducible_sums'. * There are two new files in libglimmer: cism_reprosum_mod.F90 and cism_infnan_mod.F90. These are adapted, with minor changes, from the files shr_reprosum_mod and shr_infnan_mod.F90 in the CESM shared code written by Pat Worley. 
* Many global sums now pass through an interface that supports reproducible sums: parallel_global_sum, parallel_global_sum_stagger, or parallel_global_sum_patch. These interfaces contain functions with reprosum logic. If parallel%reprosum is true, these functions will call subroutine cism_reprosum_calcs, which converts sums of floating-point variables to sums involving integers only, and in this way computes global sums independently of processor count. If parallel%reprosum is false, these subroutines do a local sum and then call parallel_reduce_sum, as before. Note the following changes: * Replaced the interface parallel_global_sum_staggered (consisting of four subroutines) with parallel_global_sum_stagger (consisting of four functions). The new functions yield the same results as the old subroutines if reprosum = F. * Replaced many calls to parallel_reduce_sum with calls to function parallel_global_sum, which can call either cism_reprosum_calcs or parallel_reduce_sum. The remaining direct calls to parallel_reduce_sum are mostly sums over integers, which are independent of processor count. * Added an interface called parallel_global_sum_patch. The parallel_global_sum_patch functions compute the global sum of a given field on an arbitrary number of 'patches', each of which has a patch ID denoted by a positive integer. These functions are useful for computing sums over (1) glaciers and (2) ocean basins, where each glacier or basin is considered a patch. * Added a subroutine parallel_reprosum_calc in the cism_parallel module; still to be tested. * Removed subroutines get_area_vol and calc_iareaf_iareag in glide_mask.F90 Collectively, these commits are answer-changing at the roundoff level for both prognostic and diagnostic variables. A future commit will add the calls to cism_reprosum_calc and verify that the sums are reproducible. 
--- libglide/glide.F90 | 25 +- libglide/glide_diagnostics.F90 | 190 +-- libglide/glide_mask.F90 | 111 +- libglide/glide_setup.F90 | 5 + libglide/glide_types.F90 | 4 + libglimmer/cism_infnan_mod.F90 | 423 +++++ libglimmer/cism_reprosum_mod.F90 | 1789 ++++++++++++++++++++++ libglimmer/glimmer_global.F90 | 14 +- libglimmer/ncdf_template.F90.in | 64 +- libglimmer/parallel_mpi.F90 | 666 +++++--- libglissade/glissade.F90 | 70 +- libglissade/glissade_basal_water.F90 | 3 +- libglissade/glissade_bmlt_float.F90 | 3 + libglissade/glissade_calving.F90 | 22 +- libglissade/glissade_glacier.F90 | 456 +++--- libglissade/glissade_inversion.F90 | 25 + libglissade/glissade_mass_balance.F90 | 26 +- libglissade/glissade_transport.F90 | 44 +- libglissade/glissade_utils.F90 | 56 +- libglissade/glissade_velo_higher.F90 | 11 +- libglissade/glissade_velo_higher_pcg.F90 | 152 +- 21 files changed, 3176 insertions(+), 983 deletions(-) create mode 100644 libglimmer/cism_infnan_mod.F90 create mode 100644 libglimmer/cism_reprosum_mod.F90 diff --git a/libglide/glide.F90 b/libglide/glide.F90 index 69f7f5e3..aca02953 100644 --- a/libglide/glide.F90 +++ b/libglide/glide.F90 @@ -384,8 +384,7 @@ subroutine glide_initialise(model) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) ! calculate lower and upper ice surface call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus,model%geometry%lsrf) @@ -503,16 +502,10 @@ subroutine glide_init_state_diagnostic(model, evolve_ice) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) endif ! calving_init - ! 
Compute total areas of grounded and floating ice - call calc_iareaf_iareag(model%numerics%dew, model%numerics%dns, & - model%geometry%thkmask, & - model%geometry%iareaf, model%geometry%iareag) - ! ------------------------------------------------------------------------ ! ***Part 2: Calculate geometry related fields ! ------------------------------------------------------------------------ @@ -874,13 +867,10 @@ subroutine glide_tstep_p2(model) call glide_prof_start(model,model%glide_prof%ice_mask2) - !TODO - Calculate area and vol separately from glide_set_mask? - call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) call glide_prof_stop(model,model%glide_prof%ice_mask2) @@ -909,14 +899,7 @@ subroutine glide_tstep_p2(model) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) - endif ! oldglide = F - - if (.not. oldglide) then ! calculate area of floating and grounded ice - call calc_iareaf_iareag(model%numerics%dew, model%numerics%dns, & - model%geometry%thkmask, & - model%geometry%iareaf, model%geometry%iareag) + model%climate%eus, model%geometry%thkmask) endif ! oldglide = F ! 
------------------------------------------------------------------------ diff --git a/libglide/glide_diagnostics.F90 b/libglide/glide_diagnostics.F90 index e49ebb2f..cd72557c 100644 --- a/libglide/glide_diagnostics.F90 +++ b/libglide/glide_diagnostics.F90 @@ -39,7 +39,7 @@ module glide_diagnostics use cism_parallel, only: this_rank, main_task, lhalo, uhalo, nhalo, & parallel_type, broadcast, & parallel_localindex, parallel_globalindex, & - parallel_reduce_sum, parallel_reduce_max, & + parallel_global_sum, parallel_reduce_max, & parallel_reduce_maxloc, parallel_reduce_minloc, & parallel_is_zero @@ -231,6 +231,7 @@ subroutine glide_write_diag (model, time) integer, dimension(model%general%ewn,model%general%nsn) :: & ice_mask, & ! = 1 where ice is present with thck > minthck, else = 0 floating_mask, & ! = 1 where ice is present and floating, else = 0 + grounded_mask, & ! = 1 where ice is present and grounded, else = 0 glacier_ice_mask ! = 1 where glacier ice is present, initially and/or currently real(dp), dimension(model%general%upn) :: & @@ -273,8 +274,12 @@ subroutine glide_write_diag (model, time) velo_ew_ubound, velo_ns_ubound ! upper bounds for velocity variables real(dp), dimension(model%general%ewn, model%general%nsn) :: & - velo_sfc, & ! surface ice speed - thck_obs ! observed ice thickness, derived from usrf_obs and topg + mass_above_flotation,& ! ice mass above flotation (kg) + velo_sfc, & ! surface ice speed (m/s) + thck_obs ! observed ice thickness (m), derived from usrf_obs and topg + + real(dp), dimension(:,:,:), allocatable :: & + local_energy ! 
internal energy (J) per layer in a column character(len=100) :: message @@ -337,12 +342,15 @@ subroutine glide_write_diag (model, time) ice_mask(i,j) = 1 if (model%geometry%topg(i,j) - model%climate%eus < (-rhoi/rhoo)*model%geometry%thck(i,j)) then floating_mask(i,j) = 1 + grounded_mask(i,j) = 0 else floating_mask(i,j) = 0 + grounded_mask(i,j) = 1 endif else ice_mask(i,j) = 0 floating_mask(i,j) = 0 + grounded_mask(i,j) = 0 endif enddo enddo @@ -358,109 +366,75 @@ subroutine glide_write_diag (model, time) call write_log(' ') ! total ice area (m^2) - - tot_area = 0.d0 - tot_area_ground = 0.d0 - tot_area_float = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - if (ice_mask(i,j) == 1) then - tot_area = tot_area + cell_area(i,j) - if (floating_mask(i,j) == 1) then - tot_area_float = tot_area_float + cell_area(i,j) - else - tot_area_ground = tot_area_ground + cell_area(i,j) - endif - endif - enddo - enddo - - tot_area = parallel_reduce_sum(tot_area) - tot_area_ground = parallel_reduce_sum(tot_area_ground) - tot_area_float = parallel_reduce_sum(tot_area_float) + tot_area = parallel_global_sum(cell_area, parallel, ice_mask) + tot_area_float = parallel_global_sum(cell_area, parallel, floating_mask) + tot_area_ground = parallel_global_sum(cell_area, parallel, grounded_mask) ! total ice volume (m^3) - - tot_volume = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - if (ice_mask(i,j) == 1) then - tot_volume = tot_volume + model%geometry%thck(i,j) * cell_area(i,j) - endif - enddo - enddo - tot_volume = parallel_reduce_sum(tot_volume) + tot_volume = parallel_global_sum(model%geometry%thck*cell_area, parallel, ice_mask) ! total ice mass (kg) tot_mass = tot_volume * rhoi ! total ice mass above flotation (kg) - tot_mass_above_flotation = 0.d0 - - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - if (ice_mask(i,j) == 1) then - if (floating_mask(i,j) == 0) then ! grounded ice - if (model%geometry%topg(i,j) - model%climate%eus < 0.0d0) then ! 
grounded below sea level - thck_floating = (-rhoo/rhoi) * (model%geometry%topg(i,j) - model%climate%eus) ! exactly floating - thck_above_flotation = model%geometry%thck(i,j) - thck_floating - tot_mass_above_flotation = tot_mass_above_flotation & - + thck_above_flotation * cell_area(i,j) - else ! grounded above sea level - tot_mass_above_flotation = tot_mass_above_flotation & - + model%geometry%thck(i,j) * cell_area(i,j) - endif + mass_above_flotation = 0.0d0 + do j = 1, nsn + do i = 1, ewn + if (ice_mask(i,j) == 1 .and. floating_mask(i,j) == 0) then + if (model%geometry%topg(i,j) - model%climate%eus < 0.0d0) then ! grounded below sea level + thck_floating = (-rhoo/rhoi) * (model%geometry%topg(i,j) - model%climate%eus) ! exactly floating + mass_above_flotation(i,j) = (model%geometry%thck(i,j) - thck_floating) * cell_area(i,j) + else ! grounded above sea level + mass_above_flotation(i,j) = model%geometry%thck(i,j) * cell_area(i,j) endif endif enddo enddo - - tot_mass_above_flotation = tot_mass_above_flotation * rhoi ! convert from m^3 to kg - tot_mass_above_flotation = parallel_reduce_sum(tot_mass_above_flotation) + mass_above_flotation = mass_above_flotation * rhoi + tot_mass_above_flotation = parallel_global_sum(mass_above_flotation, parallel) ! total ice energy relative to T = 0 deg C (J) - - tot_energy = 0.d0 - if (size(model%temper%temp,1) == upn+1) then ! temps are staggered in vertical - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo + local_energy = 0.0d0 + if (size(model%temper%temp,1) == upn+1) then ! temps are staggered in vertical, located at layer centers + allocate(local_energy(model%general%upn-1, model%general%ewn, model%general%nsn)) + do j = 1, nsn + do i = 1, ewn if (ice_mask(i,j) == 1) then - do k = 1, upn-1 - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & - *(model%numerics%sigma(k+1) - model%numerics%sigma(k)) + do k = 1, upn-1 ! 
(upn-1) layers + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * (model%numerics%sigma(k+1) - model%numerics%sigma(k)) enddo endif enddo enddo - - else ! temps are unstaggered in vertical - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo + else ! temps are unstaggered in vertical, located at layer interfaces + allocate(local_energy(model%general%upn, model%general%ewn, model%general%nsn)) + do j = 1, nsn + do i = 1, ewn if (ice_mask(i,j) == 1) then - ! upper half-layer, T = upper sfc temp - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(1,i,j) * cell_area(i,j) & - * 0.5d0 * model%numerics%sigma(2) - do k = 2, upn-1 - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & - * 0.5d0*(model%numerics%sigma(k+1) - model%numerics%sigma(k-1)) + k = 1 ! top interface; assign this temperature to the top half layer + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * 0.5d0 * model%numerics%sigma(k+1) + do k = 2, upn-1 ! interior layers + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * 0.5d0*(model%numerics%sigma(k+1) - model%numerics%sigma(k-1)) enddo - ! lower half-layer, T = lower sfc temp - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(upn,i,j) * cell_area(i,j) & - * 0.5d0 * (1.0d0 - model%numerics%sigma(upn-1)) + k = upn ! bottom interface; assign this temperature to the bottom half layer + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * 0.5d0 * (1.0d0 - model%numerics%sigma(k-1)) endif enddo enddo endif - - tot_energy = tot_energy * rhoi * shci - tot_energy = parallel_reduce_sum(tot_energy) + local_energy = local_energy * rhoi * shci + tot_energy = parallel_global_sum(local_energy, parallel, ice_mask) + deallocate(local_energy) ! 
mean thickness - if (tot_area > eps) then mean_thck = tot_volume/tot_area else @@ -468,7 +442,6 @@ subroutine glide_write_diag (model, time) endif ! mean temperature - if (tot_volume > eps) then mean_temp = tot_energy/ (rhoi*shci*tot_volume) else @@ -490,19 +463,11 @@ subroutine glide_write_diag (model, time) if (model%options%whichdycore == DYCORE_GLISSADE) then ! total surface accumulation/ablation rate (m^3/yr ice) - - tot_acab = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_acab = tot_acab + model%climate%acab_applied(i,j) * cell_area(i,j) - enddo - enddo - - tot_acab = tot_acab * scyr ! convert to m^3/yr - tot_acab = parallel_reduce_sum(tot_acab) + tot_acab = parallel_global_sum(model%climate%acab_applied*cell_area, parallel) + tot_acab = tot_acab * scyr ! convert from m^3/s to m^3/yr ! total surface mass balance flux (kg/s) - tot_smb_flux = tot_acab * rhoi / scyr ! convert m^3/yr to kg/s + tot_smb_flux = tot_acab * rhoi / scyr ! convert m^3/yr to kg/s ! mean accumulation/ablation rate (m/yr) ! Note: This will be only approximate if some ice has melted completely during the time step @@ -513,15 +478,8 @@ subroutine glide_write_diag (model, time) endif ! total basal melting rate (positive for ice loss) - tot_bmlt = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_bmlt = tot_bmlt + model%basal_melt%bmlt_applied(i,j) * cell_area(i,j) - enddo - enddo - - tot_bmlt = tot_bmlt * scyr ! convert to m^3/yr - tot_bmlt = parallel_reduce_sum(tot_bmlt) + tot_bmlt = parallel_global_sum(model%basal_melt%bmlt_applied*cell_area, parallel) + tot_bmlt = tot_bmlt * scyr ! convert from m^3/s to m^3/yr ! total basal mass balance (kg/s, positive for freeze-on, negative for melt) tot_bmb_flux = -tot_bmlt * rhoi / scyr ! convert m^3/yr to kg/s @@ -536,14 +494,7 @@ subroutine glide_write_diag (model, time) ! total calving rate (m^3/yr ice) ! 
Note: calving%calving_rate has units of m/yr ice - - tot_calving = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_calving = tot_calving + model%calving%calving_rate(i,j) * cell_area(i,j) ! m^3/yr ice - enddo - enddo - tot_calving = parallel_reduce_sum(tot_calving) + tot_calving = parallel_global_sum(model%calving%calving_rate*cell_area, parallel) ! total calving mass balance flux (kg/s, negative for ice loss by calving) tot_calving_flux = -tot_calving * rhoi / scyr ! convert m^3/yr to kg/s @@ -559,15 +510,10 @@ subroutine glide_write_diag (model, time) ! total grounding line mass balance flux (< 0 by definition) ! Note: At this point, gl_flux_east and gl_flux_north are already dimensionalized in kg/m/s, ! so tot_gl_flux will have units of kg/s - - tot_gl_flux = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_gl_flux = tot_gl_flux - abs(model%geometry%gl_flux_east(i,j)) * model%numerics%dns & - - abs(model%geometry%gl_flux_north(i,j)) * model%numerics%dew - enddo - enddo - tot_gl_flux = parallel_reduce_sum(tot_gl_flux) + tot_gl_flux = parallel_global_sum(abs(model%geometry%gl_flux_east) * model%numerics%dns & + + abs(model%geometry%gl_flux_north) * model%numerics%dew, & + parallel) + tot_gl_flux = -tot_gl_flux ! negative by definition ! total rate of change of ice mass (kg/s) ! Note: dthck_dt has units of m/s @@ -575,14 +521,8 @@ subroutine glide_write_diag (model, time) ! in successive time steps, instead of summing over dthck_dt. ! Note that dthck_dt does not account for global outflow fluxes (i.e., removal of ice ! near the global boundary in halo updates). - tot_dmass_dt = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_dmass_dt = tot_dmass_dt + model%geometry%dthck_dt(i,j) * cell_area(i,j) - enddo - enddo + tot_dmass_dt = parallel_global_sum(model%geometry%dthck_dt*cell_area, parallel) tot_dmass_dt = tot_dmass_dt * rhoi ! convert to kg/s - tot_dmass_dt = parallel_reduce_sum(tot_dmass_dt) ! 
mass conservation error ! Note: For most runs, this should be close to zero. diff --git a/libglide/glide_mask.F90 b/libglide/glide_mask.F90 index c08dfd17..1e4ef585 100644 --- a/libglide/glide_mask.F90 +++ b/libglide/glide_mask.F90 @@ -41,14 +41,11 @@ module glide_mask contains -!TODO - Remove iarea and ivol calculations? They are now computed in glide_write_diag.. - !TODO - Write a new subroutine (in addition to glide_set_mask) to compute mask for staggered grid? ! This subroutine is now called from glissade_velo_driver with stagthck and stagtopg ! as input arguments. - subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol, & - exec_serial, parallel) + subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask) use glide_types use glimmer_physcon, only : rhoi, rhoo @@ -60,14 +57,9 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol integer, intent(in) :: ewn, nsn ! Grid size real(dp), intent(in) :: eus ! Sea level integer, dimension(:,:), intent(inout) :: mask ! Output mask - real(dp), intent(inout), optional :: ivol, iarea ! Area and volume of ice - - logical, optional :: exec_serial !JEFF If executing in serial in MPI program. - type(parallel_type), optional :: parallel ! info for parallel communication ! local variables integer ew,ns - logical :: exec_serial_flag !Note - This array may not be needed, at least in parallel. @@ -76,21 +68,8 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol integer, dimension(0:ewn+1,0:nsn+1) :: maskWithBounds; - !TODO - What is the exec_serial option? Is it still needed? - - !JEFF Handle exec_serial optional parameter - if ( present(exec_serial) ) then - exec_serial_flag = exec_serial - else - ! Default to off - exec_serial_flag = .FALSE. - endif - mask = 0 - if (present(iarea)) iarea = 0.d0 - if (present(ivol)) ivol = 0.d0 - !Note - This mask is confusing. Wondering if we should replace it by a series of logical masks. ! 
Would need the following: @@ -138,10 +117,6 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol mask = ior(mask, GLIDE_MASK_LAND) ! GLIDE_MASK_LAND = 4 endwhere - if (present(iarea) .and. present(ivol)) then - call get_area_vol(thck, numerics%dew, numerics%dns, numerics%thklim, iarea, ivol, exec_serial_flag) - end if - !TODO - Replace the following with a halo call for 'mask', with appropriate global BC? maskWithBounds = 0 @@ -185,14 +160,9 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol end do end do - !JEFF Don't call halo update if running in serial mode - !WHL - I think the halo update will now work in serial mode. - if (.NOT. exec_serial_flag .and. present(parallel)) then - call parallel_halo(mask, parallel) - endif - end subroutine glide_set_mask + subroutine augment_kinbc_mask(mask, kinbcmask) ! Augments the Glide mask with the location of kinematic (dirichlet) boundary @@ -217,83 +187,8 @@ subroutine augment_kinbc_mask(mask, kinbcmask) endwhere end subroutine augment_kinbc_mask - subroutine get_area_vol(thck, dew, dns, thklim, iarea, ivol, exec_serial) - - implicit none - real(dp), dimension(:,:) :: thck - real(dp) :: dew, dns, thklim - real(dp) :: iarea, ivol, sum(2) - logical :: exec_serial - - integer :: i,j - - do i = 1+lhalo, size(thck,1)-uhalo - do j = 1+lhalo, size(thck,2)-uhalo - if (thck(i,j) > thklim ) then - iarea = iarea + 1 - ivol = ivol + thck(i,j) - end if - end do - end do - - iarea = iarea * dew * dns - ivol = ivol * dew * dns - - if (.NOT. exec_serial) then - sum(1) = iarea - sum(2) = ivol - sum = parallel_reduce_sum(sum) - iarea = sum(1) - ivol = sum(2) - endif - - end subroutine get_area_vol - - subroutine calc_iareaf_iareag(dew, dns, mask, iareaf, iareag, exec_serial) - - implicit none - real(dp), intent(in) :: dew, dns - real(dp), intent(out) :: iareaf, iareag - integer, dimension(:,:), intent(in) :: mask - logical, optional :: exec_serial ! 
If executing in serial in MPI program. - - integer :: i,j - logical :: exec_serial_flag - real(dp) :: sum(2) - - !TODO - exec_serial option may not be needed - if ( present(exec_serial) ) then - exec_serial_flag = exec_serial - else - ! Default to off - exec_serial_flag = .FALSE. - endif - - iareaf = 0.d0 - iareag = 0.d0 - - !loop over locally owned scalars - do j = 1+lhalo, size(mask,2)-uhalo - do i = 1+lhalo, size(mask,1)-uhalo - if (GLIDE_IS_FLOAT(mask(i,j))) then - iareaf = iareaf + dew * dns - else if(GLIDE_IS_GROUND_OR_GNDLINE(mask(i,j))) then - iareag = iareag + dew * dns - end if - end do - end do - - if (.NOT. exec_serial_flag) then - sum(1) = iareaf - sum(2) = iareag - sum = parallel_reduce_sum(sum) - iareaf = sum(1) - iareag = sum(2) - endif - - end subroutine calc_iareaf_iareag - subroutine glide_marine_margin_normal(thck, mask, marine_bc_normal, & + subroutine glide_marine_margin_normal(thck, mask, marine_bc_normal, & exec_serial, parallel) !TODO - Remove subroutine glide_marine_margin_normal? Old PBJ routine. diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90 index 1f5fd0a2..3b2aa1e4 100644 --- a/libglide/glide_setup.F90 +++ b/libglide/glide_setup.F90 @@ -907,6 +907,7 @@ subroutine handle_ho_options(section, model) call GetValue(section, 'linear_solve_ncheck', model%options%linear_solve_ncheck) call GetValue(section, 'linear_maxiters', model%options%linear_maxiters) call GetValue(section, 'linear_tolerance', model%options%linear_tolerance) + call GetValue(section, 'reproducible_sums', model%options%reproducible_sums) end subroutine handle_ho_options @@ -2183,6 +2184,10 @@ subroutine print_options(model) write(message,*) 'linear_tolerance : ',model%options%linear_tolerance call write_log(message) + if (model%options%reproducible_sums) then + call write_log('Global sums will be reproducible') + endif + end if ! DYCORE_GLISSADE if (model%options%whichdycore == DYCORE_GLISSADE .and. 
& diff --git a/libglide/glide_types.F90 b/libglide/glide_types.F90 index 0be4b0fe..2d001f06 100644 --- a/libglide/glide_types.F90 +++ b/libglide/glide_types.F90 @@ -1157,6 +1157,10 @@ module glide_types real(dp) :: linear_tolerance = 1.0d-08 !> error tolerance for linear solver + logical :: reproducible_sums = .false. + !> if true, then compute reproducible global sums + !> (independent of the number of tasks) + ! The remaining options are not currently supported !integer :: which_bproc = 0 diff --git a/libglimmer/cism_infnan_mod.F90 b/libglimmer/cism_infnan_mod.F90 new file mode 100644 index 00000000..124b051f --- /dev/null +++ b/libglimmer/cism_infnan_mod.F90 @@ -0,0 +1,423 @@ +! Flag representing compiler support of Fortran 2003's +! ieee_arithmetic intrinsic module. +#if defined CPRIBM || defined CPRPGI || defined CPRINTEL || defined CPRCRAY || defined CPRNAG +#define HAVE_IEEE_ARITHMETIC +#endif + +!WHL, Nov. 2025: Adapted from shr_infnan_mod.F90, part of CESM shared code +! Changed 'shr' to 'cism' to avoid name conflicts with shared code +! I kept only the r8 interfaces (not r4, i8 or i4). +! Also, I assumed that all input arrays are 1d. + +module cism_infnan_mod +!--------------------------------------------------------------------- +! Module to test for IEEE Inf and NaN values, which also provides a +! method of setting +/-Inf and signaling or quiet NaN. +! +! All functions are elemental, and thus work on arrays. +!--------------------------------------------------------------------- +! To test for these values, just call the corresponding function, e.g: +! +! var_is_nan = cism_infnan_isnan(x) +! +! You can also use it on arrays: +! +! array_contains_nan = any(cism_infnan_isnan(my_array)) +! +!--------------------------------------------------------------------- +! To generate these values, assign one of the provided derived-type +! variables to a real: +! +! use cism_infnan_mod, only: nan => cism_infnan_nan, & +! inf => cism_infnan_inf, & +! 
assignment(=) +! real(r4) :: my_nan +! real(r8) :: my_inf_array(2,2) +! my_nan = nan +! my_inf_array = inf +! +! Keep in mind that "cism_infnan_nan" and "cism_infnan_inf" cannot be +! passed to functions that expect real arguments. To pass a real +! NaN, you will have to use cism_infnan_nan to set a local real of +! the correct kind. +!--------------------------------------------------------------------- + + use glimmer_global, only: r4 => sp, r8 => dp + use glimmer_global, only: i4, i8 +!!use shr_kind_mod, only: & +!! r4 => SHR_KIND_R4, & +!! r8 => SHR_KIND_R8 + +#ifdef HAVE_IEEE_ARITHMETIC + +! If we have IEEE_ARITHMETIC, the NaN test is provided for us. +use, intrinsic :: ieee_arithmetic, only: & + cism_infnan_isnan => ieee_is_nan + +#else + +! Integers of correct size for bit patterns below. +!!use shr_kind_mod, only: i4 => shr_kind_i4, i8 => shr_kind_i8 + +#endif + +implicit none +private +save + +! Test functions for NaN/Inf values. +public :: cism_infnan_isnan +public :: cism_infnan_isinf +public :: cism_infnan_isposinf +public :: cism_infnan_isneginf + +! Locally defined isnan. +#ifndef HAVE_IEEE_ARITHMETIC +interface cism_infnan_isnan + ! TYPE double,real + module procedure cism_infnan_isnan_r8 +end interface +#endif + +interface cism_infnan_isinf + ! TYPE double,real + module procedure cism_infnan_isinf_r8 +end interface + +interface cism_infnan_isposinf + ! TYPE double,real + module procedure cism_infnan_isposinf_r8 +end interface + +interface cism_infnan_isneginf + ! TYPE double,real + module procedure cism_infnan_isneginf_r8 +end interface + +! Derived types for generation of NaN/Inf +! Even though there's no reason to "use" the types directly, some compilers +! might have trouble with an object being used without its type. +public :: cism_infnan_nan_type +public :: cism_infnan_inf_type +public :: assignment(=) +public :: cism_infnan_to_r4 +public :: cism_infnan_to_r8 + +! Type representing Not A Number. 
+type :: cism_infnan_nan_type + logical :: quiet = .false. +end type cism_infnan_nan_type + +! Type representing +/-Infinity. +type :: cism_infnan_inf_type + logical :: positive = .true. +end type cism_infnan_inf_type + +! Allow assigning reals to NaN or Inf. +interface assignment(=) + ! TYPE double,real + ! DIMS 0,1,2,3,4,5,6,7 + module procedure set_nan_1d_r8 + ! TYPE double,real + ! DIMS 0,1,2,3,4,5,6,7 + module procedure set_inf_1d_r8 +end interface + +! Conversion functions. +interface cism_infnan_to_r8 + module procedure nan_r8 + module procedure inf_r8 +end interface + +interface cism_infnan_to_r4 + module procedure nan_r4 + module procedure inf_r4 +end interface + +! Initialize objects of NaN/Inf type for other modules to use. + +! Default NaN is signaling, but also provide snan and qnan to choose +! explicitly. +type(cism_infnan_nan_type), public, parameter :: cism_infnan_nan = & + cism_infnan_nan_type(.false.) +type(cism_infnan_nan_type), public, parameter :: cism_infnan_snan = & + cism_infnan_nan_type(.false.) +type(cism_infnan_nan_type), public, parameter :: cism_infnan_qnan = & + cism_infnan_nan_type(.true.) + +! Default Inf is positive, but provide posinf to go with neginf. +type(cism_infnan_inf_type), public, parameter :: cism_infnan_inf = & + cism_infnan_inf_type(.true.) +type(cism_infnan_inf_type), public, parameter :: cism_infnan_posinf = & + cism_infnan_inf_type(.true.) +type(cism_infnan_inf_type), public, parameter :: cism_infnan_neginf = & + cism_infnan_inf_type(.false.) + +! Bit patterns for implementation without ieee_arithmetic. +! Note that in order to satisfy gfortran's range check, we have to use +! ibset to set the sign bit from a BOZ pattern. +#ifndef HAVE_IEEE_ARITHMETIC +! Single precision. 
+integer(i4), parameter :: ssnan_pat = int(Z'7FA00000',i4) +integer(i4), parameter :: sqnan_pat = int(Z'7FC00000',i4) +integer(i4), parameter :: sposinf_pat = int(Z'7F800000',i4) +integer(i4), parameter :: sneginf_pat = ibset(sposinf_pat,bit_size(1_i4)-1) +! Double precision. +integer(i8), parameter :: dsnan_pat = int(Z'7FF4000000000000',i8) +integer(i8), parameter :: dqnan_pat = int(Z'7FF8000000000000',i8) +integer(i8), parameter :: dposinf_pat = int(Z'7FF0000000000000',i8) +integer(i8), parameter :: dneginf_pat = ibset(dposinf_pat,bit_size(1_i8)-1) +#endif + +contains + +!--------------------------------------------------------------------- +! TEST FUNCTIONS +!--------------------------------------------------------------------- +! The "isinf" function simply calls "isposinf" and "isneginf". +!--------------------------------------------------------------------- + +! TYPE double,real +elemental function cism_infnan_isinf_r8(x) result(isinf) + real(r8), intent(in) :: x + logical :: isinf + + isinf = cism_infnan_isposinf(x) .or. cism_infnan_isneginf(x) + +end function cism_infnan_isinf_r8 + +#ifdef HAVE_IEEE_ARITHMETIC + +!--------------------------------------------------------------------- +! The "isposinf" and "isneginf" functions get the IEEE class of a +! real, and test to see if the class is equal to ieee_positive_inf +! or ieee_negative_inf. +!--------------------------------------------------------------------- + +! TYPE double,real +elemental function cism_infnan_isposinf_r8(x) result(isposinf) + use, intrinsic :: ieee_arithmetic, only: & + ieee_class, & + ieee_positive_inf, & + operator(==) + real(r8), intent(in) :: x + logical :: isposinf + + isposinf = (ieee_positive_inf == ieee_class(x)) + +end function cism_infnan_isposinf_r8 + +! 
TYPE double,real +elemental function cism_infnan_isneginf_r8(x) result(isneginf) + use, intrinsic :: ieee_arithmetic, only: & + ieee_class, & + ieee_negative_inf, & + operator(==) + real(r8), intent(in) :: x + logical :: isneginf + + isneginf = (ieee_negative_inf == ieee_class(x)) + +end function cism_infnan_isneginf_r8 + +#else +! Don't have ieee_arithmetic. + +!!#ifdef CPRGNU !WHL - Assume this is true +! NaN testing on gfortran. +! TYPE double,real +elemental function cism_infnan_isnan_r8(x) result(is_nan) + real(r8), intent(in) :: x + logical :: is_nan + + is_nan = isnan(x) + +end function cism_infnan_isnan_r8 +! End GNU section. +!!#endif + +!--------------------------------------------------------------------- +! The "isposinf" and "isneginf" functions just test against a known +! bit pattern if we don't have ieee_arithmetic. +!--------------------------------------------------------------------- + +! TYPE double,real +elemental function cism_infnan_isposinf_r8(x) result(isposinf) + real(r8), intent(in) :: x + logical :: isposinf +!!#if ({ITYPE} == TYPEREAL) +!! integer(i4), parameter :: posinf_pat = sposinf_pat +!!#else + integer(i8), parameter :: posinf_pat = dposinf_pat +!!#endif + + isposinf = (x == transfer(posinf_pat,x)) + +end function cism_infnan_isposinf_r8 + +! TYPE double,real +elemental function cism_infnan_isneginf_r8(x) result(isneginf) + real(r8), intent(in) :: x + logical :: isneginf +!!#if ({ITYPE} == TYPEREAL) +!! integer(i4), parameter :: neginf_pat = sneginf_pat +!!#else + integer(i8), parameter :: neginf_pat = dneginf_pat +!!#endif + + isneginf = (x == transfer(neginf_pat,x)) + +end function cism_infnan_isneginf_r8 + +! End ieee_arithmetic conditional. +#endif + +!--------------------------------------------------------------------- +! GENERATION FUNCTIONS +!--------------------------------------------------------------------- +! Two approaches for generation of NaN and Inf values: +! 1. 
With Fortran 2003, use the ieee_value intrinsic to get a value +! from the corresponding class. These are: +! - ieee_signaling_nan +! - ieee_quiet_nan +! - ieee_positive_inf +! - ieee_negative_inf +! 2. Without Fortran 2003, set the IEEE bit patterns directly. +! Use BOZ literals to get an integer with the correct bit +! pattern, then use "transfer" to transfer those bits into a +! real. +!--------------------------------------------------------------------- + +! TYPE double,real +! DIMS 0,1,2,3,4,5,6,7 +pure subroutine set_nan_1d_r8(output, nan) +#ifdef HAVE_IEEE_ARITHMETIC + use, intrinsic :: ieee_arithmetic, only: & + ieee_signaling_nan, & + ieee_quiet_nan, & + ieee_value +#else +!!#if ({ITYPE} == TYPEREAL) +!! integer(i4), parameter :: snan_pat = ssnan_pat +!! integer(i4), parameter :: qnan_pat = sqnan_pat +!!#else + integer(i8), parameter :: snan_pat = dsnan_pat + integer(i8), parameter :: qnan_pat = dqnan_pat +!!#endif +#endif +!! real(r8), intent(out) :: output{DIMSTR} + real(r8), intent(out) :: output + type(cism_infnan_nan_type), intent(in) :: nan + + ! Use scalar temporary for performance reasons, to reduce the cost of + ! the ieee_value call. + real(r8) :: tmp + +#ifdef HAVE_IEEE_ARITHMETIC + if (nan%quiet) then + tmp = ieee_value(tmp, ieee_quiet_nan) + else + tmp = ieee_value(tmp, ieee_signaling_nan) + end if +#else + if (nan%quiet) then + tmp = transfer(qnan_pat, tmp) + else + tmp = transfer(snan_pat, tmp) + end if +#endif + + output = tmp + +end subroutine set_nan_1d_r8 + +! TYPE double,real +! DIMS 0,1,2,3,4,5,6,7 +pure subroutine set_inf_1d_r8(output, inf) +#ifdef HAVE_IEEE_ARITHMETIC + use, intrinsic :: ieee_arithmetic, only: & + ieee_positive_inf, & + ieee_negative_inf, & + ieee_value +#else +!!#if ({ITYPE} == TYPEREAL) +!! integer(i4), parameter :: posinf_pat = sposinf_pat +!! 
integer(i4), parameter :: neginf_pat = sneginf_pat +!!#else + integer(i8), parameter :: posinf_pat = dposinf_pat + integer(i8), parameter :: neginf_pat = dneginf_pat +!!#endif +#endif +!! real(r8), intent(out) :: output{DIMSTR} + real(r8), intent(out) :: output + type(cism_infnan_inf_type), intent(in) :: inf + + ! Use scalar temporary for performance reasons, to reduce the cost of + ! the ieee_value call. + real(r8) :: tmp + +#ifdef HAVE_IEEE_ARITHMETIC + if (inf%positive) then + tmp = ieee_value(tmp,ieee_positive_inf) + else + tmp = ieee_value(tmp,ieee_negative_inf) + end if +#else + if (inf%positive) then + tmp = transfer(posinf_pat, tmp) + else + tmp = transfer(neginf_pat, tmp) + end if +#endif + + output = tmp + +end subroutine set_inf_1d_r8 + +!--------------------------------------------------------------------- +! CONVERSION INTERFACES. +!--------------------------------------------------------------------- +! Function methods to get reals from nan/inf types. +!--------------------------------------------------------------------- + +pure function nan_r8(nan) result(output) + class(cism_infnan_nan_type), intent(in) :: nan + real(r8) :: output + +!! output = nan + !WHL kluge + output = 0._r8 + +end function nan_r8 + +pure function nan_r4(nan) result(output) + class(cism_infnan_nan_type), intent(in) :: nan + real(r4) :: output + +!! output = nan + !WHL kluge + output = 0._r8 + +end function nan_r4 + +pure function inf_r8(inf) result(output) + class(cism_infnan_inf_type), intent(in) :: inf + real(r8) :: output + +!! output = inf + !WHL kluge + output = 0._r8 + +end function inf_r8 + +pure function inf_r4(inf) result(output) + class(cism_infnan_inf_type), intent(in) :: inf + real(r4) :: output + +!! 
output = inf + !WHL kluge + output = 0._r8 + +end function inf_r4 + +end module cism_infnan_mod diff --git a/libglimmer/cism_reprosum_mod.F90 b/libglimmer/cism_reprosum_mod.F90 new file mode 100644 index 00000000..55a034c1 --- /dev/null +++ b/libglimmer/cism_reprosum_mod.F90 @@ -0,0 +1,1789 @@ +!WHL, Nov. 2025: Adapted from shr_reprosum_mod.F90, part of CESM shared code +! Changed 'shr' to 'cism' to avoid name conflicts with shared code + +module cism_reprosum_mod +!----------------------------------------------------------------------- +! +! Purpose: +! Compute reproducible global sums of a set of arrays across an MPI +! subcommunicator +! +! Methods: +! Compute using either or both a scalable, reproducible algorithm and a +! scalable, nonreproducible algorithm: +! * Reproducible (scalable): +! Convert to fixed point (integer vector representation) to enable +! reproducibility when using MPI_Allreduce +! * Alternative usually reproducible (scalable): +! Use parallel double-double algorithm due to Helen He and +! Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm +! * Nonreproducible (scalable): +! Floating point and MPI_Allreduce based. +! If computing both reproducible and nonreproducible sums, compare +! these and report relative difference (if absolute difference +! less than sum) or absolute difference back to calling routine. +! +! Author: P. Worley (based on suggestions from J. White for fixed +! point algorithm and on He/Ding paper for ddpdd +! algorithm) +! +!----------------------------------------------------------------------- + +!----------------------------------------------------------------------- +!- use statements ------------------------------------------------------ +!----------------------------------------------------------------------- +!!#if ( defined noI8 ) +!! ! Workaround for when shr_kind_i8 is not supported. +!! use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i4 +!!#else +!! 
use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i8 +!!#endif + !WHL - Use CISM modules instead of shared modules if possible + use glimmer_global, only: r8 => dp + use glimmer_global, only: i8 + use glimmer_paramets, only: iulog +!! use cism_parallel, only: main_task +! use shr_log_mod, only: s_loglev => shr_log_Level +! use shr_log_mod, only: s_logunit => shr_log_Unit +! use shr_sys_mod, only: shr_sys_abort + + use cism_infnan_mod,only: cism_infnan_inf_type, assignment(=), & + cism_infnan_posinf, cism_infnan_neginf, & + cism_infnan_nan, & + cism_infnan_isnan, cism_infnan_isinf, & + cism_infnan_isposinf, cism_infnan_isneginf +#ifdef TIMING + use perf_mod +#endif +!----------------------------------------------------------------------- +!- module boilerplate -------------------------------------------------- +!----------------------------------------------------------------------- + implicit none + private + +!----------------------------------------------------------------------- +!- include statements -------------------------------------------------- +!----------------------------------------------------------------------- +#include + + save + +!----------------------------------------------------------------------- +! Public interfaces ---------------------------------------------------- +!----------------------------------------------------------------------- + public :: & +!! shr_reprosum_setopts, &! set runtime options +!! shr_reprosum_calc, &! calculate distributed sum +!! shr_reprosum_tolExceeded ! utility function to check relative +!! ! differences against the tolerance + cism_reprosum_setopts, &! set runtime options + cism_reprosum_calc, &! calculate distributed sum + cism_reprosum_tolExceeded ! utility function to check relative + ! differences against the tolerance + +!----------------------------------------------------------------------- +! 
Public data ---------------------------------------------------------- +!----------------------------------------------------------------------- + logical, public :: cism_reprosum_recompute = .false. + + real(r8), public :: cism_reprosum_reldiffmax = -1.0_r8 + + logical, parameter, public :: verbose_reprosum = .false. +!! logical, parameter, public :: verbose_reprosum = .true. + +!----------------------------------------------------------------------- +! Private interfaces ---------------------------------------------------- +!----------------------------------------------------------------------- + private :: & + ddpdd, &! double-double sum routine + split_indices ! split indices among OMP threads + +!----------------------------------------------------------------------- +! Private data ---------------------------------------------------------- +!----------------------------------------------------------------------- + + !---------------------------------------------------------------------------- + ! cism_reprosum_mod options + !---------------------------------------------------------------------------- + logical :: repro_sum_use_ddpdd = .false. + + logical :: repro_sum_allow_infnan = .false. + + !WHLmod +!! integer :: s_logunit = 6 + + CONTAINS + +! +!======================================================================== +! + subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & + repro_sum_allow_infnan_in, & + repro_sum_rel_diff_max_in, & + repro_sum_recompute_in, & + repro_sum_master, & + repro_sum_logunit ) + +!----------------------------------------------------------------------- +! Purpose: Set runtime options +! Author: P. Worley +!----------------------------------------------------------------------- +!------------------------------Arguments-------------------------------- + ! Use DDPDD algorithm instead of fixed precision algorithm + logical, intent(in), optional :: repro_sum_use_ddpdd_in + + ! 
Allow INF or NaN in summands + logical, intent(in), optional :: repro_sum_allow_infnan_in + + ! maximum permissible difference between reproducible and + ! nonreproducible sums + real(r8), intent(in), optional :: repro_sum_rel_diff_max_in + + ! recompute using different algorithm when difference between + ! reproducible and nonreproducible sums is too great + logical, intent(in), optional :: repro_sum_recompute_in + + ! flag indicating whether this process should output log messages + logical, intent(in), optional :: repro_sum_master + + ! unit number for log messages + integer, intent(in), optional :: repro_sum_logunit + +!---------------------------Local Workspace----------------------------- + integer logunit ! unit number for log messages + logical master ! local master? + logical,save :: firstcall = .true. ! first call + integer :: ierr ! MPI error return +!----------------------------------------------------------------------- + + if ( present(repro_sum_master) ) then + master = repro_sum_master + else + master = .false. !WHL master = main_task? + endif + + if ( present(repro_sum_logunit) ) then + logunit = repro_sum_logunit + else +!! logunit = s_logunit + logunit = iulog ! CISM default + endif + + if (.not. firstcall) then +!! call shr_sys_abort('shr_reprosum_setopts ERROR: multiple calls') + write(logunit,*) 'cism_reprosum_setopts: ERROR can only be called once' + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + endif + firstcall = .false. 
+ + if ( present(repro_sum_use_ddpdd_in) ) then + repro_sum_use_ddpdd = repro_sum_use_ddpdd_in + endif + if ( present(repro_sum_allow_infnan_in) ) then + repro_sum_allow_infnan = repro_sum_allow_infnan_in + endif + if ( present(repro_sum_rel_diff_max_in) ) then + cism_reprosum_reldiffmax = repro_sum_rel_diff_max_in + endif + if ( present(repro_sum_recompute_in) ) then + cism_reprosum_recompute = repro_sum_recompute_in + endif + if (master) then + if ( repro_sum_use_ddpdd ) then + write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + 'Using double-double-based (scalable) usually reproducible ', & + 'distributed sum algorithm' + else + write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + 'Using fixed-point-based (scalable) reproducible ', & + 'distributed sum algorithm' + endif + + if ( repro_sum_allow_infnan ) then + write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + 'Will calculate sum when INF or NaN are included in summands' + else + write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + 'Will abort if INF or NaN are included in summands' + endif + + if (cism_reprosum_reldiffmax >= 0._r8) then + write(logunit,*) ' ',& + 'with a maximum relative error tolerance of ', & + cism_reprosum_reldiffmax + if (cism_reprosum_recompute) then + write(logunit,*) ' ',& + 'If tolerance exceeded, sum is recomputed using ', & + 'a serial algorithm.' + else + write(logunit,*) ' ',& + 'If tolerance exceeded, fixed-precision is sum used ', & + 'but a warning is output.' + endif + else + write(logunit,*) ' ',& + 'and not comparing with floating point algorithms.' + endif + + endif + end subroutine cism_reprosum_setopts + +! +!======================================================================== +! 
+ + subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & + nflds, allow_infnan, ddpdd_sum, & + arr_gbl_max, arr_gbl_max_out, & + arr_max_levels, arr_max_levels_out, & + gbl_max_nsummands, gbl_max_nsummands_out,& + gbl_count, repro_sum_validate, & + repro_sum_stats, rel_diff, commid ) +!---------------------------------------------------------------------- +! +! Purpose: +! Compute the global sum of each field in "arr" using the indicated +! communicator with a reproducible yet scalable implementation based +! on a fixed point algorithm. An alternative is to use an "almost +! always reproducible" floating point algorithm, as described below. +! +! The accuracy of the fixed point algorithm is controlled by the +! number of "levels" of integer expansion. The algorithm will calculate +! the number of levels that is required for the sum to be essentially +! exact. (The sum as represented by the integer expansion will be exact, +! but roundoff may perturb the least significant digit of the returned +! real*8 representation of the sum.) The optional parameter arr_max_levels +! can be used to override the calculated value. The optional parameter +! arr_max_levels_out can be used to return the values used. +! +! The algorithm also requires an upper bound on +! the maximum summand (in absolute value) for each field, and will +! calculate this internally. However, if the optional parameters +! arr_max_levels and arr_gbl_max are both set, then the algorithm will +! use the values in arr_gbl_max for the upper bounds instead. If these +! are not upper bounds, or if the upper bounds are not tight enough +! to achieve the requisite accuracy, and if the optional parameter +! repro_sum_validate is NOT set to .false., the algorithm will repeat the +! computation with appropriate upper bounds. If only arr_gbl_max is present, +! then the maxima are computed internally (and the specified values are +! ignored). The optional parameter arr_gbl_max_out can be +! 
used to return the values used. +! +! Finally, the algorithm requires an upper bound on the number of +! local summands across all processes. This will be calculated internally, +! using an MPI collective, but the value in the optional argument +! gbl_max_nsummands will be used instead if (1) it is present, (2) +! it is > 0, and (3) the maximum value and required number of levels +! are also specified. (If the maximum value is calculated, the same +! MPI collective is used to determine the maximum number of local +! summands.) The accuracy of the user-specified value is not checked. +! However, if set to < 1, the value will instead be calculated. If the +! optional parameter gbl_max_nsummands_out is present, then the value +! used (gbl_max_nsummands if >= 1; calculated otherwise) will be +! returned. +! +! If requested (by setting cism_reprosum_reldiffmax >= 0.0 and passing in +! the optional rel_diff parameter), results are compared with a +! nonreproducible floating point algorithm. +! +! Note that the cost of the algorithm is not strongly correlated with +! the number of levels, which primarily shows up as a (modest) increase +! in cost of the MPI_Allreduce as a function of vector length. Rather the +! cost is more a function of (a) the number of integers required to +! represent an individual summand and (b) the number of MPI_Allreduce +! calls. The number of integers required to represent an individual +! summand is 1 or 2 when using 8-byte integers for 8-byte real summands +! when the number of local summands is not too large. As the number of +! local summands increases, the number of integers required increases. +! The number of MPI_Allreduce calls is either 2 (specifying nothing) or +! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max +! correctly). When specifying arr_max_levels and arr_gbl_max +! incorrectly, 3 or 4 MPI_Allreduce calls will be required. +! +! The alternative algorithm is a minor modification of a parallel +! 
implementation of David Bailey's routine DDPDD by Helen He +! and Chris Ding. Bailey uses the Knuth trick to implement quadruple +! precision summation of double precision values with 10 double +! precision operations. The advantage of this algorithm is that +! it requires a single MPI_Allreduce and is less expensive per summand +! than is the fixed precision algorithm. The disadvantage is that it +! is not guaranteed to be reproducible (though it is reproducible +! much more often than is the standard algorithm). This alternative +! is used when the optional parameter ddpdd_sum is set to .true. It is +! also used if the fixed precision algorithm radix assumption does not +! hold. +! +!---------------------------------------------------------------------- +! +! Arguments +! + integer, intent(in) :: nsummands ! number of local summands + integer, intent(in) :: dsummands ! declared first dimension + integer, intent(in) :: nflds ! number of fields + real(r8), intent(in) :: arr(dsummands,nflds) + ! input array + + real(r8), intent(out):: arr_gsum(nflds) + ! global sums + + logical, intent(in), optional :: ddpdd_sum + ! use ddpdd algorithm instead + ! of fixed precision algorithm + + logical, intent(in), optional :: allow_infnan + ! if .true., allow INF or NaN input values. + ! if .false. (the default), then abort. + + real(r8), intent(in), optional :: arr_gbl_max(nflds) + ! upper bound on max(abs(arr)) + + real(r8), intent(out), optional :: arr_gbl_max_out(nflds) + ! calculated upper bound on + ! max(abs(arr)) + + integer, intent(in), optional :: arr_max_levels(nflds) + ! maximum number of levels of + ! integer expansion to use + + integer, intent(out), optional :: arr_max_levels_out(nflds) + ! output of number of levels of + ! integer expansion to used + + integer, intent(in), optional :: gbl_max_nsummands + ! maximum of nsummand over all + ! processes + + integer, intent(out), optional :: gbl_max_nsummands_out + ! calculated maximum nsummands + ! 
over all processes + + integer, intent(in), optional :: gbl_count + ! was total number of summands; + ! now is ignored; use + ! gbl_max_nsummands instead + + logical, intent(in), optional :: repro_sum_validate + ! flag enabling/disabling testing that gmax and max_levels are + ! accurate/sufficient. Default is enabled. + + integer, intent(inout), optional :: repro_sum_stats(6) + ! increment running totals for + ! (1) one-reduction repro_sum + ! (2) two-reduction repro_sum + ! (3) both types in one call + ! (4) nonrepro_sum + ! (5) global max nsummands reduction + ! (6) global lor 3*nflds reduction + + real(r8), intent(out), optional :: rel_diff(2,nflds) + ! relative and absolute + ! differences between fixed + ! and floating point sums + + integer, intent(in), optional :: commid + ! MPI communicator + +! +! Local workspace +! + logical :: abort_inf_nan ! flag indicating whether to + ! abort if INF or NaN found in input + logical :: use_ddpdd_sum ! flag indicating whether to + ! use cism_reprosum_ddpdd or not + logical :: recompute ! flag indicating need to + ! determine gmax/gmin before + ! computing sum + logical :: validate ! flag indicating need to + ! verify gmax and max_levels + ! are accurate/sufficient + logical :: nan_check, inf_check ! flag on whether there are + ! NaNs and INFs in input array + logical :: inf_nan_lchecks(3,nflds)! flags on whether there are + ! NaNs, positive INFs, or negative INFs + ! for each input field locally + logical :: inf_nan_gchecks(3,nflds)! flags on whether there are + ! NaNs, positive INFs, or negative INFs + ! for each input field + logical :: arr_gsum_infnan(nflds) ! flag on whether field sum is a + ! NaN or INF + + integer :: gbl_lor_red ! global lor reduction? (0/1) + integer :: gbl_max_red ! global max reduction? (0/1) + integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1) + integer :: repro_sum_slow ! 2 reduction repro_sum? (0/1) + integer :: repro_sum_both ! both fast and slow? (0/1) + integer :: nonrepro_sum ! 
nonrepro_sum? (0/1) + + integer :: nan_count, inf_count ! local count of NaNs and INFs in + ! input array + integer :: omp_nthreads ! number of OpenMP threads + integer :: mpi_comm ! MPI subcommunicator + integer :: mypid ! MPI process ID (COMM_WORLD) + integer :: tasks ! number of MPI processes + integer :: ierr ! MPI error return + integer :: ifld, isum, ithread ! loop variables + integer :: max_nsummands ! max nsummands over all processes + ! or threads (used in both ways) + + integer, allocatable :: isum_beg(:), isum_end(:) + ! range of summand indices for each + ! OpenMP thread + integer, allocatable :: arr_tlmin_exp(:,:) + ! per thread local exponent minima + integer, allocatable :: arr_tlmax_exp(:,:) + ! per thread local exponent maxima + integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax + ! summand exponent and working min/max + integer :: arr_lmin_exp(nflds) ! local exponent minima + integer :: arr_lmax_exp(nflds) ! local exponent maxima + integer :: arr_lextremes(0:nflds,2)! local exponent extrema + integer :: arr_gextremes(0:nflds,2)! global exponent extrema + + integer :: arr_gmax_exp(nflds) ! global exponents maxima + integer :: arr_gmin_exp(nflds) ! global exponents minima + integer :: arr_max_shift ! maximum safe exponent for + ! value < 1 (so that sum does + ! not overflow) + integer :: max_levels(nflds) ! maximum number of levels of + ! integer expansion to use + integer :: max_level ! maximum value in max_levels + + real(r8) :: xmax_nsummands ! dble of max_nsummands + real(r8) :: arr_lsum(nflds) ! local sums + real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using + ! fast, nonreproducible, + ! floating point alg. + real(r8) :: abs_diff ! absolute difference between + ! fixed and floating point + ! sums + +#ifdef _OPENMP + integer omp_get_max_threads + external omp_get_max_threads +#endif + + !WHL - debug + logical :: test_sum = .false. 
+ real(r8), dimension(dsummands,nflds) :: arr_test + arr_test(:,:) = 0.0d0 + call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) 'In cism_reprosum_mod' + endif +! +!----------------------------------------------------------------------- +! +! initialize local statistics variables + gbl_lor_red = 0 + gbl_max_red = 0 + repro_sum_fast = 0 + repro_sum_slow = 0 + repro_sum_both = 0 + nonrepro_sum = 0 + +! set MPI communicator + if ( present(commid) ) then + mpi_comm = commid + else + mpi_comm = MPI_COMM_WORLD + endif +#ifdef TIMING + call t_barrierf('sync_repro_sum',mpi_comm) +#endif +! check whether should abort if input contains NaNs or INFs + abort_inf_nan = .not. repro_sum_allow_infnan + if ( present(allow_infnan) ) then + abort_inf_nan = .not. allow_infnan + endif +#ifdef TIMING + call t_startf('cism_reprosum_INF_NaN_Chk') +#endif +! initialize flags to indicate that no NaNs or INFs are present in the input data + inf_nan_gchecks = .false. + arr_gsum_infnan = .false. + + !TODO - Remove the inf_nan option; assume abort_inf_nan = T, as in CICE + if (abort_inf_nan) then + + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) ' abort_inf_nan check' + endif + +! check whether input contains NaNs or INFs, and abort if so + nan_check = any(cism_infnan_isnan(arr)) + inf_check = any(cism_infnan_isinf(arr)) + + if (nan_check .or. inf_check) then + + nan_count = count(cism_infnan_isnan(arr)) + inf_count = count(cism_infnan_isinf(arr)) + + if ((nan_count > 0) .or. (inf_count > 0)) then + call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) +!! write(s_logunit,37) real(nan_count,r8), real(inf_count,r8), mypid + write(iulog,37) real(nan_count,r8), real(inf_count,r8), mypid +37 format("CISM_REPROSUM_CALC: Input contains ",e12.5, & + " NaNs and ", e12.5, " INFs on process ", i7) +!! 
call shr_sys_abort("shr_reprosum_calc ERROR: NaNs or INFs in input") + write(iulog,*) "cism_reprosum_calc ERROR: NaNs or INFs in input" + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + endif + + endif + + else + +! determine whether any fields contain NaNs or INFs, and avoid processing them +! via integer expansions + inf_nan_lchecks = .false. + + do ifld=1,nflds + inf_nan_lchecks(1,ifld) = any(cism_infnan_isnan(arr(:,ifld))) + inf_nan_lchecks(2,ifld) = any(cism_infnan_isposinf(arr(:,ifld))) + inf_nan_lchecks(3,ifld) = any(cism_infnan_isneginf(arr(:,ifld))) + end do +#ifdef TIMING + call t_startf("repro_sum_allr_lor") +#endif + call mpi_allreduce (inf_nan_lchecks, inf_nan_gchecks, 3*nflds, & + MPI_LOGICAL, MPI_LOR, mpi_comm, ierr) + gbl_lor_red = 1 +#ifdef TIMING + call t_stopf("repro_sum_allr_lor") +#endif + do ifld=1,nflds + arr_gsum_infnan(ifld) = any(inf_nan_gchecks(:,ifld)) + enddo + + endif +#ifdef TIMING + call t_stopf('cism_reprosum_INF_NaN_Chk') +#endif +! check whether should use cism_reprosum_ddpdd algorithm + use_ddpdd_sum = repro_sum_use_ddpdd + if ( present(ddpdd_sum) ) then + use_ddpdd_sum = ddpdd_sum + endif + +! check whether intrinsic-based algorithm will work on this system +! (requires floating point and integer bases to be the same) +! If not, always use ddpdd. + use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8)) + + if ( use_ddpdd_sum ) then +#ifdef TIMING + call t_startf('cism_reprosum_ddpdd') +#endif + call cism_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & + nflds, mpi_comm) + repro_sum_fast = 1 +#ifdef TIMING + call t_stopf('cism_reprosum_ddpdd') +#endif + else +#ifdef TIMING + call t_startf('cism_reprosum_int') +#endif +! get number of MPI tasks + call mpi_comm_size(mpi_comm, tasks, ierr) + + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) 'Starting reprosum, tasks =', tasks + endif +! get number of OpenMP threads +#ifdef _OPENMP + omp_nthreads = omp_get_max_threads() +#else + omp_nthreads = 1 +#endif + +! 
see if have sufficient information to not require max/min allreduce + recompute = .true. + validate = .false. + if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then + recompute = .false. + +! setting lower bound on max_level*nflds to be 64 to improve OpenMP +! performance for loopb in cism_reprosum_int + max_level = (64/nflds) + 1 + do ifld=1,nflds + if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. & + (arr_max_levels(ifld) > 0)) then + + arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld)) + if (max_level < arr_max_levels(ifld)) & + max_level = arr_max_levels(ifld) + + else + recompute = .true. + endif + enddo + + if (.not. recompute) then + +! determine maximum number of summands in local phases of the +! algorithm +#ifdef TIMING + call t_startf("repro_sum_allr_max") +#endif + if ( present(gbl_max_nsummands) ) then + if (gbl_max_nsummands < 1) then + call mpi_allreduce (nsummands, max_nsummands, 1, & + MPI_INTEGER, MPI_MAX, mpi_comm, ierr) + gbl_max_red = 1 + else + max_nsummands = gbl_max_nsummands + endif + else + call mpi_allreduce (nsummands, max_nsummands, 1, & + MPI_INTEGER, MPI_MAX, mpi_comm, ierr) + gbl_max_red = 1 + endif +#ifdef TIMING + call t_stopf("repro_sum_allr_max") +#endif +! determine maximum shift. Shift needs to be small enough that summation +! does not exceed maximum number of digits in i8. + +! if requested, return max_nsummands before it is redefined + if ( present( gbl_max_nsummands_out) ) then + gbl_max_nsummands_out = max_nsummands + endif + +! summing within each thread first + max_nsummands = (max_nsummands/omp_nthreads) + 1 +! then over threads and tasks + max_nsummands = max(max_nsummands, tasks*omp_nthreads) +! A 'max' is used in the above calculation because the partial sum for +! each thread, calculated in cism_reprosum_int, is postprocessed so that +! each integer in the corresponding vector of integers is reduced in +! magnitude to be less than (radix(IX_8)**arr_max_shift). Therefore, +! 
the maximum shift can be calculated separately for per thread sums +! and sums over threads and tasks, and the smaller value used. This is +! equivalent to using max_nsummands as defined above. + + xmax_nsummands = dble(max_nsummands) + arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1) + if (arr_max_shift < 2) then +!! call shr_sys_abort('repro_sum failed: number of summands too '// & +!! 'large for fixed precision algorithm' ) + write(iulog,*) & + 'repro_sum failed: number of summands too large for fixed precision algorithm' + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + endif + +! calculate sum + if (present(repro_sum_validate)) then + validate = repro_sum_validate + else + validate = .true. + endif + call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & + nflds, arr_max_shift, arr_gmax_exp, & + arr_max_levels, max_level, arr_gsum_infnan, & + validate, recompute, omp_nthreads, mpi_comm) + +! record statistics, etc. + repro_sum_fast = 1 + if (recompute) then + repro_sum_both = 1 + else +! if requested, return specified levels and upper bounds on maxima + if ( present(arr_max_levels_out) ) then + do ifld=1,nflds + arr_max_levels_out(ifld) = arr_max_levels(ifld) + enddo + endif + if ( present(arr_gbl_max_out) ) then + do ifld=1,nflds + arr_gbl_max_out(ifld) = arr_gbl_max(ifld) + enddo + endif + endif + endif + endif + +! do not have sufficient information; calculate global max/min and +! use to compute required number of levels + if (recompute) then + +! record statistic + repro_sum_slow = 1 + +! determine maximum and minimum (non-zero) summand values and +! maximum number of local summands + +! allocate thread-specific work space + allocate(arr_tlmax_exp(nflds,omp_nthreads)) + allocate(arr_tlmin_exp(nflds,omp_nthreads)) + allocate(isum_beg(omp_nthreads)) + allocate(isum_end(omp_nthreads)) + +! split summand index range over OpenMP threads +!! 
call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) + !WHL - debug + call split_indices(dsummands, omp_nthreads, isum_beg, isum_end) + +!$omp parallel do & +!$omp default(shared) & +!$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax) + do ithread=1,omp_nthreads +#ifdef TIMING + call t_startf('repro_sum_loopa') +#endif + do ifld=1,nflds + arr_exp_tlmin = MAXEXPONENT(1._r8) + arr_exp_tlmax = MINEXPONENT(1._r8) + if (.not. arr_gsum_infnan(ifld)) then + do isum=isum_beg(ithread),isum_end(ithread) + if (arr(isum,ifld) .ne. 0.0_r8) then + arr_exp = exponent(arr(isum,ifld)) + arr_exp_tlmin = min(arr_exp,arr_exp_tlmin) + arr_exp_tlmax = max(arr_exp,arr_exp_tlmax) + endif + end do + endif + arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin + arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax + end do +#ifdef TIMING + call t_stopf('repro_sum_loopa') +#endif + end do + + do ifld=1,nflds + arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:)) + arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:)) + end do + deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end) + + arr_lextremes(0,:) = -nsummands + arr_lextremes(1:nflds,1) = -arr_lmax_exp(:) + arr_lextremes(1:nflds,2) = arr_lmin_exp(:) +#ifdef TIMING + call t_startf("repro_sum_allr_minmax") +#endif + call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), & + MPI_INTEGER, MPI_MIN, mpi_comm, ierr) +#ifdef TIMING + call t_stopf("repro_sum_allr_minmax") +#endif + max_nsummands = -arr_gextremes(0,1) + arr_gmax_exp(:) = -arr_gextremes(1:nflds,1) + arr_gmin_exp(:) = arr_gextremes(1:nflds,2) + +! if a field is identically zero or contains INFs or NaNs, arr_gmin_exp +! still equals MAXEXPONENT and arr_gmax_exp still equals MINEXPONENT. +! In this case, set arr_gmin_exp = arr_gmax_exp = MINEXPONENT + do ifld=1,nflds + arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld)) + enddo + +! 
if requested, return upper bounds on observed maxima + if ( present(arr_gbl_max_out) ) then + do ifld=1,nflds + arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld)) + enddo + endif + +! if requested, return max_nsummands before it is redefined + if ( present( gbl_max_nsummands_out) ) then + gbl_max_nsummands_out = max_nsummands + endif + +! determine maximum shift (same as in previous branch, but with calculated +! max_nsummands). Shift needs to be small enough that summation does not +! exceed maximum number of digits in i8. + +! summing within each thread first + max_nsummands = (max_nsummands/omp_nthreads) + 1 +! then over threads and tasks + max_nsummands = max(max_nsummands, tasks*omp_nthreads) +! A 'max' is used in the above calculation because the partial sum for +! each thread, calculated in cism_reprosum_int, is postprocessed so that +! each integer in the corresponding vector of integers is reduced in +! magnitude to be less than (radix(IX_8)**arr_max_shift). Therefore, +! the maximum shift can be calculated separately for per thread sums +! and sums over threads and tasks, and the smaller value used. This is +! equivalent to using max_nsummands as defined above. + + xmax_nsummands = dble(max_nsummands) + arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1) + if (arr_max_shift < 2) then +!! call shr_sys_abort('repro_sum failed: number of summands too '// & +!! 'large for fixed precision algorithm' ) + write(iulog,*) & + 'repro_sum failed: number of summands too large for fixed precision algorithm' + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + endif + +! determine maximum number of levels required for each field +! ((digits(0.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift) +! + 1 because first truncation probably does not involve a maximal shift +! + 1 to guarantee that the integer division rounds up (not down) +! (setting lower bound on max_level*nflds to be 64 to improve OpenMP +! 
performance for loopb in cism_reprosum_int) + max_level = (64/nflds) + 1 + do ifld=1,nflds + max_levels(ifld) = 2 + & + ((digits(0.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) & + / arr_max_shift) + if ( present(arr_max_levels) .and. (.not. validate) ) then +! if validate true, then computation with arr_max_levels failed +! previously + if ( arr_max_levels(ifld) > 0 ) then + max_levels(ifld) = & + min(arr_max_levels(ifld),max_levels(ifld)) + endif + endif + if (max_level < max_levels(ifld)) & + max_level = max_levels(ifld) + enddo + +! if requested, return calculated levels + if ( present(arr_max_levels_out) ) then + do ifld=1,nflds + arr_max_levels_out(ifld) = max_levels(ifld) + enddo + endif + + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) ' max_nsummands =', max_nsummands + write(iulog,*) ' max_levels(1), max_level =', max_levels(1), max_level + write(iulog,*) ' call reprosum_int, pid =', mypid + endif + +! calculate sum + validate = .false. + call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & + nflds, arr_max_shift, arr_gmax_exp, & + max_levels, max_level, arr_gsum_infnan, & + validate, recompute, omp_nthreads, mpi_comm) + + endif +#ifdef TIMING + call t_stopf('cism_reprosum_int') +#endif + endif + +! compare fixed and floating point results + if ( present(rel_diff) ) then + if (cism_reprosum_reldiffmax >= 0.0_r8) then +#ifdef TIMING + call t_barrierf('sync_nonrepro_sum',mpi_comm) + call t_startf('nonrepro_sum') +#endif +! record statistic + nonrepro_sum = 1 +! compute nonreproducible sum + arr_lsum(:) = 0._r8 +!$omp parallel do & +!$omp default(shared) & +!$omp private(ifld, isum) + do ifld=1,nflds + if (.not. 
arr_gsum_infnan(ifld)) then + do isum=1,nsummands + arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld) + end do + endif + end do +#ifdef TIMING + call t_startf("nonrepro_sum_allr_r8") +#endif + call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, & + MPI_REAL8, MPI_SUM, mpi_comm, ierr) +#ifdef TIMING + call t_stopf("nonrepro_sum_allr_r8") + + call t_stopf('nonrepro_sum') +#endif +! determine differences +!$omp parallel do & +!$omp default(shared) & +!$omp private(ifld, abs_diff) + do ifld=1,nflds + abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld)) + if (abs(arr_gsum(ifld)) > abs_diff) then + rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld)) + else + rel_diff(1,ifld) = abs_diff + endif + rel_diff(2,ifld) = abs_diff + enddo + else + rel_diff(:,:) = 0.0_r8 + endif + endif + +! Set field sums to NaN and INF, as needed + do ifld=1,nflds + if (arr_gsum_infnan(ifld)) then + if (inf_nan_gchecks(1,ifld)) then + ! NaN => NaN + arr_gsum(ifld) = cism_infnan_nan + else if (inf_nan_gchecks(2,ifld) .and. inf_nan_gchecks(3,ifld)) then + ! posINF and negINF => NaN + arr_gsum(ifld) = cism_infnan_nan + else if (inf_nan_gchecks(2,ifld)) then + ! posINF only => posINF + arr_gsum(ifld) = cism_infnan_posinf + else if (inf_nan_gchecks(3,ifld)) then + ! negINF only => negINF + arr_gsum(ifld) = cism_infnan_neginf + endif + endif + end do + +! return statistics + if ( present(repro_sum_stats) ) then + repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast + repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow + repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both + repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum + repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red + repro_sum_stats(6) = repro_sum_stats(6) + gbl_lor_red + endif + + !WHL - debug + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) 'Exiting cism_reprosum_calc, arr_gsum =', arr_gsum(:) + endif + + end subroutine cism_reprosum_calc + +! 
+!======================================================================== +! + + subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & + arr_max_shift, arr_gmax_exp, max_levels, & + max_level, skip_field, validate, recompute, & + omp_nthreads, mpi_comm ) +!---------------------------------------------------------------------- +! +! Purpose: +! Compute the global sum of each field in "arr" using the indicated +! communicator with a reproducible yet scalable implementation based +! on a fixed point algorithm. The accuracy of the fixed point algorithm +! is controlled by the number of "levels" of integer expansion, the +! maximum value of which is specified by max_level. +! +!---------------------------------------------------------------------- +! +! Arguments +! + integer, intent(in) :: nsummands ! number of local summands + integer, intent(in) :: dsummands ! declared first dimension + integer, intent(in) :: nflds ! number of fields + integer, intent(in) :: arr_max_shift ! maximum safe exponent for + ! value < 1 (so that sum + ! does not overflow) + integer, intent(in) :: arr_gmax_exp(nflds) + ! exponents of global maxima + integer, intent(in) :: max_levels(nflds) + ! maximum number of levels + ! of integer expansion + integer, intent(in) :: max_level ! maximum value in + ! max_levels + integer, intent(in) :: omp_nthreads ! number of OpenMP threads + integer, intent(in) :: mpi_comm ! MPI subcommunicator + + real(r8), intent(in) :: arr(dsummands,nflds) + ! input array + + logical, intent(in) :: skip_field(nflds) + ! flag indicating whether the sum for this field should be + ! computed or not (used to skip over fields containing + ! NaN or INF summands) + + logical, intent(in) :: validate + ! flag indicating that accuracy of solution generated from + ! arr_gmax_exp and max_levels should be tested + + logical, intent(out):: recompute + ! flag indicating that either the upper bounds are inaccurate, + ! 
or max_levels and arr_gmax_exp do not generate accurate + ! enough sums + + real(r8), intent(out):: arr_gsum(nflds) ! global sums +! +! Local workspace +! + integer, parameter :: max_jlevel = & + 1 + (digits(0_i8)/digits(0.0_r8)) + + integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads) + ! integer vector representing local + ! sum (per thread, per field) + integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds) + ! integer vector representing local + ! sum + integer(i8) :: i8_arr_level ! integer part of summand for current + ! expansion level + integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds) + ! integer vector representing global + ! sum + integer(i8) :: IX_8 ! integer representation of current + ! jlevels of X_8 ('part' of + ! i8_arr_gsum_level) + integer(i8) :: i8_sign ! sign global sum + integer(i8) :: i8_radix ! radix for i8 variables + + integer :: max_error(nflds,omp_nthreads) + ! accurate upper bound on data? + integer :: not_exact(nflds,omp_nthreads) + ! max_levels sufficient to + ! capture all digits? + integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads) + ! range of summand indices for each + ! OpenMP thread + integer :: ifld, isum, ithread + ! loop variables + integer :: arr_exp ! exponent of summand + integer :: arr_shift ! exponent used to generate integer + ! for current expansion level + integer :: ilevel ! current integer expansion level + integer :: offset(nflds) ! beginning location in + ! i8_arr_{g,l}sum_level for integer + ! expansion of current ifld + integer :: voffset ! modification to offset used to + ! include validation metrics + integer :: ioffset ! offset(ifld) + integer :: jlevel ! number of floating point 'pieces' + ! extracted from a given i8 integer + integer :: ierr ! MPI error return + integer :: LX(max_jlevel) ! exponent of X_8 (see below) + integer :: veclth ! total length of i8_arr_lsum_level + integer :: sum_digits ! lower bound on number of significant + ! 
in integer expansion of sum + integer :: curr_exp ! exponent of partial sum during + ! reconstruction from integer vector + integer :: corr_exp ! exponent of current summand in + ! reconstruction from integer vector + + real(r8) :: arr_frac ! fraction of summand + real(r8) :: arr_remainder ! part of summand remaining after + ! current level of integer expansion + real(r8) :: X_8(max_jlevel) ! r8 vector representation of current + ! i8_arr_gsum_level + real(r8) :: RX_8 ! r8 representation of difference + ! between current i8_arr_gsum_level + ! and current jlevels of X_8 + ! (== IX_8). Also used in final + ! scaling step + + logical :: first ! flag used to indicate that just + ! beginning reconstruction of sum + ! from integer vector + + !WHL - debug + integer :: mypid, k + call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) + + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) 'In cism_reprosum_int, pid, arr =', mypid, arr(:,:) + write(iulog,*) 'dsummands, nsummands =', dsummands, nsummands + write(iulog,*) 'size(arr) =', size(arr,1), size(arr,2) + endif + +! +!----------------------------------------------------------------------- +! Save radix of i8 variables in an i8 variable + i8_radix = radix(IX_8) + +! If validating upper bounds, reserve space for validation metrics +! In both cases, reserve an extra level for overflow from the top level + if (validate) then + voffset = 3 + else + voffset = 1 + endif + + ! compute offsets for each field + offset(1) = voffset + do ifld=2,nflds + offset(ifld) = offset(ifld-1) & + + (max_levels(ifld-1) + voffset) + enddo + veclth = offset(nflds) + max_levels(nflds) + +! split summand index range over OpenMP threads +! WHL - Should this be dsummands instead of nsummands? +!! call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) + call split_indices(dsummands, omp_nthreads, isum_beg, isum_end) + + if (verbose_reprosum .and. 
mypid == 0) then + write(iulog,*) 'pid, i8_radix, voffset, veclth, nflds =', mypid, i8_radix, voffset, veclth, nflds + write(iulog,*) ' max_levels =', max_levels(:) + write(iulog,*) ' dsummands, omp_nthreads =', dsummands, omp_nthreads + write(iulog,*) ' isum_beg/end =', isum_beg, isum_end +! write(iulog,*) ' size(i8_arr_lsum_level) = ', size(i8_arr_lsum_level) +! write(iulog,*) ' size(i8_arr_gsum_level) = ', size(i8_arr_gsum_level) + endif + + +! convert local summands to vector of integers and sum +! (Using scale instead of set_exponent because arr_remainder may not be +! "normal" after level 1 calculation) + i8_arr_lsum_level(:) = 0_i8 + + !WHLmod - initialized the gsum also. + ! This avoids having some garbage (lots of nonzero values) in the vector returned from mpi_reduce_sum. + ! (Though I don't think the nonzero values do any harm + i8_arr_gsum_level(:) = 0_i8 + !end WHLmod + +!$omp parallel do & +!$omp default(shared) & +!$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, & +!$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8) + do ithread=1,omp_nthreads +#ifdef TIMING + call t_startf('repro_sum_loopb') +#endif + do ifld=1,nflds + ioffset = offset(ifld) + + max_error(ifld,ithread) = 0 + not_exact(ifld,ithread) = 0 + i8_arr_tlsum_level(:,ifld,ithread) = 0_i8 + + if (skip_field(ifld)) cycle + + do isum=isum_beg(ithread),isum_end(ithread) + + if (verbose_reprosum .and. mypid == 1) then + write (iulog,*) ' pid, isum, ifld, arr(isum,ifld) =', mypid, isum, ifld, arr(isum,ifld) + endif + + arr_remainder = 0.0_r8 + + !WHL - debug + ! If isum_end > size(arr,1) = dsummands, then the next 'if' will try to access nonexistent memory. + ! Adding some logic to exit the do loop first. + ! Not necessary if split_indices is called with dsummands + !TODO: Check the split_indices logic for isum_end +!!!! if (isum > size(arr,1)) exit + + if (arr(isum,ifld) .ne. 
0.0_r8) then + arr_exp = exponent(arr(isum,ifld)) + arr_frac = fraction(arr(isum,ifld)) + + if (verbose_reprosum) then +!!! write(iulog,*) ' pid, isum, arr_exp, arr_frac:', mypid, isum, arr_exp, arr_frac + endif + +! test that global maximum upper bound is an upper bound + if (arr_exp > arr_gmax_exp(ifld)) then + max_error(ifld,ithread) = 1 + exit + endif + +! calculate first shift + arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) + +! determine first (probably) nonzero level (assuming initial fraction is +! 'normal' - algorithm still works if this is not true) +! NOTE: this is critical; scale will set to zero if min exponent is too small. + if (arr_shift < 1) then + ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift + arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) + + do while (arr_shift < 1) + arr_shift = arr_shift + arr_max_shift + ilevel = ilevel + 1 + enddo + else + ilevel = 1 + endif + + if (verbose_reprosum .and. mypid == 1) then + write (iulog,*) ' pid, ilevel, max_levels =', mypid, ilevel, max_levels(ifld) + endif + + if (ilevel .le. max_levels(ifld)) then +! apply first shift/truncate, add it to the relevant running +! sum, and calculate the remainder. + arr_remainder = scale(arr_frac,arr_shift) + i8_arr_level = int(arr_remainder,i8) + i8_arr_tlsum_level(ilevel,ifld,ithread) = & + i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level + arr_remainder = arr_remainder - i8_arr_level + + if (verbose_reprosum .and. mypid <= 1) then + write(iulog,*) ' pid, arr_old, i8_arr_level, arr_new:', & + mypid, scale(arr_frac,arr_shift), i8_arr_level, arr_remainder + endif + +! while the remainder is non-zero, continue to shift, truncate, +! sum, and calculate new remainder + do while ((arr_remainder .ne. 0.0_r8) & + .and. (ilevel < max_levels(ifld))) + ilevel = ilevel + 1 + if (verbose_reprosum .and. 
mypid <= 1) then + write(iulog,*) ' pid, ilevel, arr_old =', mypid, ilevel, arr_remainder + endif + arr_remainder = scale(arr_remainder,arr_max_shift) + i8_arr_level = int(arr_remainder,i8) + i8_arr_tlsum_level(ilevel,ifld,ithread) = & + i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level + + if (verbose_reprosum .and. mypid <= 1) then + write(iulog,*) ' pid, arr_scaled, i8_arr_level:', & + mypid, arr_remainder, i8_arr_level + endif + + arr_remainder = arr_remainder - i8_arr_level + + enddo ! arr_remainder /= 0.0_r8 + + endif + endif ! arr(isum,ifld) /= 0.0_r8 + + if (arr_remainder .ne. 0.0_r8) then + not_exact(ifld,ithread) = 1 + endif + + enddo ! isum + +! postprocess integer vector to eliminate potential for overlap in the following +! sums over threads and processes: if value larger than or equal to +! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in +! vector, resulting in nonoverlapping ranges for each component. Note that +! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums +! over threads and processes do not overflow for ilevel==1. + do ilevel=max_levels(ifld),1,-1 + RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread) + IX_8 = int(scale(RX_8,-arr_max_shift),i8) + if (IX_8 .ne. 0_i8) then + i8_arr_tlsum_level(ilevel-1,ifld,ithread) = & + i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8 + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_arr_tlsum_level(ilevel,ifld,ithread) = & + i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8 + endif + enddo + enddo +#ifdef TIMING + call t_stopf('repro_sum_loopb') +#endif + enddo ! ithread + +! sum contributions from different threads + do ifld=1,nflds + + if (verbose_reprosum .and. mypid == 0) then +!! write(iulog,*) 'ifld =', ifld + endif + + ioffset = offset(ifld) + do ithread = 1,omp_nthreads + do ilevel = 0,max_levels(ifld) + i8_arr_lsum_level(ioffset+ilevel) = & + i8_arr_lsum_level(ioffset+ilevel) & + + i8_arr_tlsum_level(ilevel,ifld,ithread) + + if (verbose_reprosum) then +!! 
write(iulog,*) ' pid, ilevel, ioffset, i8_arr_lsum_level =', & +!! mypid, ilevel, i8_arr_lsum_level(ioffset+ilevel) + endif + + enddo + enddo + enddo + +! record if upper bound was inaccurate or if level expansion stopped +! before full accuracy was achieved + if (validate) then + do ifld=1,nflds + ioffset = offset(ifld) + i8_arr_lsum_level(ioffset-voffset+1) = maxval(max_error(ifld,:)) + i8_arr_lsum_level(ioffset-voffset+2) = maxval(not_exact(ifld,:)) + enddo + endif + +! sum integer vector element-wise +#if ( defined noI8 ) + ! Workaround for when shr_kind_i8 is not supported. +#ifdef TIMING + call t_startf("repro_sum_allr_i4") +#endif + call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & + veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr) + +#ifdef TIMING + call t_stopf("repro_sum_allr_i4") +#endif +#else +#ifdef TIMING + call t_startf("repro_sum_allr_i8") +#endif + + call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & + veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr) + + if (verbose_reprosum .and. mypid < 5) then + do k = 1, size(i8_arr_lsum_level) + if (i8_arr_lsum_level(k) /= 0) then + write(iulog,*) 'pid, k, i8_arr_lsum_level =', mypid, k, i8_arr_lsum_level(k) + endif + enddo + endif + + if (verbose_reprosum .and. mypid == 0) then + do k = 1, size(i8_arr_gsum_level) + if (i8_arr_gsum_level(k) /= 0) then + write(iulog,*) 'pid, k, i8_arr_gsum_level =', mypid, k, i8_arr_gsum_level(k) + endif + enddo + endif + +#ifdef TIMING + call t_stopf("repro_sum_allr_i8") +#endif +#endif + +! Construct global sum from integer vector representation: +! 1) arr_max_shift is the shift applied to fraction(arr_gmax) . +! When shifting back, need to "add back in" true arr_gmax exponent. This was +! removed implicitly by working only with the fraction . +! 2) want to add levels into sum in reverse order (smallest to largest). However, +! even this can generate floating point rounding errors if signs of integers +! alternate. 
To avoid this, do some arithmetic with integer vectors so that all +! components have the same sign. This should keep relative difference between +! using different integer sizes (e.g. i8 and i4) to machine epsilon +! 3) assignment to X_8 will usually lose accuracy since maximum integer +! size is greater than the max number of 'digits' in r8 value (if xmax_nsummands +! correction not very large). Calculate remainder and add in first (since +! smaller). One correction is sufficient for r8 (53 digits) and i8 (63 digits). +! For r4 (24 digits) may need to correct twice. Code is written in a general +! fashion, to work no matter how many corrections are necessary (assuming +! max_jlevel parameter calculation is correct). + + recompute = .false. + do ifld=1,nflds + arr_gsum(ifld) = 0.0_r8 + ioffset = offset(ifld) + +! if validate is .true., test whether the summand upper bound +! was exceeded on any of the processes + if (validate) then + if (i8_arr_gsum_level(ioffset-voffset+1) .ne. 0_i8) then + recompute = .true. + endif + endif + + if (.not. recompute) then + +! preprocess integer vector: +! a) if value larger than or equal to (radix(IX_8)**arr_max_shift), add this 'overlap' +! to next larger integer in vector, resulting in nonoverlapping ranges for each +! component. Note that have "ilevel-1=0" level here as described above. + do ilevel=max_levels(ifld),1,-1 + RX_8 = i8_arr_gsum_level(ioffset+ilevel) + IX_8 = int(scale(RX_8,-arr_max_shift),i8) + if (IX_8 .ne. 0_i8) then + i8_arr_gsum_level(ioffset+ilevel-1) = i8_arr_gsum_level(ioffset+ilevel-1) & + + IX_8 + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_arr_gsum_level(ioffset+ilevel) = i8_arr_gsum_level(ioffset+ilevel) & + - IX_8 + endif + enddo +! b) subtract +/- 1 from larger and add +/- 1 to smaller when necessary +! so that all vector components have the same sign (eliminating loss +! of accuracy arising from difference of large values when +! 
reconstructing r8 sum from integer vector) + ilevel = 0 + do while ((i8_arr_gsum_level(ioffset+ilevel) .eq. 0_i8) & + .and. (ilevel < max_levels(ifld))) + ilevel = ilevel + 1 + enddo +! + if (ilevel < max_levels(ifld)) then + if (i8_arr_gsum_level(ioffset+ilevel) > 0_i8) then + i8_sign = 1_i8 + else + i8_sign = -1_i8 + endif + do jlevel=ilevel,max_levels(ifld)-1 + if (sign(1_i8,i8_arr_gsum_level(ioffset+jlevel)) & + .ne. sign(1_i8,i8_arr_gsum_level(ioffset+jlevel+1))) then + i8_arr_gsum_level(ioffset+jlevel) = i8_arr_gsum_level(ioffset+jlevel) & + - i8_sign + i8_arr_gsum_level(ioffset+jlevel+1) = i8_arr_gsum_level(ioffset+jlevel+1) & + + i8_sign*(i8_radix**arr_max_shift) + endif + enddo + endif + +! start with maximum shift, and work up to larger values + arr_shift = arr_gmax_exp(ifld) & + - max_levels(ifld)*arr_max_shift + curr_exp = 0 + first = .true. + do ilevel=max_levels(ifld),0,-1 + + if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then + jlevel = 1 + +! r8 representation of higher order bits in integer + X_8(jlevel) = i8_arr_gsum_level(ioffset+ilevel) + LX(jlevel) = exponent(X_8(jlevel)) + +! calculate remainder + IX_8 = int(X_8(jlevel),i8) + RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8) + +! repeat using remainder + do while (RX_8 .ne. 0.0_r8) + jlevel = jlevel + 1 + X_8(jlevel) = RX_8 + LX(jlevel) = exponent(RX_8) + IX_8 = IX_8 + int(RX_8,i8) + RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8) + enddo + +! add in contributions, smaller to larger, rescaling for each +! addition to guarantee that exponent of working summand is always +! larger than minexponent + do while (jlevel > 0) + if (first) then + curr_exp = LX(jlevel) + arr_shift + arr_gsum(ifld) = fraction(X_8(jlevel)) + first = .false. 
+ else + corr_exp = curr_exp - (LX(jlevel) + arr_shift) + arr_gsum(ifld) = fraction(X_8(jlevel)) & + + scale(arr_gsum(ifld),corr_exp) + curr_exp = LX(jlevel) + arr_shift + endif + jlevel = jlevel - 1 + enddo + + endif + + arr_shift = arr_shift + arr_max_shift + enddo + +! apply final exponent correction, scaling first if exponent is too small +! to apply directly + corr_exp = curr_exp + exponent(arr_gsum(ifld)) + if (corr_exp .ge. MINEXPONENT(1._r8)) then + arr_gsum(ifld) = set_exponent(arr_gsum(ifld),corr_exp) + else + RX_8 = set_exponent(arr_gsum(ifld), & + corr_exp-MINEXPONENT(1._r8)) + arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8)) + endif + + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) ' ifld, arr_gsum =', ifld, arr_gsum(ifld) + endif + +! if validate is .true. and some precision lost, test whether 'too much' +! was lost, due to too loose an upper bound, too stringent a limit on number +! of levels of expansion, cancellation, .... Calculated by comparing lower +! bound on number of sigificant digits with number of digits in 1.0_r8 . + if (validate) then + if (i8_arr_gsum_level(ioffset-voffset+2) .ne. 0_i8) then + +! find first nonzero level and use exponent for this level, then assume all +! subsequent levels contribute arr_max_shift digits. + sum_digits = 0 + do ilevel=0,max_levels(ifld) + if (sum_digits .eq. 0) then + if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then + X_8(1) = i8_arr_gsum_level(ioffset+ilevel) + LX(1) = exponent(X_8(1)) + sum_digits = LX(1) + endif + else + sum_digits = sum_digits + arr_max_shift + endif + enddo + + if (sum_digits < digits(1.0_r8)) then + recompute = .true. + endif + endif + endif + + endif + + enddo + + if (verbose_reprosum .and. mypid == 0) then + write(iulog,*) ' Done in cism_reprosum_int, pid =', mypid + k = 1 + write(iulog,*) ' arr_gsum, exp, frac =', arr_gsum(k), & + exponent(arr_gsum(k)), fraction(arr_gsum(k)) + endif + + end subroutine cism_reprosum_int + +! 
+!======================================================================== +! + + logical function cism_reprosum_tolExceeded (name, nflds, master, & + logunit, rel_diff ) +!---------------------------------------------------------------------- +! +! Purpose: +! Test whether distributed sum exceeds tolerance and print out a +! warning message. +! +!---------------------------------------------------------------------- +! +! Arguments +! + character(len=*), intent(in) :: name ! distributed sum identifier + integer, intent(in) :: nflds ! number of fields + logical, intent(in) :: master ! process that will write + ! warning messages? + integer, optional, intent(in) :: logunit! unit warning messages + ! written to + real(r8), intent(in) :: rel_diff(2,nflds) + ! relative and absolute + ! differences between fixed + ! and floating point sums + +! +! Local workspace +! + integer :: llogunit ! local log unit + integer :: ifld ! field index + integer :: exceeds_limit ! number of fields whose + ! sum exceeds tolerance + real(r8) :: max_rel_diff ! maximum relative difference + integer :: max_rel_diff_idx ! field index for max. rel. diff. + real(r8) :: max_abs_diff ! maximum absolute difference + integer :: max_abs_diff_idx ! field index for max. abs. diff. +! +!----------------------------------------------------------------------- +! + cism_reprosum_tolExceeded = .false. + if (cism_reprosum_reldiffmax < 0.0_r8) return + + if ( present(logunit) ) then + llogunit = logunit + else +!! llogunit = s_logunit + llogunit = iulog + endif + + ! check that "fast" reproducible sum is accurate enough. 
+ exceeds_limit = 0 + max_rel_diff = 0.0_r8 + max_abs_diff = 0.0_r8 + max_rel_diff_idx = 0 + do ifld=1,nflds + if (rel_diff(1,ifld) > cism_reprosum_reldiffmax) then + exceeds_limit = exceeds_limit + 1 + if (rel_diff(1,ifld) > max_rel_diff) then + max_rel_diff = rel_diff(1,ifld) + max_rel_diff_idx = ifld + endif + if (rel_diff(2,ifld) > max_abs_diff) then + max_abs_diff = rel_diff(2,ifld) + max_abs_diff_idx = ifld + endif + endif + enddo + + if (exceeds_limit > 0) then + if (master) then + write(llogunit,*) trim(name), & + ': difference in fixed and floating point sums ', & + ' exceeds tolerance in ', exceeds_limit, & + ' fields.' + write(llogunit,*) ' Maximum relative diff: (rel)', & + rel_diff(1,max_rel_diff_idx), ' (abs) ', & + rel_diff(2,max_rel_diff_idx) + write(llogunit,*) ' Maximum absolute diff: (rel)', & + rel_diff(1,max_abs_diff_idx), ' (abs) ', & + rel_diff(2,max_abs_diff_idx) + endif + cism_reprosum_tolExceeded = .true. + endif + + + end function cism_reprosum_tolExceeded + +! +!======================================================================== +! + + subroutine cism_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & + nflds, mpi_comm ) +!---------------------------------------------------------------------- +! +! Purpose: +! Compute the global sum of each field in "arr" using the indicated +! communicator with a reproducible yet scalable implementation based +! on He and Ding's implementation of the double-double algorithm. +! +!---------------------------------------------------------------------- +! +! Arguments +! + integer, intent(in) :: nsummands ! number of local summands + integer, intent(in) :: dsummands ! declared first dimension + integer, intent(in) :: nflds ! number of fields + real(r8), intent(in) :: arr(dsummands,nflds) + ! input array + integer, intent(in) :: mpi_comm ! MPI subcommunicator + + real(r8), intent(out):: arr_gsum(nflds) + ! global sums + +! +! Local workspace +! + integer :: old_cw ! for x86 processors, save + ! 
current arithmetic mode + integer :: ifld, isum ! loop variables + integer :: ierr ! MPI error return + + real(r8) :: e, t1, t2 ! temporaries + complex(r8) :: arr_lsum_dd(nflds) ! local sums (in double-double + ! format) + complex(r8) :: arr_gsum_dd(nflds) ! global sums (in double-double + ! format) + + integer, save :: mpi_sumdd + logical, save :: first_time = .true. + +! +!----------------------------------------------------------------------- +! + !WHL - This is a C routine in the CESM shared code: shr_reprosumx86.c + ! Commented out the call. + ! Not sure whether this breaks the algorithm. +!! call cism_reprosumx86_fix_start (old_cw) + + if (first_time) then + call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr) + first_time = .false. + endif + + do ifld=1,nflds + arr_lsum_dd(ifld) = (0.0_r8,0.0_r8) + + do isum=1,nsummands + + ! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s + ! trick. + t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld)) + e = t1 - arr(isum,ifld) + t2 = ((real(arr_lsum_dd(ifld)) - e) & + + (arr(isum,ifld) - (t1 - e))) & + + aimag(arr_lsum_dd(ifld)) + + ! The result is t1 + t2, after normalization. + arr_lsum_dd(ifld) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) + enddo + + enddo +#ifdef TIMING + call t_startf("repro_sum_allr_c16") +#endif + call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, & + MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr) +#ifdef TIMING + call t_stopf("repro_sum_allr_c16") +#endif + do ifld=1,nflds + arr_gsum(ifld) = real(arr_gsum_dd(ifld)) + enddo + + !WHL - commented out; see comment above +!! call cism_reprosumx86_fix_end (old_cw) + + end subroutine cism_reprosum_ddpdd +! +!----------------------------------------------------------------------- +! + subroutine DDPDD (dda, ddb, len, itype) +!---------------------------------------------------------------------- +! +! Purpose: +! Modification of original codes written by David H. Bailey +! This subroutine computes ddb(i) = dda(i)+ddb(i) +! 
+!---------------------------------------------------------------------- +! +! Arguments +! + integer, intent(in) :: len ! array length + complex(r8), intent(in) :: dda(len) ! input + complex(r8), intent(inout) :: ddb(len) ! result + integer, intent(in) :: itype ! unused +! +! Local workspace +! + real(r8) e, t1, t2 + integer i +! +!----------------------------------------------------------------------- +! + do i = 1, len +! Compute dda + ddb using Knuth's trick. + t1 = real(dda(i)) + real(ddb(i)) + e = t1 - real(dda(i)) + t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) & + + aimag(dda(i)) + aimag(ddb(i)) + +! The result is t1 + t2, after normalization. + ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) + enddo + + + end subroutine DDPDD +! +!----------------------------------------------------------------------- +! + subroutine split_indices(total,num_pieces,ibeg,iend) +!---------------------------------------------------------------------- +! +! Purpose: +! Split range into 'num_pieces' +! +!---------------------------------------------------------------------- +! +! Arguments +! + integer, intent(in) :: total + integer, intent(in) :: num_pieces + integer, intent(out) :: ibeg(num_pieces), iend(num_pieces) +! +! Local workspace +! + integer :: itmp1, itmp2, ioffset, i +! +!----------------------------------------------------------------------- +! + itmp1 = total/num_pieces + itmp2 = mod(total,num_pieces) + ioffset = 0 + do i=1,itmp2 + ibeg(i) = ioffset + 1 + iend(i) = ioffset + (itmp1+1) + ioffset = iend(i) + enddo + do i=itmp2+1,num_pieces + ibeg(i) = ioffset + 1 + if (ibeg(i) > total) then + iend(i) = ibeg(i) - 1 + else + iend(i) = ioffset + itmp1 + ioffset = iend(i) + endif + enddo + + end subroutine split_indices +! +!======================================================================== +! 
+end module cism_reprosum_mod diff --git a/libglimmer/glimmer_global.F90 b/libglimmer/glimmer_global.F90 index e9eb745d..0aefadee 100644 --- a/libglimmer/glimmer_global.F90 +++ b/libglimmer/glimmer_global.F90 @@ -30,12 +30,12 @@ module glimmer_global - !> Module holding global variables for Glimmer. Holds real-type - !> kind values, and other global code parameters. + !> Module holding global variables for Glimmer. + !> Holds real and integer kind values and other global code parameters. implicit none - integer,parameter :: sp = kind(1.0) + integer,parameter :: sp = kind(1.0) !> Single precision --- Fortran single-precision real-type kind !> value. Used internally. @@ -44,7 +44,7 @@ module glimmer_global !> the -r8 flag), then this parameter may need to be set in agreement with !> that. - integer,parameter :: dp = kind(1.0d0) + integer,parameter :: dp = kind(1.0d0) !> Double precision --- Fortran double-precision real-type kind !> value. Used internally. @@ -53,6 +53,12 @@ module glimmer_global !> the -r8 flag), then this parameter may need to be set in agreement !> with that + ! Integer kinds + ! Note: Integers are i4 by default. + ! i8 integers can be used to generate reproducible sums + integer, parameter :: i4 = kind(1) + integer, parameter :: i8 = kind(1_8) + !WHL - Removed rk from the code, so commenting out these declarations !!#ifdef GLIMMER_SP !! integer,parameter :: rk=sp !< Precision of glimmer module --- the general Fortran real-type kind value for the Glimmer module and its interfaces. diff --git a/libglimmer/ncdf_template.F90.in b/libglimmer/ncdf_template.F90.in index 177c5e29..378b18b4 100644 --- a/libglimmer/ncdf_template.F90.in +++ b/libglimmer/ncdf_template.F90.in @@ -437,14 +437,14 @@ contains ic=>model%funits%frc_first do while(associated(ic)) -! if (main_task .and. verbose_read_forcing) write(6,*) 'possible forcing times', ic%times +! if (main_task .and. 
verbose_read_forcing) write(iulog,*) 'possible forcing times', ic%times if (ic%read_once) then ! read once at initialization; do not re-read at runtime ic%nc%just_processed = .true. ! prevent the file from being read if (main_task .and. verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_read_forcing; will not re-read the read_once file ', trim(ic%nc%filename) + write(iulog,*) ' ' + write(iulog,*) 'In NAME_read_forcing; will not re-read the read_once file ', trim(ic%nc%filename) endif else ! not a read_once file @@ -464,11 +464,11 @@ contains endif if (main_task .and. verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_read_forcing, model time + eps =', model%numerics%time + eps - write(6,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset - write(6,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle - write(6,*) 'current forcing time =', current_forcing_time + write(iulog,*) ' ' + write(iulog,*) 'In NAME_read_forcing, model time + eps =', model%numerics%time + eps + write(iulog,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset + write(iulog,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle + write(iulog,*) 'current forcing time =', current_forcing_time endif ! Find the time index associated with the previous model time step @@ -476,7 +476,7 @@ contains do t = ic%nt, 1, -1 ! look through the time array backwards if (ic%times(t) <= current_forcing_time - model%numerics%tinc) then t_prev = t - if (main_task .and. verbose_read_forcing) write(6,*) 'Previous time index =', t_prev + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Previous time index =', t_prev exit end if enddo @@ -486,7 +486,7 @@ contains if ( ic%times(t) <= current_forcing_time) then ! use the largest time that is smaller or equal to the current time (stepwise forcing) if (main_task .and. 
verbose_read_forcing) & - write(6,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) + write(iulog,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) ! If this time index (t) is larger than the previous index (t_prev), then read a new time slice. ! Otherwise, we already have the current slice, and there is nothing new to read. @@ -494,7 +494,7 @@ contains ! Set the desired time to be read ic%current_time = t ic%nc%just_processed = .false. ! set this to false so file will be read. - if (main_task .and. verbose_read_forcing) write(6,*) 'Read new forcing slice: t, times(t) =', t, ic%times(t) + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Read new forcing slice: t, times(t) =', t, ic%times(t) endif ! t > t_prev exit ! once we find the time, exit the loop @@ -548,10 +548,10 @@ contains if (ic%read_once) then if (main_task .and. verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_read_forcing_once' - write(6,*) 'Filename =', trim(ic%nc%filename) - write(6,*) 'Number of slices =', ic%nt + write(iulog,*) ' ' + write(iulog,*) 'In NAME_read_forcing_once' + write(iulog,*) 'Filename =', trim(ic%nc%filename) + write(iulog,*) 'Number of slices =', ic%nt endif write(message,*) 'Reading', ic%nt, 'slices of file ', trim(ic%nc%filename), ' just once at initialization' @@ -568,7 +568,7 @@ contains do t = 1, ic%nt if (main_task .and. verbose_read_forcing) then - write(6,*) 'Read new forcing slice: t index, times(t) =', t, ic%times(t) + write(iulog,*) 'Read new forcing slice: t index, times(t) =', t, ic%times(t) endif ! Set the desired time to be read @@ -588,7 +588,7 @@ contains endif ! read_once if (main_task .and. verbose_read_forcing) then - write(6,*) 'Final ic%nc%vars = ', trim(ic%nc%vars) + write(iulog,*) 'Final ic%nc%vars = ', trim(ic%nc%vars) endif ic=>ic%next @@ -650,13 +650,13 @@ contains endif if (main_task .and. 
verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_retrieve_forcing, model time + eps =', model%numerics%time + eps - write(6,*) 'Filename = ', trim(ic%nc%filename) - write(6,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset - write(6,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle - write(6,*) 'current forcing time =', current_forcing_time - write(6,*) 'variable list:', trim(ic%nc%vars) + write(iulog,*) ' ' + write(iulog,*) 'In NAME_retrieve_forcing, model time + eps =', model%numerics%time + eps + write(iulog,*) 'Filename = ', trim(ic%nc%filename) + write(iulog,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset + write(iulog,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle + write(iulog,*) 'current forcing time =', current_forcing_time + write(iulog,*) 'variable list:', trim(ic%nc%vars) endif ! Optionally, associate the current forcing time with a different date in the forcing file. @@ -669,8 +669,8 @@ contains open(unit=11, file=trim(ic%shuffle_file), status='old') this_year = int(current_forcing_time - model%numerics%tstart) if (main_task .and. verbose_read_forcing) then - write(6,*) 'shuffle_file = ', trim(ic%shuffle_file) - write(6,*) 'tstart, this_year =', model%numerics%tstart, this_year + write(iulog,*) 'shuffle_file = ', trim(ic%shuffle_file) + write(iulog,*) 'tstart, this_year =', model%numerics%tstart, this_year endif forcing_year = 0 do while (forcing_year == 0) @@ -684,11 +684,11 @@ contains decimal_year = current_forcing_time - floor(current_forcing_time) current_forcing_time = real(forcing_year,dp) + decimal_year if (main_task .and. verbose_read_forcing) then - write(6,*) 'forcing_year, decimal =', forcing_year, decimal_year - write(6,*) 'shuffled forcing_time =', current_forcing_time + write(iulog,*) 'forcing_year, decimal =', forcing_year, decimal_year + write(iulog,*) 'shuffled forcing_time =', current_forcing_time endif else - if (main_task .and. 
verbose_read_forcing) write(6,*) 'no shuffle_file' + if (main_task .and. verbose_read_forcing) write(iulog,*) 'no shuffle_file' endif ! shuffle_file exists ! Find the time index associated with the previous model time step @@ -696,7 +696,7 @@ contains do t = ic%nt, 1, -1 ! look through the time array backwards if (ic%times(t) <= current_forcing_time - model%numerics%tinc) then t_prev = t - if (main_task .and. verbose_read_forcing) write(6,*) 'Previous time index =', t_prev + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Previous time index =', t_prev exit end if enddo @@ -706,14 +706,14 @@ contains if ( ic%times(t) <= current_forcing_time) then ! use the largest time that is smaller or equal to the current time (stepwise forcing) if (main_task .and. verbose_read_forcing) & - write(6,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) + write(iulog,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) ! If this time index (t) is larger than the previous index (t_prev), then retrieve a new time slice. ! Otherwise, we already have the current slice, and there is nothing new to read. if (t > t_prev) then ! Set the desired time to be read ic%current_time = t retrieve_new_slice = .true. - if (main_task .and. verbose_read_forcing) write(6,*) 'Retrieve new forcing slice' + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Retrieve new forcing slice' write(message,*) & 'Retrieve slice', t, 'at forcing time', ic%times(t), 'from file ', trim(ic%nc%filename) call write_log(message) diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index 39a06994..972b8b32 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -30,6 +30,16 @@ module cism_parallel use glimmer_global, only : dp, sp use glimmer_paramets, only: iulog +!TODO - Not sure setopts is needed +!TODO - Remove coupled ifdefs and always use the CISM version? 
+!TODO - Use cism_reprosum_mod from individual functions +#ifdef CCSMCOUPLED + use shr_reprosum_mod, only: shr_reprosum_setopts, shr_reprosum_calc +#else + use cism_reprosum_mod, only: cism_reprosum_setopts, cism_reprosum_calc +#endif + use cism_reprosum_mod, only: verbose_reprosum + implicit none ! integers associated with the main global communicator @@ -144,6 +154,10 @@ module cism_parallel integer :: main_rank_col ! integer ID for the master task on the column logical :: main_task_col ! true if this_rank_col = main_rank_col + ! option to compute reproducible sums + logical :: reprosum ! if true, compute reproducible global sums + ! (interface parallel_reduce_sum) + end type parallel_type ! Information on the local & global bounds of an array @@ -291,17 +305,21 @@ module cism_parallel interface parallel_global_sum module procedure parallel_global_sum_integer_2d module procedure parallel_global_sum_integer_3d - module procedure parallel_global_sum_real4_2d module procedure parallel_global_sum_real8_2d module procedure parallel_global_sum_real8_3d end interface - interface parallel_global_sum_staggered - module procedure parallel_global_sum_staggered_3d_real8 - module procedure parallel_global_sum_staggered_3d_real8_nvar - module procedure parallel_global_sum_staggered_2d_real8 - module procedure parallel_global_sum_staggered_2d_real8_nvar - end interface parallel_global_sum_staggered + interface parallel_global_sum_patch + module procedure parallel_global_sum_patch_integer_2d + module procedure parallel_global_sum_patch_real8_2d + end interface parallel_global_sum_patch + + interface parallel_global_sum_stagger + module procedure parallel_global_sum_stagger_real8_2d + module procedure parallel_global_sum_stagger_real8_3d + module procedure parallel_global_sum_stagger_real8_2d_nflds + module procedure parallel_global_sum_stagger_real8_3d_nflds + end interface parallel_global_sum_stagger interface parallel_halo module procedure parallel_halo_integer_2d @@ -2427,7 
+2445,9 @@ end function distributed_get_var_real8_3d subroutine distributed_grid(ewn, nsn, & parallel, & - nhalo_in, global_bc_in) + nhalo_in, & + global_bc_in, & + reprosum_in) ! Divide the global domain into blocks, with one task per block. ! Set various grid and domain variables for the local task. @@ -2437,6 +2457,7 @@ subroutine distributed_grid(ewn, nsn, & type(parallel_type), intent(inout) :: parallel ! info for parallel communication, computed here integer, intent(in), optional :: nhalo_in ! number of rows of halo cells character(*), intent(in), optional :: global_bc_in ! string indicating the global BC option + logical, intent(in), optional :: reprosum_in ! if true, compute reproducible global sums integer :: best,i,j,metric real(dp) :: rewtasks,rnstasks @@ -2479,7 +2500,8 @@ subroutine distributed_grid(ewn, nsn, & staggered_ilo => parallel%staggered_ilo, & staggered_ihi => parallel%staggered_ihi, & staggered_jlo => parallel%staggered_jlo, & - staggered_jhi => parallel%staggered_jhi & + staggered_jhi => parallel%staggered_jhi, & + reprosum => parallel%reprosum & ) ! set the boundary conditions (periodic by default) @@ -2665,6 +2687,25 @@ subroutine distributed_grid(ewn, nsn, & call parallel_stop(__FILE__, __LINE__) endif + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + + ! If computing reproducible sums, then set some options + !TODO - Are these saved from one call to the next? + ! Note: For standalone CISM, reprosum = F by default; can set = T in the config file + ! For CESM coupled runs, reprosum = T by default + if (reprosum) then + +#ifdef CCSM_COUPLED +!! call shr_reprosum_setops() +#else +!! call cism_reprosum_setops() +#endif + endif ! reprosum + ! call parallel_barrier ! 
write(iulog,*) 'task, west, east, south, north:', this_rank, west, east, south, north @@ -2693,6 +2734,7 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & nx_block, ny_block, & ice_domain_mask, & parallel, & + reprosum_in, & inquire_only) ! Divide the global domain into blocks, setting various grid and domain variables @@ -2731,6 +2773,7 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & integer, intent(in), dimension(:,:) :: & ice_domain_mask ! = 1 where ice is potentially present and active, else = 0 type(parallel_type), intent(inout) :: parallel ! info for parallel communication, computed here + logical, intent(in), optional :: reprosum_in ! if true, compute reproducible global sums logical, intent(in), optional :: inquire_only ! if true, then report the number of active blocks and abort integer :: i, j, nb, nt @@ -2809,7 +2852,8 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & staggered_ilo => parallel%staggered_ilo, & staggered_ihi => parallel%staggered_ihi, & staggered_jlo => parallel%staggered_jlo, & - staggered_jhi => parallel%staggered_jhi & + staggered_jhi => parallel%staggered_jhi, & + reprosum => parallel%reprosum & ) if (present(inquire_only)) then @@ -3283,7 +3327,28 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & ! southwest_corner, southeast_corner, northwest_corner, northeast_corner endif - ! Uncomment to print grid geometry + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + + ! If computing reproducible sums, then set some options + !TODO - Are these saved from one call to the next? + ! Note: For standalone CISM, reprosum = F by default; can set = T in the config file + ! For CESM coupled runs, reprosum = T by default + + if (reprosum) then + +#ifdef CCSM_COUPLED +!! call shr_reprosum_setops() +#else +!! call cism_reprosum_setops() +#endif + + endif ! reprosum + +! Uncomment to print grid geometry ! write(iulog,*) " " ! 
write(iulog,*) "Process ", this_rank, " Total = ", tasks, " ewtasks = ", ewtasks, " nstasks = ", nstasks ! write(iulog,*) "Process ", this_rank, " ewrank = ", ewrank, " nsrank = ", nsrank @@ -6082,18 +6147,18 @@ end function parallel_global_sum_integer_3d !======================================================================= - function parallel_global_sum_real4_2d(a, parallel, mask_2d) + function parallel_global_sum_real8_2d(a, parallel, mask_2d) - ! Calculates the global sum of a 2D single-precision field + ! Calculates the global sum of a 2D double-precision field - real(sp),dimension(:,:),intent(in) :: a + real(dp), dimension(:,:), intent(in) :: a type(parallel_type) :: parallel integer, dimension(:,:), intent(in), optional :: mask_2d integer :: i, j integer, dimension(parallel%local_ewn,parallel%local_nsn) :: mask - real(sp) :: local_sum - real(sp) :: parallel_global_sum_real4_2d + real(dp) :: local_sum + real(dp) :: parallel_global_sum_real8_2d associate( & local_ewn => parallel%local_ewn, & @@ -6105,130 +6170,183 @@ function parallel_global_sum_real4_2d(a, parallel, mask_2d) mask = 1 endif - local_sum = 0.0 - do j = nhalo+1, local_nsn-nhalo - do i = nhalo+1, local_ewn-nhalo - if (mask(i,j) == 1) then - local_sum = local_sum + a(i,j) - endif + if (parallel%reprosum) then ! compute using cism_reprosum_calc + + !TODO - Add the code here + call parallel_stop(__FILE__,__LINE__) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum = 0.0d0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + local_sum = local_sum + a(i,j) + endif + enddo enddo - enddo - parallel_global_sum_real4_2d = parallel_reduce_sum(local_sum) + + ! Compute the global sum + parallel_global_sum_real8_2d = parallel_reduce_sum(local_sum) + + endif ! 
reprosum end associate - end function parallel_global_sum_real4_2d + end function parallel_global_sum_real8_2d !======================================================================= - function parallel_global_sum_real8_2d(a, parallel, mask_2d) + function parallel_global_sum_real8_3d(a, parallel, mask_2d) - ! Calculates the global sum of a 2D double-precision field + ! Calculates the global sum of a 3D double-precision field + ! Note: The vertical dimension should be the first dimension of the input field. - real(dp), dimension(:,:), intent(in) :: a + real(dp), dimension(:,:,:),intent(in) :: a type(parallel_type) :: parallel integer, dimension(:,:), intent(in), optional :: mask_2d - integer :: i, j + integer :: i, j, k + integer :: kmax integer, dimension(parallel%local_ewn,parallel%local_nsn) :: mask real(dp) :: local_sum - real(dp) :: parallel_global_sum_real8_2d + real(dp) :: parallel_global_sum_real8_3d associate( & local_ewn => parallel%local_ewn, & local_nsn => parallel%local_nsn) + kmax = size(a,1) + + ! Note: The mask is 2D, since typically all layers in a column are either masked in or masked out if (present(mask_2d)) then mask = mask_2d else mask = 1 endif - local_sum = 0.0d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc + + !TODO - Add the code here + call parallel_stop(__FILE__,__LINE__) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + do k = 1, kmax + local_sum = local_sum + a(k,i,j) + enddo + endif + enddo + enddo + parallel_global_sum_real8_3d = parallel_reduce_sum(local_sum) + + endif ! reprosum + + end associate + + end function parallel_global_sum_real8_3d + +!======================================================================= + ! subroutines belonging to the parallel_global_sum_patch interface + + function parallel_global_sum_patch_integer_2d(a, npatch, patch_id, parallel) + + ! 
Calculates the global sum of a 2D double-precision field over each + ! user-defined patch of the domain. + ! The number of patches = npatch. + ! Each cell has an integer ID assigning it to at most one patch. + ! If a cell has patch_id = 0, it belongs to no patches. + + integer, dimension(:,:), intent(in) :: a + integer, intent(in) :: npatch + integer, dimension(:,:), intent(in) :: patch_id + type(parallel_type) :: parallel + + integer :: i, j, np + integer, dimension(npatch) :: local_patch_sum + integer, dimension(npatch) :: parallel_global_sum_patch_integer_2d + + associate( & + local_ewn => parallel%local_ewn, & + local_nsn => parallel%local_nsn) + + local_patch_sum = 0 + do j = nhalo+1, local_nsn-nhalo do i = nhalo+1, local_ewn-nhalo - if (mask(i,j) == 1) then - local_sum = local_sum + a(i,j) + np = patch_id(i,j) + if (np > 0) then + local_patch_sum(np) = local_patch_sum(np) + a(i,j) endif enddo enddo - parallel_global_sum_real8_2d = parallel_reduce_sum(local_sum) + + parallel_global_sum_patch_integer_2d = parallel_reduce_sum(local_patch_sum) end associate - end function parallel_global_sum_real8_2d + end function parallel_global_sum_patch_integer_2d !======================================================================= - function parallel_global_sum_real8_3d(a, parallel, mask_3d) + function parallel_global_sum_patch_real8_2d(a, npatch, patch_id, parallel) - ! Calculates the global sum of a 3D double-precision field - ! Note: The vertical dimension should be the first dimension of the input field. + ! Calculates the global sum of a 2D double-precision field over each + ! user-defined patch of the domain. + ! The number of patches = npatch. + ! Each cell has an integer ID assigning it to at most one patch. + ! If a cell has patch_id = 0, it belongs to no patches. 
+ !TODO - Add a reprosum option with npatch = nflds - real(dp), dimension(:,:,:),intent(in) :: a + real(dp), dimension(:,:), intent(in) :: a + integer, intent(in) :: npatch + integer, dimension(:,:), intent(in) :: patch_id type(parallel_type) :: parallel - integer, dimension(:,:,:), intent(in), optional :: mask_3d - integer :: i, j, k - integer :: kmax - integer, dimension(size(a,1),parallel%local_ewn,parallel%local_nsn) :: mask - real(dp) :: local_sum - real(dp) :: parallel_global_sum_real8_3d + integer :: i, j, np + real(dp), dimension(npatch) :: local_patch_sum + real(dp), dimension(npatch) :: parallel_global_sum_patch_real8_2d associate( & local_ewn => parallel%local_ewn, & local_nsn => parallel%local_nsn) - kmax = size(a,1) - - if (present(mask_3d)) then - mask = mask_3d - else - mask = 1 - endif + local_patch_sum = 0.0d0 - local_sum = 0 do j = nhalo+1, local_nsn-nhalo do i = nhalo+1, local_ewn-nhalo - do k = 1, kmax - if (mask(k,i,j) == 1) then - local_sum = local_sum + a(k,i,j) - endif - enddo + np = patch_id(i,j) + if (np > 0) then + local_patch_sum(np) = local_patch_sum(np) + a(i,j) + endif enddo enddo - parallel_global_sum_real8_3d = parallel_reduce_sum(local_sum) + + parallel_global_sum_patch_real8_2d = parallel_reduce_sum(local_patch_sum) end associate - end function parallel_global_sum_real8_3d + end function parallel_global_sum_patch_real8_2d !======================================================================= + ! subroutines belonging to the parallel_global_sum_stagger interface - ! subroutines belonging to the parallel_global_sum_staggered interface - !TODO - Turn these into functions, analogous to the parallel_global_sum functions above. - - subroutine parallel_global_sum_staggered_3d_real8(& - nx, ny, & - nz, parallel, & - global_sum, & - work1, work2) - - ! Sum one or two local arrays on the staggered grid, then take the global sum. - - integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - nz ! 
number of vertical layers at which velocity is computed + function parallel_global_sum_stagger_real8_2d(arr1, parallel, arr2) - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + ! Calculate the global sum of a 2D double-precision field on the staggered grid + ! Similar to unstagged version, except it uses staggered_ilo/ihi/jlo/jhi - real(dp), intent(out) :: global_sum ! global sum - real(dp), intent(in), dimension(nz,nx-1,ny-1) :: work1 ! local array - real(dp), intent(in), dimension(nz,nx-1,ny-1), optional :: work2 ! local array + real(dp), dimension(:,:), intent(in) :: arr1 + type(parallel_type) :: parallel + real(dp), dimension(:,:), intent(in), optional :: arr2 - integer :: i, j, k + integer :: i, j real(dp) :: local_sum + real(dp) :: parallel_global_sum_stagger_real8_2d integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -6239,58 +6357,53 @@ subroutine parallel_global_sum_staggered_3d_real8(& staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - local_sum = 0.d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc - ! sum over locally owned velocity points + !TODO - Add the code here + call parallel_stop(__FILE__,__LINE__) - if (present(work2)) then - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - do k = 1, nz - local_sum = local_sum + work1(k,i,j) + work2(k,i,j) - enddo - enddo - enddo - else - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - do k = 1, nz - local_sum = local_sum + work1(k,i,j) + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum = 0.0d0 + + if (present(arr2)) then ! compute global sum of arr1 + arr2 + + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum = local_sum + arr1(i,j) + arr2(i,j) enddo enddo - enddo - endif - ! take the global sum + else ! 
compute global sum of arr1 - global_sum = parallel_reduce_sum(local_sum) + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum = local_sum + arr1(i,j) + enddo + enddo - end subroutine parallel_global_sum_staggered_3d_real8 + endif -!======================================================================= + parallel_global_sum_stagger_real8_2d = parallel_reduce_sum(local_sum) - subroutine parallel_global_sum_staggered_3d_real8_nvar(& - nx, ny, & - nz, parallel, & - global_sum, & - work1, work2) + endif ! reprosum - ! Sum one or two local arrays on the staggered grid, then take the global sum. + end function parallel_global_sum_stagger_real8_2d - integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - nz ! number of vertical layers at which velocity is computed +!======================================================================= - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + function parallel_global_sum_stagger_real8_3d(arr1, parallel, arr2) - real(dp), intent(out), dimension(:) :: global_sum ! global sum + ! Calculate the global sum of a 3D double-precision field on the staggered grid + ! Assumes k is the first index, followed by i and j - real(dp), intent(in), dimension(nz,nx-1,ny-1,size(global_sum)) :: work1 ! local array - real(dp), intent(in), dimension(nz,nx-1,ny-1,size(global_sum)), optional :: work2 ! local array + real(dp), dimension(:,:,:), intent(in) :: arr1 + type(parallel_type) :: parallel + real(dp), dimension(:,:,:), intent(in), optional :: arr2 - integer :: i, j, k, n, nvar - real(dp), dimension(size(global_sum)) :: local_sum + integer :: i, j, k, nz + real(dp) :: local_sum + real(dp) :: parallel_global_sum_stagger_real8_3d integer :: & staggered_ilo, staggered_ihi, & ! 
bounds of locally owned vertices on staggered grid @@ -6301,63 +6414,66 @@ subroutine parallel_global_sum_staggered_3d_real8_nvar(& staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - nvar = size(global_sum) + nz = size(arr1,1) - local_sum(:) = 0.d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc - do n = 1, nvar + !TODO - Add the code here + call parallel_stop(__FILE__,__LINE__) - ! sum over locally owned velocity points + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum = 0.0d0 + + if (present(arr2)) then ! compute global sum of arr1 + arr2 - if (present(work2)) then do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi do k = 1, nz - local_sum(n) = local_sum(n) + work1(k,i,j,n) + work2(k,i,j,n) + local_sum = local_sum + arr1(k,i,j) + arr2(k,i,j) enddo enddo enddo - else + + else ! compute global sum of arr1 + do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi do k = 1, nz - local_sum(n) = local_sum(n) + work1(k,i,j,n) + local_sum = local_sum + arr1(k,i,j) enddo enddo enddo - endif - enddo ! nvar + endif - ! take the global sum + parallel_global_sum_stagger_real8_3d = parallel_reduce_sum(local_sum) - global_sum(:) = parallel_reduce_sum(local_sum(:)) + endif ! reprosum - end subroutine parallel_global_sum_staggered_3d_real8_nvar + end function parallel_global_sum_stagger_real8_3d !======================================================================= - subroutine parallel_global_sum_staggered_2d_real8(& - nx, ny, & - parallel, & - global_sum, & - work1, work2) + function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) ! Sum one or two local arrays on the staggered grid, then take the global sum. + ! The final index is equal to the number of independent fields to be summed. + + real(dp), dimension(:,:,:), intent(in) :: arr1 - integer, intent(in) :: & - nx, ny ! 
horizontal grid dimensions (for scalars) + integer, intent(in) :: nflds type(parallel_type), intent(in) :: & parallel ! info for parallel communication - real(dp), intent(out) :: global_sum ! global sum + real(dp), dimension(:,:,:), intent(in), optional :: arr2 - real(dp), intent(in), dimension(nx-1,ny-1) :: work1 ! local array - real(dp), intent(in), dimension(nx-1,ny-1), optional :: work2 ! local array + real(dp), dimension(size(arr1,3)) :: parallel_global_sum_stagger_real8_2d_nflds - integer :: i, j - real(dp) :: local_sum + integer :: i, j, n + + real(dp), dimension(size(arr1,3)) :: local_sum integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -6368,55 +6484,64 @@ subroutine parallel_global_sum_staggered_2d_real8(& staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - local_sum = 0.d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc - ! sum over locally owned velocity points + !TODO - Add the code here + call parallel_stop(__FILE__,__LINE__) - if (present(work2)) then - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - local_sum = local_sum + work1(i,j) + work2(i,j) - enddo - enddo - else - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - local_sum = local_sum + work1(i,j) - enddo - enddo - endif + else ! compute using parallel_reduce_sum (not reproducible) - ! take the global sum + local_sum(:) = 0.d0 + + do n = 1, nflds + + ! sum over locally owned velocity points + + if (present(arr2)) then + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum(n) = local_sum(n) + arr1(i,j,n) + arr2(i,j,n) + enddo + enddo + else + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum(n) = local_sum(n) + arr1(i,j,n) + enddo + enddo + endif - global_sum = parallel_reduce_sum(local_sum) + enddo ! nflds - end subroutine parallel_global_sum_staggered_2d_real8 + endif ! 
reprosum + + ! take the global sum + parallel_global_sum_stagger_real8_2d_nflds = parallel_reduce_sum(local_sum(:)) + + end function parallel_global_sum_stagger_real8_2d_nflds !======================================================================= - subroutine parallel_global_sum_staggered_2d_real8_nvar(& - nx, ny, & - parallel, & - global_sum, & - work1, work2) + function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) ! Sum one or two local arrays on the staggered grid, then take the global sum. + ! Assumes k is the first index, followed by i and j. + ! The final index is equal to the number of independent fields to be summed. + + real(dp), dimension(:,:,:,:), intent(in) :: arr1 - integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) + integer, intent(in) :: nflds ! size of final index; number of global sums to be computed type(parallel_type), intent(in) :: & parallel ! info for parallel communication - real(dp), intent(out), dimension(:) :: & - global_sum ! global sum + real(dp), dimension(:,:,:,:), intent(in), optional :: arr2 - real(dp), intent(in), dimension(nx-1,ny-1,size(global_sum)) :: work1 ! local array - real(dp), intent(in), dimension(nx-1,ny-1,size(global_sum)), optional :: work2 ! local array + real(dp), dimension(size(arr1,4)) :: parallel_global_sum_stagger_real8_3d_nflds - integer :: i, j, n, nvar + integer :: i, j, k, n, nz - real(dp), dimension(size(global_sum)) :: local_sum + real(dp), dimension(size(arr1,4)) :: local_sum integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -6427,38 +6552,49 @@ subroutine parallel_global_sum_staggered_2d_real8_nvar(& staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - nvar = size(global_sum) + nz = size(arr1,1) - local_sum(:) = 0.d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc - do n = 1, nvar + !TODO - Add the code here + call parallel_stop(__FILE__,__LINE__) - ! 
sum over locally owned velocity points + else ! compute using parallel_reduce_sum (not reproducible) - if (present(work2)) then - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - local_sum(n) = local_sum(n) + work1(i,j,n) + work2(i,j,n) + local_sum(:) = 0.d0 + + do n = 1, nflds + + ! sum over locally owned velocity points + + if (present(arr2)) then + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + local_sum(n) = local_sum(n) + arr1(k,i,j,n) + arr2(k,i,j,n) + enddo + enddo enddo - enddo - else - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - local_sum(n) = local_sum(n) + work1(i,j,n) + else + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + local_sum(n) = local_sum(n) + arr1(k,i,j,n) + enddo + enddo enddo - enddo - endif + endif - enddo ! nvar + enddo ! nflds - ! take the global sum + endif ! reprosum - global_sum(:) = parallel_reduce_sum(local_sum(:)) + ! take the global sum + parallel_global_sum_stagger_real8_3d_nflds = parallel_reduce_sum(local_sum(:)) - end subroutine parallel_global_sum_staggered_2d_real8_nvar + end function parallel_global_sum_stagger_real8_3d_nflds !======================================================================= - ! functions belonging to the parallel_is_zero interface function parallel_is_zero_integer_2d(a) @@ -8701,13 +8837,20 @@ end function parallel_reduce_sum_real4 function parallel_reduce_sum_real8(x) use mpi_mod + implicit none + real(dp) :: x integer :: ierror real(dp) :: recvbuf,sendbuf, parallel_reduce_sum_real8 + ! Input and output arguments for subroutine cism_reprosum_calc + real(dp), dimension(:,:), allocatable :: arr ! array to be summed over processors + real(dp), dimension(:), allocatable :: arr_gsum ! global sum of arr + ! 
begin + sendbuf = x call mpi_allreduce(sendbuf,recvbuf,1,mpi_real8,mpi_sum,comm,ierror) parallel_reduce_sum_real8 = recvbuf @@ -8742,14 +8885,151 @@ function parallel_reduce_sum_real8_nvar(x) integer :: ierror, nvar real(dp), dimension(size(x)) :: recvbuf,sendbuf, parallel_reduce_sum_real8_nvar + ! Input and output arguments for subroutine cism_reprosum_calc + real(dp), dimension(:,:), allocatable :: arr ! array to be summed over processors + real(dp), dimension(:), allocatable :: arr_gsum ! global sum of arr + ! begin nvar = size(x) + sendbuf = x call mpi_allreduce(sendbuf,recvbuf,nvar,mpi_real8,mpi_sum,comm,ierror) parallel_reduce_sum_real8_nvar = recvbuf end function parallel_reduce_sum_real8_nvar +!======================================================================= + + subroutine parallel_reduce_reprosum(arr, arr_gsum) + + ! Compute a reproducible global sum for a floating-point variable or array. + ! Can be called from parallel_global_sum, parallel_global_sum_patch, or + ! parallel_global_sum_stagger. + ! Still under construction + + implicit none + + real(dp), dimension(:,:), intent(in) :: arr + real(dp), dimension(:), intent(out) :: arr_gsum + + ! Notes on subroutine cism_reprosum_calc: + ! The first five arguments are required: arr(dsummands,nflds), arr_gsum(dsummands), + ! dsummands, dflds and nsummands. Typically, nsummands = dsummands = number of local values. + ! We use the default fixed-precision algorithm (instead of ddpdd, which apparently is less robust). + ! We do not allow Inf or NaN values in the input array. + ! Typically, the algorithm calls mpi_allreduce twice. By passing in both arr_gbl_max and arr_max_levels, + ! it may be possible to call mpi_allreduce just once, improving performance. + ! If we don't pass these arguments, then arr_gbl_max and arr_max_levels are computed internally.. + ! By passing arr_glb_max_out and arr_max_levels_out, we can see the calculated values. + ! 
By passing rel_diff, we can verify that the computed reproducible sum is close + ! to the (nonreproducible) floating-point value. + ! +!! ! commid ! MPI communicator + ! See comments in cism_reprosum_calc for more info + + ! Required arguments + + integer :: dsummands, nflds ! dimensions of arr + integer :: nsummands ! number of processors + + ! Optional arguments + + real(dp), dimension(:), allocatable :: & + arr_gbl_max, & ! upper bound on max(abs(arr)) + arr_gbl_max_out ! calculated upper bound on max(abs(arr)) + + real(dp), dimension(:,:), allocatable :: & + rel_diff ! relative and absolute differences between fixed and floating point sums + + integer, dimension(:), allocatable :: & + arr_max_levels, & ! maximum number of levels of integer expansion to use + arr_max_levels_out ! output of number of levels of integer expansion to use + + integer :: & + gbl_max_nsummands, & ! maximum of nsummand over all processes + gbl_max_nsummands_out ! calculated maximum nsummands over all processes + + integer, dimension(6) :: & + repro_sum_stats(6) ! increment running totals for + ! (1) one-reduction repro_sum + ! (2) two-reduction repro_sum + ! (3) both types in one call + ! (4) nonrepro_sum + ! (5) global max nsummands reduction + ! (6) global lor 3*nflds reduction + + logical :: & + ddpdd_sum, & ! use ddpdd algorithm instead of fixed-precision algorithm + allow_infnan, & ! if .true., allow INF or NaN input values; + ! if .false. (the default), then abort if INF or NaNs are present + repro_sum_validate ! flag enabling/disabling testing that gmax and max_levels + ! are accurate/sufficient. Default is enabled. + + + ! Set parameters and allocate arrays + dsummands = size(arr,1) + nflds = size(arr,2) + nsummands = dsummands + + !WHL - debug + if (verbose_reprosum .and. main_task) then + write(iulog,*) 'In parallel_reduce_reprosum' + write(iulog,*) 'dsummands, nflds:', dsummands, nflds + endif + if (verbose_reprosum) then +!! 
write(iulog,*) 'rank, arr:', this_rank, arr(:,:) + endif + + allocate (arr_gbl_max(nflds)) + allocate (arr_gbl_max_out(nflds)) + allocate (arr_max_levels(nflds)) + allocate (arr_max_levels_out(nflds)) + allocate (rel_diff(2,nflds)) + + allow_infnan = .false. + ddpdd_sum = .false. + repro_sum_validate = .true. + +#ifdef CCSMCOUPLED + +!! call shr_reprosum_calc + +#else + + ! The following subroutine is adapted from shr_reprosum_calc in CESM shared code. + + call cism_reprosum_calc(& + arr, arr_gsum, & + nsummands, dsummands, nflds, & + allow_infnan = allow_infnan, & + ddpdd_sum = ddpdd_sum, & +! arr_gbl_max = arr_gbl_max, & + arr_gbl_max_out = arr_gbl_max_out, & +! arr_max_levels = arr_max_levels, & + arr_max_levels_out = arr_max_levels_out, & +! gbl_max_nsummands = gbl_max_nsummands, & + gbl_max_nsummands_out = gbl_max_nsummands_out, & + repro_sum_validate = repro_sum_validate, & + repro_sum_stats = repro_sum_stats, & + rel_diff = rel_diff) + +#endif + + if (verbose_reprosum .and. main_task) then + write(iulog,*) 'arr_gbl_max_out =', arr_gbl_max_out + write(iulog,*) 'arr_max_levels_out =', arr_max_levels_out + write(iulog,*) 'gbl_max_nsummands_out =', gbl_max_nsummands_out + write(iulog,*) 'rel diff =', rel_diff(1,:) + write(iulog,*) 'abs diff =', rel_diff(2,:) + write(iulog,*) 'stats =', repro_sum_stats(:) + endif + + deallocate(arr_gbl_max, arr_gbl_max_out) + deallocate(arr_max_levels, arr_max_levels_out) + deallocate(rel_diff) + + end subroutine parallel_reduce_reprosum + !======================================================================= ! functions belonging to the parallel_reduce_max interface diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index 83eaec79..6a11c5ac 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -218,11 +218,12 @@ subroutine glissade_initialise(model, evolve_ice) ! The subroutine will report how many tasks are needed to compute on all active blocks, and then abort. ! 
The user can then resubmit (on an optimal number of processors) with model%options%compute_blocks = ACTIVE_BLOCKS. - call distributed_grid_active_blocks(model%general%ewn, model%general%nsn, & - model%general%nx_block, model%general%ny_block, & - model%general%ice_domain_mask, & - model%parallel, & - inquire_only = .true.) + call distributed_grid_active_blocks(& + model%general%ewn, model%general%nsn, & + model%general%nx_block, model%general%ny_block, & + model%general%ice_domain_mask, & + model%parallel, & + inquire_only = .true.) else ! compute_blocks = ACTIVE_BLOCKS_ONLY @@ -235,10 +236,12 @@ subroutine glissade_initialise(model, evolve_ice) model%general%global_bc = GLOBAL_BC_NO_ICE endif - call distributed_grid_active_blocks(model%general%ewn, model%general%nsn, & - model%general%nx_block, model%general%ny_block, & - model%general%ice_domain_mask, & - model%parallel) + call distributed_grid_active_blocks(& + model%general%ewn, model%general%nsn, & + model%general%nx_block, model%general%ny_block, & + model%general%ice_domain_mask, & + model%parallel, & + reprosum_in = model%options%reproducible_sums) endif ! 
compute_blocks @@ -247,13 +250,19 @@ subroutine glissade_initialise(model, evolve_ice) elseif (model%general%global_bc == GLOBAL_BC_OUTFLOW) then - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'outflow') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'outflow') elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'no_ice') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'no_ice') elseif (model%general%global_bc == GLOBAL_BC_NO_PENETRATION) then @@ -261,15 +270,19 @@ subroutine glissade_initialise(model, evolve_ice) ! The difference is that we also use no-penetration masks for (uvel,vvel) at the global boundary ! (computed by calling staggered_no_penetration_mask below). - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'no_penetration') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'no_penetration') else ! global_bc = GLOBAL_BC_PERIODIC -! 
call distributed_grid(model%general%ewn, model%general%nsn, global_bc_in = 'periodic') - - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'periodic') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'periodic') endif @@ -815,8 +828,7 @@ subroutine glissade_initialise(model, evolve_ice) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) ! compute halo for relaxed topography ! Note: See comments above with regard to the halo update for topg. @@ -879,8 +891,7 @@ subroutine glissade_initialise(model, evolve_ice) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) endif ! initial calving @@ -1104,6 +1115,7 @@ subroutine glissade_initialise(model, evolve_ice) call glissade_basin_average(& model%general%ewn, model%general%nsn, & + parallel, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & floating_mask * 1.0d0, & ! real mask @@ -1155,7 +1167,7 @@ subroutine glissade_tstep(model, time) use glimmer_paramets, only: eps11 use glimmer_physcon, only: scyr - use glide_mask, only: glide_set_mask, calc_iareaf_iareag + use glide_mask, only: glide_set_mask use glissade_mass_balance, only: glissade_prepare_climate_forcing implicit none @@ -1360,15 +1372,7 @@ subroutine glissade_tstep(model, time) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) - - ! 
--- Calculate global area of ice that is floating and grounded. - !TODO May want to calculate iareaf and iareag in glide_write_diag and remove those calculations here. - - call calc_iareaf_iareag(model%numerics%dew, model%numerics%dns, & - model%geometry%thkmask, & - model%geometry%iareaf, model%geometry%iareag) + model%climate%eus, model%geometry%thkmask) ! ------------------------------------------------------------------------ ! Do the vertical thermal solve if it is time to do so. diff --git a/libglissade/glissade_basal_water.F90 b/libglissade/glissade_basal_water.F90 index 3b642157..18b03292 100644 --- a/libglissade/glissade_basal_water.F90 +++ b/libglissade/glissade_basal_water.F90 @@ -1031,8 +1031,7 @@ subroutine fill_depressions(& ! Continue until no further lowering of phi is possible. At that point, phi = phi_out. ! Note: Setting eps = 0 would result in flat surfaces that would need to be fixed later. - use cism_parallel, only: parallel_reduce_sum - use cism_parallel, only: parallel_globalindex + use cism_parallel, only: parallel_reduce_sum, parallel_globalindex implicit none diff --git a/libglissade/glissade_bmlt_float.F90 b/libglissade/glissade_bmlt_float.F90 index 7a0da2d5..227ed6e4 100644 --- a/libglissade/glissade_bmlt_float.F90 +++ b/libglissade/glissade_bmlt_float.F90 @@ -1014,6 +1014,7 @@ subroutine glissade_bmlt_float_thermal_forcing(& call glissade_basin_average(& nx, ny, & + parallel, & ocean_data%nbasin, & ocean_data%basin_number, & thermal_forcing_mask * f_float, & @@ -1024,6 +1025,7 @@ subroutine glissade_bmlt_float_thermal_forcing(& call glissade_basin_average(& nx, ny, & + parallel, & ocean_data%nbasin, & ocean_data%basin_number, & thermal_forcing_mask * f_float, & @@ -1126,6 +1128,7 @@ subroutine glissade_bmlt_float_thermal_forcing(& call glissade_basin_average(& nx, ny, & + parallel, & ocean_data%nbasin, & ocean_data%basin_number, & thermal_forcing_mask * f_float, & diff --git a/libglissade/glissade_calving.F90 
b/libglissade/glissade_calving.F90 index f2ad1cb1..04903763 100644 --- a/libglissade/glissade_calving.F90 +++ b/libglissade/glissade_calving.F90 @@ -37,10 +37,9 @@ module glissade_calving use glimmer_utils, only: point_diag use cism_parallel, only: this_rank, main_task, nhalo, & - parallel_halo, parallel_globalindex, & + parallel_halo, parallel_globalindex, parallel_global_sum, & parallel_reduce_sum, parallel_reduce_max, parallel_reduce_log_or - implicit none private @@ -50,7 +49,6 @@ module glissade_calving public :: verbose_calving logical, parameter :: verbose_calving = .false. -!! logical, parameter :: verbose_calving = .true. contains @@ -582,15 +580,7 @@ subroutine glissade_calve_ice(nx, ny, & if (verbose_calving) then call point_diag(cf_length, 'cf_length (m)', itest, jtest, rtest, 7, 7) ! Diagnose the total CF length - total_cf_length = 0.0d0 - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - if (calving_front_mask(i,j) == 1) then - total_cf_length = total_cf_length + cf_length(i,j) - endif - enddo - enddo - total_cf_length = parallel_reduce_sum(total_cf_length) + total_cf_length = parallel_global_sum(cf_length, parallel, calving_front_mask) if (this_rank == rtest) then write(iulog,*) 'Total CF length (km)', total_cf_length/1000.d0 endif @@ -871,13 +861,7 @@ subroutine glissade_calve_ice(nx, ny, & endif ! 
Compute the total ice area and the area of each quadrant - total_ice_area = 0.0d0 - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - total_ice_area = total_ice_area + dx*dy*calving%effective_areafrac(i,j) - enddo - enddo - total_ice_area = parallel_reduce_sum(total_ice_area) + total_ice_area = parallel_global_sum(dx*dy*calving%effective_areafrac, parallel) if (verbose_calving) then if (this_rank == rtest) then diff --git a/libglissade/glissade_glacier.F90 b/libglissade/glissade_glacier.F90 index cce8bdd9..b896beb6 100644 --- a/libglissade/glissade_glacier.F90 +++ b/libglissade/glissade_glacier.F90 @@ -75,7 +75,7 @@ subroutine glissade_glacier_init(model, glacier) ! The CISM input file contains the RGI IDs. use cism_parallel, only: distributed_gather_var, distributed_scatter_var, & - parallel_reduce_sum, parallel_reduce_max, parallel_reduce_min, parallel_is_zero, & + parallel_global_sum, parallel_reduce_max, parallel_reduce_min, parallel_is_zero, & broadcast, parallel_halo, staggered_parallel_halo, parallel_globalindex type(glide_global_type),intent(inout) :: model @@ -113,6 +113,9 @@ subroutine glissade_glacier_init(model, glacier) current_id, & ! current glacier_id from list gid_minval, gid_maxval ! min and max values of glacier_id + integer, dimension(model%general%ewn,model%general%nsn) :: & + glacier_mask ! = 1 for cells with glaciers (glacier_id > 0), else = 0 + type(parallel_type) :: parallel ! info for parallel communication !WHL - debug, for quicksort test @@ -194,8 +197,7 @@ subroutine glissade_glacier_init(model, glacier) ! and these arrays should already have the correct dimensions. 
if (associated(glacier%glacierid)) deallocate(glacier%glacierid) - if (associated(glacier%cism_to_rgi_glacier_id)) & - deallocate(glacier%cism_to_rgi_glacier_id) + if (associated(glacier%cism_to_rgi_glacier_id)) deallocate(glacier%cism_to_rgi_glacier_id) if (associated(glacier%area)) deallocate(glacier%area) if (associated(glacier%volume)) deallocate(glacier%volume) if (associated(glacier%area_init)) deallocate(glacier%area_init) @@ -220,21 +222,9 @@ subroutine glissade_glacier_init(model, glacier) ! Count the number of cells with glaciers ! Loop over locally owned cells - count = 0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - if (glacier%rgi_glacier_id(i,j) > 0) then - count = count + 1 - elseif (glacier%rgi_glacier_id(i,j) < 0) then ! should not happen - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(message,*) 'RGI glacier_id < 0: i, j, value =', & - iglobal, jglobal, glacier%rgi_glacier_id(i,j) - call write_log(message, GM_FATAL) - endif - enddo - enddo - - ncells_glacier = parallel_reduce_sum(count) + glacier_mask = 0 + where (glacier%rgi_glacier_id > 0) glacier_mask = 1 + ncells_glacier = parallel_global_sum(glacier_mask, parallel) ! Gather the RGI glacier IDs to the main task if (main_task) allocate(rgi_glacier_id_global(global_ewn, global_nsn)) @@ -434,6 +424,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & model%geometry%cell_area, & ! m^2 @@ -543,6 +534,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_2d_to_1d(& ewn, nsn, & + parallel, & nglacier, glacier%cism_glacier_id_init, & model%climate%smb_obs, glacier%smb_obs) @@ -614,6 +606,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & model%geometry%cell_area, & ! 
m^2 @@ -626,6 +619,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & model%geometry%cell_area, & ! m^2 @@ -730,7 +724,7 @@ subroutine glissade_glacier_update(model, glacier) use glissade_grid_operators, only: glissade_stagger use glissade_utils, only: glissade_usrf_to_thck - use cism_parallel, only: parallel_reduce_sum, parallel_global_sum, & + use cism_parallel, only: parallel_global_sum, & parallel_halo, staggered_parallel_halo ! Do glacier inversion (if applicable), update glacier masks, and compute glacier diagnostics. @@ -767,12 +761,13 @@ subroutine glissade_glacier_update(model, glacier) integer :: i, j, ng integer, dimension(model%general%ewn, model%general%nsn) :: & - ice_mask ! = 1 where ice is present (thck > thklim), else = 0 + ice_mask, & ! = 1 where ice is present (thck > thklim), else = 0 + glacier_mask ! temporary mask real(dp), dimension(model%general%ewn, model%general%nsn) :: & thck, & ! ice thickness (m) dthck_dt, & ! rate of change of thickness (m/yr) - cell_area, & ! grid cell area (m^2) + cell_area_uniform, & ! grid cell area defined as dew*dns(m^2) thck_old, & ! saved value of ice thickness (m) artm, & ! artm, baseline or current date snow, & ! snowfall, baseline or current date @@ -872,14 +867,12 @@ subroutine glissade_glacier_update(model, glacier) nglacier = glacier%nglacier ngdiag = glacier%ngdiag + cell_area_uniform = dew*dns ! some unit conversions - !TODO - Use model%geometry%thck without a copy. ! Skip these conversion and use SI units (s instead of yr) in the code. - dt = model%numerics%dt /scyr ! s to yr - thck = model%geometry%thck + dt = model%numerics%dt /scyr ! s to yr dthck_dt = model%geometry%dthck_dt * scyr ! m/s to m/yr - cell_area = model%geometry%cell_area ! model units to m^2 ! Accumulate the 2D fields used for mu_star and alpha_snow inversion: snow and Tpos. ! 
Also accumulate dthck_dt, which is used for powerlaw_c inversion. @@ -1189,23 +1182,23 @@ subroutine glissade_glacier_update(model, glacier) if (glacier%redistribute_advanced_ice) then - thck_old = thck + thck_old = model%geometry%thck call glacier_redistribute_advanced_ice(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & real(glacier_update_interval,dp), & ! yr - dew*dns, & ! m^2 + cell_area_uniform, & ! m^2 glacier%thinning_rate_advanced_ice, & ! m/yr glacier%cism_glacier_id_init, & glacier%smb_glacier_id, & model%climate%smb, & ! m/yr - thck, & ! m - parallel) + model%geometry%thck) ! m glacier%dthck_dt_annmean = glacier%dthck_dt_annmean + & - (thck - thck_old) / real(glacier_update_interval,dp) + (model%geometry%thck - thck_old) / real(glacier_update_interval,dp) endif ! redistribute advanced ice @@ -1230,6 +1223,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & glacier%smb_glacier_id_init, & smb_weight_init, & @@ -1249,6 +1243,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & glacier%smb_glacier_id, & smb_weight_current, & @@ -1276,6 +1271,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_invert_mu_star_alpha_snow(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & glacier%smb_glacier_id_init, & @@ -1300,6 +1296,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_invert_mu_star(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & glacier%smb_glacier_id_init, & @@ -1318,13 +1315,13 @@ subroutine glissade_glacier_update(model, glacier) endif ! set_mu_star ! advance/retreat diagnostics - ! 
Note: This subroutine assumes cell_area = dew*dns for all cells call glacier_area_advance_retreat(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & glacier%cism_glacier_id, & - dew*dns, & + cell_area_uniform, & area_initial, & area_current, & area_advance, & @@ -1383,7 +1380,8 @@ subroutine glissade_glacier_update(model, glacier) ! Interpolate thck to the staggered grid call glissade_stagger(& ewn, nsn, & - thck, stag_thck) + model%geometry%thck, & + stag_thck) ! Interpolate dthck_dt to the staggered grid call glissade_stagger(& @@ -1443,17 +1441,17 @@ subroutine glissade_glacier_update(model, glacier) call glacier_advance_retreat(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, & glacier%minthck, & ! m - thck, & ! m + model%geometry%thck, & ! m glacier%snow_annmean, & ! mm/yr w.e. glacier%Tpos_annmean, & ! deg C glacier%mu_star, & ! mm/yr/deg glacier%alpha_snow, & ! unitless glacier%cism_glacier_id_init, & - glacier%cism_glacier_id, & - parallel) + glacier%cism_glacier_id) ! Compute smb_glacier_id, which determines where the SMB is computed. It is the union of ! (1) cism_glacier_id > 0 @@ -1559,7 +1557,7 @@ subroutine glissade_glacier_update(model, glacier) endif ! set_mu_star if (verbose_glacier) then - call point_diag(thck, 'After advance_retreat, thck', itest, jtest, rtest, 7, 7) + call point_diag(model%geometry%thck, 'After advance_retreat, thck', itest, jtest, rtest, 7, 7) call point_diag(glacier%cism_glacier_id_init, 'cism_glacier_id_init', itest, jtest, rtest, 7, 7) call point_diag(glacier%smb_glacier_id_init, 'smb_glacier_id_init', itest, jtest, rtest, 7, 7) call point_diag(glacier%cism_glacier_id, 'New cism_glacier_id', itest, jtest, rtest, 7, 7) @@ -1591,16 +1589,20 @@ subroutine glissade_glacier_update(model, glacier) ! 
(1) Include only cells that are part of the initial glacier extent call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & + cell_area_uniform, & model%climate%smb, & aar_init) ! (2) Include all cells in the glacier call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & + cell_area_uniform, & model%climate%smb, & aar) @@ -1630,16 +1632,20 @@ subroutine glissade_glacier_update(model, glacier) ! (1) Include only cells that are part of the initial glacier extent call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & + cell_area_uniform, & glacier%smb_recent, & aar_init_recent) ! (2) Include all cells in the glacier call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & + cell_area_uniform, & glacier%smb_recent, & aar_recent) @@ -1661,10 +1667,11 @@ subroutine glissade_glacier_update(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & - cell_area, & ! m^2 - thck, & ! m + model%geometry%cell_area, & ! m^2 + model%geometry%thck, & ! m glacier%diagnostic_minthck, & ! m glacier%area, & ! m^2 glacier%volume) ! m^3 @@ -1674,10 +1681,11 @@ subroutine glissade_glacier_update(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & - cell_area, & ! m^2 - thck, & ! m + model%geometry%cell_area, & ! m^2 + model%geometry%thck, & ! m glacier%diagnostic_minthck, & ! m glacier%area_init_extent, & ! m^2 glacier%volume_init_extent) ! m^3 @@ -1701,6 +1709,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & model%geometry%cell_area, & ! m^2 @@ -1718,28 +1727,21 @@ subroutine glissade_glacier_update(model, glacier) if (verbose_glacier) then - ! 
debug - count cells in masks - count_cgii = 0 - count_cgi = 0 - count_sgii = 0 - count_sgi = 0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = glacier%cism_glacier_id_init(i,j) - if (ng == ngdiag) count_cgii = count_cgii + 1 - ng = glacier%cism_glacier_id(i,j) - if (ng == ngdiag) count_cgi = count_cgi + 1 - ng = glacier%smb_glacier_id_init(i,j) - if (ng == ngdiag) count_sgii = count_sgii + 1 - ng = glacier%smb_glacier_id(i,j) - if (ng == ngdiag) count_sgi = count_sgi + 1 - enddo - enddo + glacier_mask = 0 + where (glacier%cism_glacier_id_init == ngdiag) glacier_mask = 1 + count_cgii = parallel_global_sum(glacier_mask, parallel) + + glacier_mask = 0 + where (glacier%cism_glacier_id == ngdiag) glacier_mask = 1 + count_cgi = parallel_global_sum(glacier_mask, parallel) - count_cgii = parallel_reduce_sum(count_cgii) - count_cgi = parallel_reduce_sum(count_cgi) - count_sgii = parallel_reduce_sum(count_sgii) - count_sgi = parallel_reduce_sum(count_sgi) + glacier_mask = 0 + where (glacier%smb_glacier_id_init == ngdiag) glacier_mask = 1 + count_sgii = parallel_global_sum(glacier_mask, parallel) + + glacier_mask = 0 + where (glacier%smb_glacier_id == ngdiag) glacier_mask = 1 + count_sgi = parallel_global_sum(glacier_mask, parallel) if (this_rank == rtest) then write(iulog,*) ' ' @@ -1752,15 +1754,13 @@ subroutine glissade_glacier_update(model, glacier) endif ! glacier_update_inverval - ! Copy fields back to model derived type - model%geometry%thck = thck - end subroutine glissade_glacier_update !**************************************************** subroutine glacier_invert_mu_star(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & smb_glacier_id_init, & @@ -1785,6 +1785,9 @@ subroutine glacier_invert_mu_star(& nglacier, & ! total number of glaciers in the domain ngdiag ! CISM ID of diagnostic glacier + type(parallel_type), intent(in) :: & + parallel ! 
info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & smb_glacier_id_init ! smb_glacier_id based on the initial glacier extent @@ -1862,6 +1865,7 @@ subroutine glacier_invert_mu_star(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -1869,6 +1873,7 @@ subroutine glacier_invert_mu_star(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -1993,6 +1998,7 @@ end subroutine glacier_invert_mu_star subroutine glacier_invert_mu_star_alpha_snow(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & smb_glacier_id_init, & @@ -2023,6 +2029,9 @@ subroutine glacier_invert_mu_star_alpha_snow(& nglacier, & ! total number of glaciers in the domain ngdiag ! CISM ID of diagnostic glacier + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & smb_glacier_id_init ! smb_glacier_id based on the initial glacier extent @@ -2114,6 +2123,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2121,6 +2131,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2128,6 +2139,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2135,6 +2147,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2533,6 +2546,7 @@ end subroutine glacier_calc_snow subroutine glacier_redistribute_advanced_ice(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & glacier_update_interval, & ! 
yr @@ -2541,15 +2555,14 @@ subroutine glacier_redistribute_advanced_ice(& cism_glacier_id_init, & smb_glacier_id, & smb, & ! m/yr - thck, & ! m - parallel) + thck) ! m ! Limit glacier advance in the accumulation zone. ! This applies to grid cells that are initially ice-free, into which ice is advected. ! The fix here is to thin the ice in these cells at a prescribed rate and ! redistribute the mass conservatively across the glacier. - use cism_parallel, only: parallel_reduce_sum, parallel_halo + use cism_parallel, only: parallel_halo, parallel_global_sum_patch ! input/output arguments @@ -2559,11 +2572,16 @@ subroutine glacier_redistribute_advanced_ice(& nglacier, & ! number of glaciers ngdiag ! CISM ID of diagnostic glacier + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + real(dp), intent(in) :: & glacier_update_interval, & ! time interval (yr) of the glacier update, typically 1 yr - cell_area, & ! grid cell area (m^2), assumed to be the same for each cell thinning_rate_advanced_ice ! thinning rate (m/yr) where glaciers advance in the accumulation zone + real(dp), dimension(ewn,nsn), intent(in) :: & + cell_area ! grid cell area (m^2) + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id_init, & ! integer glacier ID at the start of the run smb_glacier_id ! integer ID for current glacier cells and adjacent glacier-free cells @@ -2574,14 +2592,10 @@ subroutine glacier_redistribute_advanced_ice(& real(dp), dimension(ewn,nsn), intent(inout) :: & thck ! ice thickness (m) - type(parallel_type), intent(in) :: parallel ! info for parallel communication - ! local variables integer :: i, j, ng - real(dp) :: dthck ! thickness change (m) - real(dp), dimension(nglacier) :: & glacier_area_init, & ! glacier area based on cism_glacier_id_init glacier_vol_removed, & ! total volume (m^3) removed from each advanced cells in each glacier @@ -2589,53 +2603,37 @@ subroutine glacier_redistribute_advanced_ice(& glacier_vol_1, & ! 
volume (m^3) of each glacier before thinning and restribution glacier_vol_2 ! volume (m^3) of each glacier after thinning and restribution + real(dp), dimension(ewn,nsn) :: & + dthck ! thickness removed (m) + + integer, dimension(ewn,nsn) :: & + glacier_id ! temporary glacier ID + + glacier_id = max(cism_glacier_id_init, smb_glacier_id) + ! Compute the total volume of each glacier before limiting advance. ! Note: This includes adjacent glacier-free cells that might have a small nonzero thickness ! (i.e., cism_glacier_id = 0 but smb_glacier_id > 0). - !TODO: Write a sum-over-glaciers subroutine - glacier_vol_1(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = smb_glacier_id(i,j) - if (ng > 0) then - glacier_vol_1(ng) = glacier_vol_1(ng) + cell_area*thck(i,j) - endif - enddo - enddo - glacier_vol_1 = parallel_reduce_sum(glacier_vol_1) + glacier_vol_1 = parallel_global_sum_patch(cell_area*thck, nglacier, glacier_id, parallel) ! compute the area of each glacier over its initial extent - glacier_area_init(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id_init(i,j) - if (ng > 0) then - glacier_area_init(ng) = glacier_area_init(ng) + cell_area - endif - enddo - enddo - glacier_area_init = parallel_reduce_sum(glacier_area_init) + + glacier_area_init = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id_init, parallel) ! Compute thinning in advanced grid cells ! This includes potential advanced cells adjacent to current glacier cells. ! Note: Currently, SMB is set to 0 in advanced cells where SMB would be > 0 otherwise. ! The logic below (smb >= 0) ensures that ice in these cells is thinned. - glacier_vol_removed(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - if (cism_glacier_id_init(i,j) == 0 .and. smb_glacier_id(i,j) > 0) then ! advanced cell - if (smb(i,j) >= 0.d0) then ! 
accumulation zone - ng = smb_glacier_id(i,j) - dthck = min(thinning_rate_advanced_ice*glacier_update_interval, thck(i,j)) - thck(i,j) = thck(i,j) - dthck - glacier_vol_removed(ng) = glacier_vol_removed(ng) + cell_area*dthck - endif - endif - enddo - enddo - glacier_vol_removed = parallel_reduce_sum(glacier_vol_removed) + dthck = 0.0d0 + where (cism_glacier_id_init == 0 .and. smb_glacier_id > 0) ! advanced cell + where (smb >= 0.0d0) ! accumulation zone + dthck = min(thinning_rate_advanced_ice*glacier_update_interval, thck) + thck = thck - dthck + endwhere + endwhere + glacier_vol_removed = parallel_global_sum_patch(cell_area*dthck, nglacier, smb_glacier_id, parallel) ! Assuming conservation of volume, compute the thickness to be added to each glacier. ! Only cells within the initial glacier extent can thicken. @@ -2659,16 +2657,8 @@ subroutine glacier_redistribute_advanced_ice(& call parallel_halo(thck, parallel) ! Compute the volume of each glacier after limiting advance - glacier_vol_2(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = max(cism_glacier_id_init(i,j), smb_glacier_id(i,j)) - if (ng > 0) then - glacier_vol_2(ng) = glacier_vol_2(ng) + cell_area*thck(i,j) - endif - enddo - enddo - glacier_vol_2 = parallel_reduce_sum(glacier_vol_2) + + glacier_vol_2 = parallel_global_sum_patch(cell_area*thck, nglacier, glacier_id, parallel) ! conservation check do ng = 1, nglacier @@ -2685,6 +2675,7 @@ end subroutine glacier_redistribute_advanced_ice subroutine glacier_advance_retreat(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, & glacier_minthck, & @@ -2694,8 +2685,7 @@ subroutine glacier_advance_retreat(& mu_star, & alpha_snow, & cism_glacier_id_init, & - cism_glacier_id, & - parallel) + cism_glacier_id) ! Allow glaciers to advance and retreat. ! @@ -2727,6 +2717,9 @@ subroutine glacier_advance_retreat(& itest, jtest, rtest, & ! coordinates of diagnostic cell nglacier ! 
number of glaciers + type(parallel_type), intent(in) :: & + parallel ! info for diagnostic only + real(dp), intent(in) :: & glacier_minthck ! min ice thickness (m) counted as part of a glacier @@ -2747,8 +2740,6 @@ subroutine glacier_advance_retreat(& integer, dimension(ewn,nsn), intent(inout) :: & cism_glacier_id ! current cism glacier_id, > 0 for glaciated cells - type(parallel_type), intent(in) :: parallel ! diagnostic only - ! local variables integer, dimension(ewn,nsn) :: & @@ -3038,13 +3029,14 @@ end subroutine update_smb_glacier_id subroutine glacier_2d_to_1d(& ewn, nsn, & + parallel, & nglacier, cism_glacier_id, & field_2d, glacier_field) ! Given a 2D field, compute the average of the field over each glacier !TODO - Pass in cellarea to compute an area average. - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3052,6 +3044,9 @@ subroutine glacier_2d_to_1d(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id ! integer glacier ID in the range (1, nglacier) @@ -3067,22 +3062,12 @@ subroutine glacier_2d_to_1d(& integer, dimension(nglacier) :: ncells_glacier - ncells_glacier(:) = 0 - glacier_field(:) = 0.0d0 + integer, dimension(ewn,nsn) :: ones ! matrix = 1 everywhere - ! 
Loop over locally owned cells - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - ncells_glacier(ng) = ncells_glacier(ng) + 1 - glacier_field(ng) = glacier_field(ng) + field_2d(i,j) - endif - enddo - enddo + ones(:,:) = 1 - ncells_glacier = parallel_reduce_sum(ncells_glacier) - glacier_field = parallel_reduce_sum(glacier_field) + ncells_glacier = parallel_global_sum_patch(ones, nglacier, cism_glacier_id, parallel) + glacier_field = parallel_global_sum_patch(field_2d, nglacier, cism_glacier_id, parallel) where (ncells_glacier > 0) glacier_field = glacier_field/ncells_glacier @@ -3094,6 +3079,7 @@ end subroutine glacier_2d_to_1d subroutine glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & glacier_id, weight, & field_2d, glacier_field) @@ -3101,7 +3087,7 @@ subroutine glacier_2d_to_1d_weighted(& ! Given a 2D field, compute the average of the field over each glacier ! Certain grid cells (e.g., at the glacier periphery) can be given weights between 0 and 1. - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3109,6 +3095,9 @@ subroutine glacier_2d_to_1d_weighted(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & glacier_id ! integer glacier ID @@ -3123,26 +3112,11 @@ subroutine glacier_2d_to_1d_weighted(& ! local variables - integer :: i, j, ng - real(dp), dimension(nglacier) :: sum_weights - sum_weights(:) = 0.0d0 - glacier_field(:) = 0.0d0 - - ! 
Loop over locally owned cells - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = glacier_id(i,j) - if (ng > 0) then - sum_weights(ng) = sum_weights(ng) + weight(i,j) - glacier_field(ng) = glacier_field(ng) + weight(i,j) * field_2d(i,j) - endif - enddo - enddo + sum_weights = parallel_global_sum_patch(weight, nglacier, glacier_id, parallel) + glacier_field = parallel_global_sum_patch(weight*field_2d, nglacier, glacier_id, parallel) - sum_weights = parallel_reduce_sum(sum_weights) - glacier_field = parallel_reduce_sum(glacier_field) where (sum_weights > 0.0d0) glacier_field = glacier_field/sum_weights endwhere @@ -3196,12 +3170,13 @@ end subroutine glacier_1d_to_2d subroutine glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, cism_glacier_id, & cell_area, thck, & diagnostic_minthck, & area, volume) - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3209,6 +3184,9 @@ subroutine glacier_area_volume(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id ! integer glacier ID in the range (1, nglacier) @@ -3226,36 +3204,22 @@ subroutine glacier_area_volume(& ! local variables - real(dp), dimension(nglacier) :: & - local_area, local_volume ! area and volume on each processor, before global sum - - integer :: i, j, ng - - ! Initialize the output arrays - area(:) = 0.0d0 - volume(:) = 0.0d0 - - ! Initialize local arrays - local_area(:) = 0.0d0 - local_volume(:) = 0.0d0 + real(dp), dimension(ewn,nsn) :: & + diag_area, diag_volume ! area and volume where thck >= diagnostic_minthck ! Compute the area and volume of each glacier. - ! We need parallel sums, since a glacier can lie on two or more processors. + ! Need parallel sums, since a glacier can lie on two or more processors. 
- do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - if (thck(i,j) >= diagnostic_minthck) then - local_area(ng) = local_area(ng) + cell_area(i,j) - local_volume(ng) = local_volume(ng) + cell_area(i,j) * thck(i,j) - endif - endif - enddo - enddo + where(thck >= diagnostic_minthck) + diag_area = cell_area + diag_volume = cell_area*thck + elsewhere + diag_area = 0.0d0 + diag_volume = 0.0d0 + endwhere - area = parallel_reduce_sum(local_area) - volume = parallel_reduce_sum(local_volume) + area = parallel_global_sum_patch(diag_area, nglacier, cism_glacier_id, parallel) + volume = parallel_global_sum_patch(diag_volume, nglacier, cism_glacier_id, parallel) end subroutine glacier_area_volume @@ -3263,6 +3227,7 @@ end subroutine glacier_area_volume subroutine glacier_area_advance_retreat(& ewn, nsn, & + parallel, & nglacier, & cism_glacier_id_init, & cism_glacier_id, & @@ -3272,7 +3237,7 @@ subroutine glacier_area_advance_retreat(& area_advance, & area_retreat) - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! For each glacier, compare the current glacier area (as given by cism_glacier_id) ! to the initial area (given by cism_glacier_id_init). @@ -3287,12 +3252,15 @@ subroutine glacier_area_advance_retreat(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id_init, & ! integer glacier ID in the range (1, nglacier), initial value cism_glacier_id ! integer glacier ID in the range (1, nglacier), current value - real(dp), intent(in) :: & - cell_area ! grid cell area = dew*dns (m^2); same for all cells + real(dp), dimension(ewn,nsn), intent(in) :: & + cell_area ! grid cell area (m^2) real(dp), dimension(nglacier), intent(out) :: & area_initial, & ! 
initial glacier area @@ -3302,69 +3270,38 @@ subroutine glacier_area_advance_retreat(& ! local variables - real(dp), dimension(nglacier) :: & - local_area ! area on each processor, before global sum - - integer :: i, j, ng, ngi + integer, dimension(ewn,nsn) :: glacier_id ! temporary glacier ID - ! Initialize the output arrays - area_initial(:) = 0.0d0 - area_current(:) = 0.0d0 - area_advance(:) = 0.0d0 - area_retreat(:) = 0.0d0 + integer :: ng ! Compute the area of each glacier over the initial and current masks. ! We need parallel sums, since a glacier can lie on two or more processors. - ! init area - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ngi = cism_glacier_id_init(i,j) - if (ngi > 0) then - local_area(ngi) = local_area(ngi) + cell_area - endif - enddo - enddo - area_initial = parallel_reduce_sum(local_area) + area_initial = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id_init, parallel) ! current area - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - local_area(ng) = local_area(ng) + cell_area - endif - enddo - enddo - area_current = parallel_reduce_sum(local_area) + + area_current = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id, parallel) ! area where the glacier has advanced - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ngi = cism_glacier_id_init(i,j) - ng = cism_glacier_id(i,j) - if (ngi == 0 .and. ng > 0) then - local_area(ng) = local_area(ng) + cell_area - endif - enddo - enddo - area_advance = parallel_reduce_sum(local_area) + + where (cism_glacier_id_init == 0 .and. cism_glacier_id > 0) + glacier_id = cism_glacier_id + elsewhere + glacier_id = 0 + endwhere + + area_advance = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) ! 
area where the glacier has retreated - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ngi = cism_glacier_id_init(i,j) - ng = cism_glacier_id(i,j) - if (ngi > 0 .and. ng == 0) then - local_area(ngi) = local_area(ngi) + cell_area - endif - enddo - enddo - area_retreat = parallel_reduce_sum(local_area) + + where (cism_glacier_id_init > 0 .and. cism_glacier_id == 0) + glacier_id = cism_glacier_id_init + elsewhere + glacier_id = 0 + endwhere + + area_retreat = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) ! bug check do ng = 1, nglacier @@ -3382,15 +3319,16 @@ end subroutine glacier_area_advance_retreat subroutine glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & cism_glacier_id, & + cell_area, & smb, & aar) ! Compute the accumulation area ratio (AAR) for each glacier. - ! Note: In this subroutine the grid cell area is assumed equal for all cells. - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3398,9 +3336,15 @@ subroutine glacier_accumulation_area_ratio(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id ! integer glacier ID in the range (1, nglacier) + real(dp), dimension(ewn,nsn), intent(in) :: & + cell_area ! grid cell area = dew*dns (m^2); same for all cells + real(dp), dimension(ewn,nsn), intent(in) :: & smb ! surface mass balance (mm/yr w.e.) @@ -3409,34 +3353,32 @@ subroutine glacier_accumulation_area_ratio(& ! local variables - integer :: i, j, ng +! integer :: i, j, ng real(dp), dimension(nglacier) :: & ablat_area, & ! area of accumulation zone (SMB < 0) accum_area ! area of accumulation zone (SMB > 0) - ! 
initialize
-      ablat_area(:) = 0.0d0
-      accum_area(:) = 0.0d0
+      integer, dimension(ewn,nsn) :: glacier_id   ! temporary glacier ID

       ! Compute the accumulation and ablation area for each glacier
       ! Note: Grid cells with SMB = 0 are not counted in either zone.

-      do j = nhalo+1, nsn-nhalo
-         do i = nhalo+1, ewn-nhalo
-            ng = cism_glacier_id(i,j)
-            if (ng > 0) then
-               if (smb(i,j) > 0.0d0) then
-                  accum_area(ng) = accum_area(ng) + 1.0d0
-               elseif (smb(i,j) < 0.0d0) then
-                  ablat_area(ng) = ablat_area(ng) + 1.0d0
-               endif
-            endif
-         enddo   ! i
-      enddo   ! j
+      where (cism_glacier_id > 0 .and. smb > 0.0d0)   ! accumulation zone, tagged by glacier
+         glacier_id = cism_glacier_id
+      elsewhere
+         glacier_id = 0
+      endwhere
+
+      accum_area = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel)
+
+      where (cism_glacier_id > 0 .and. smb < 0.0d0)   ! ablation zone, tagged by glacier
+         glacier_id = cism_glacier_id
+      elsewhere
+         glacier_id = 0
+      endwhere

-      accum_area = parallel_reduce_sum(accum_area)
-      ablat_area = parallel_reduce_sum(ablat_area)
+      ablat_area = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel)

       ! Compute the AAR for each glacier
diff --git a/libglissade/glissade_inversion.F90 b/libglissade/glissade_inversion.F90
index 71a40c7a..c8d38bff 100644
--- a/libglissade/glissade_inversion.F90
+++ b/libglissade/glissade_inversion.F90
@@ -699,6 +699,7 @@ subroutine glissade_inversion_solve(model)
            ewn,                    nsn,                   &
            model%numerics%dew,     &   ! m
            model%numerics%dns,     &   ! m
+           parallel,               &
            itest, jtest, rtest,    &
            model%ocean_data%nbasin,        &
            model%ocean_data%basin_number,  &
@@ -780,6 +781,7 @@ subroutine glissade_inversion_solve(model)
            ewn,                    nsn,                   &
            model%numerics%dew,     &   ! m
            model%numerics%dns,     &   ! m
+           parallel,               &
            itest, jtest, rtest,    &
            model%ocean_data%nbasin,        &
            model%ocean_data%basin_number,  &
@@ -853,6 +855,7 @@ subroutine glissade_inversion_solve(model)
            ewn,                    nsn,                   &
            model%numerics%dew,     &   ! m
            model%numerics%dns,     &   ! 
m + parallel, & itest, jtest, rtest, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & @@ -906,6 +909,7 @@ subroutine glissade_inversion_solve(model) ewn, nsn, & model%numerics%dew, & ! m model%numerics%dns, & ! m + parallel, & itest, jtest, rtest, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & @@ -944,6 +948,7 @@ subroutine glissade_inversion_solve(model) if (model%ocean_data%nbasin > 1) then call glissade_basin_average(& model%general%ewn, model%general%nsn, & + model%parallel, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & floating_mask * 1.0d0, & ! real mask @@ -1338,6 +1343,7 @@ subroutine invert_basal_friction_basin(& dt, & nx, ny, & dx, dy, & + parallel, & itest, jtest, rtest, & nbasin, & basin_number, & @@ -1372,6 +1378,9 @@ subroutine invert_basal_friction_basin(& real(dp), intent(in) :: & dx, dy ! grid cell size in each direction (m) + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point @@ -1418,6 +1427,7 @@ subroutine invert_basal_friction_basin(& call get_basin_targets(& nx, ny, & dx, dy, & + parallel, & nbasin, basin_number, & itest, jtest, rtest, & stag_thck, stag_dthck_dt, & @@ -1430,6 +1440,7 @@ subroutine invert_basal_friction_basin(& call glissade_basin_average(& nx, ny, & + parallel, & nbasin, basin_number, & stag_rmask, & friction_c, friction_c_basin) @@ -1495,6 +1506,7 @@ subroutine invert_deltaT_ocn_basin(& dt, & nx, ny, & dx, dy, & + parallel, & itest, jtest, rtest, & nbasin, & basin_number, & @@ -1534,6 +1546,9 @@ subroutine invert_deltaT_ocn_basin(& real(dp), intent(in) :: & dx, dy ! grid cell size in each direction (m) + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, intent(in) :: & itest, jtest, rtest ! 
coordinates of diagnostic point @@ -1592,6 +1607,7 @@ subroutine invert_deltaT_ocn_basin(& call get_basin_targets(& nx, ny, & dx, dy, & + parallel, & nbasin, basin_number, & itest, jtest, rtest, & thck, dthck_dt, & @@ -1610,6 +1626,7 @@ subroutine invert_deltaT_ocn_basin(& call glissade_basin_average(& nx, ny, & + parallel, & nbasin, basin_number, & mask, & deltaT_ocn, deltaT_basin) @@ -2142,6 +2159,7 @@ end subroutine invert_flow_enhancement_factor subroutine get_basin_targets(& nx, ny, & dx, dy, & + parallel, & nbasin, basin_number, & itest, jtest, rtest, & thck, dthck_dt, & @@ -2167,6 +2185,9 @@ subroutine get_basin_targets(& real(dp), intent(in) :: & dx, dy ! grid cell size in each direction (m) + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, intent(in) :: & nbasin ! number of basins @@ -2221,6 +2242,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & cell_area, & @@ -2232,6 +2254,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & thck_target*dx*dy, & @@ -2241,6 +2264,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & thck*dx*dy, & @@ -2250,6 +2274,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & dthck_dt*dx*dy, & diff --git a/libglissade/glissade_mass_balance.F90 b/libglissade/glissade_mass_balance.F90 index ea4c5fac..fb03dc79 100644 --- a/libglissade/glissade_mass_balance.F90 +++ b/libglissade/glissade_mass_balance.F90 @@ -43,7 +43,7 @@ module glissade_mass_balance use glimmer_utils, only: point_diag use glide_types use cism_parallel, only: this_rank, main_task, nhalo, lhalo, uhalo, & - parallel_halo, parallel_reduce_max, parallel_reduce_sum, parallel_globalindex + parallel_halo, parallel_reduce_max, 
parallel_global_sum, parallel_globalindex implicit none save @@ -81,17 +81,27 @@ subroutine glissade_mass_balance_init(model) ! Initialize artm for the case that we are reading in artm_ref or artm_3d. ! For some temp_init options, this is needed for correct interior temperatures. + ! Note: Do not call if glaciers are enabled. When running with glaciers, artm is + ! accumulated over year 1 from a forcing file which hasn't been read in yet. + ! TODO: Think about how to initialize glacier temperatures. Currently assume artm = 0, + ! which isn't realistic. if (model%options%artm_input_function /= ARTM_INPUT_FUNCTION_XY) then - call downscale_artm(model) + if (.not.model%options%enable_glaciers) then + call downscale_artm(model) + endif endif ! Initialize smb for the case that we are reading in smb_ref or smb_3d. ! This is not strictly needed, since the SMB will be recomputed before it is used, ! but can be a helpful diagnostic. - !TODO - Do this also for the PDD option? + ! Note: Do not call if glaciers are enabled. When running with glaciers, smb is + ! accumulated over year 1 from a forcing file which hasn't been read in yet. + ! TODO - call downscale_smb for the PDD option? if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ .or. & model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then - call downscale_smb(model) + if (.not.model%options%enable_glaciers) then + call downscale_smb(model) + endif endif ! 
Initialize acab, if SMB (with different units) was read in @@ -1093,6 +1103,7 @@ subroutine mass_balance_driver(& call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_init, & tracers(:,:,:,:), mtsum_init(:)) @@ -1171,9 +1182,9 @@ subroutine mass_balance_driver(& enddo enddo - sum_acab = parallel_reduce_sum(sum_acab) - sum_bmlt = parallel_reduce_sum(sum_bmlt) - sum_melt_potential = parallel_reduce_sum(sum_melt_potential) + sum_acab = parallel_global_sum(acab*effective_areafrac, parallel) + sum_bmlt = parallel_global_sum(bmlt*effective_areafrac, parallel) + sum_melt_potential = parallel_global_sum(melt_potential, parallel) msum_init = msum_init + (sum_acab - sum_bmlt)*dt @@ -1182,6 +1193,7 @@ subroutine mass_balance_driver(& call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_final, & tracers(:,:,:,:), mtsum_final(:)) diff --git a/libglissade/glissade_transport.F90 b/libglissade/glissade_transport.F90 index b42d6866..9ba618f4 100644 --- a/libglissade/glissade_transport.F90 +++ b/libglissade/glissade_transport.F90 @@ -45,7 +45,7 @@ module glissade_transport use glimmer_log use glissade_remap, only: glissade_horizontal_remap, make_remap_mask, puny use cism_parallel, only: this_rank, main_task, nhalo, lhalo, uhalo, staggered_lhalo, staggered_uhalo, & - parallel_type, parallel_reduce_max, parallel_reduce_sum, parallel_reduce_minloc, & + parallel_type, parallel_global_sum, parallel_reduce_max, parallel_reduce_minloc, & parallel_globalindex, broadcast implicit none @@ -445,6 +445,7 @@ subroutine glissade_transport_driver(dt, & call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_init, & tracers(:,:,:,:), mtsum_init(:)) endif @@ -603,6 +604,7 @@ subroutine glissade_transport_driver(dt, & call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_final, & tracers(:,:,:,:), 
mtsum_final(:)) @@ -661,6 +663,7 @@ subroutine glissade_transport_driver(dt, & call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_final, & tracers(:,:,:,:), mtsum_final(:)) @@ -950,6 +953,7 @@ end subroutine glissade_check_cfl subroutine glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracer, & + parallel, & thck_layer, msum, & tracer, mtsum) @@ -963,6 +967,9 @@ subroutine glissade_sum_mass_and_tracers(& nlyr, &! number of vertical layers ntracer ! number of tracers + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + real(dp), dimension (nx,ny,nlyr), intent(in) :: & thck_layer ! ice layer thickness @@ -977,30 +984,21 @@ subroutine glissade_sum_mass_and_tracers(& ! Local arguments - integer :: i, j, nt - - msum = 0.d0 - if (present(mtsum)) mtsum(:) = 0.d0 + integer :: nt, k - do j = 1+nhalo, ny-nhalo - do i = 1+nhalo, nx-nhalo - - ! accumulate ice mass and mass*tracers - ! (actually, accumulate thickness, assuming rhoi*dx*dy is the same for each cell) - - msum = msum + sum(thck_layer(i,j,:)) - - if (present(mtsum)) then - do nt = 1, ntracer - mtsum(nt) = mtsum(nt) + sum(tracer(i,j,nt,:)*thck_layer(i,j,:)) - enddo - endif - - enddo ! i - enddo ! 
j + msum = 0.0d0 + do k = 1, nlyr + msum = msum + parallel_global_sum(thck_layer(:,:,k), parallel) + enddo - msum = parallel_reduce_sum(msum) - if (present(mtsum)) mtsum = parallel_reduce_sum(mtsum) + if (present(mtsum)) then + mtsum(:) = 0.0d0 + do k = 1, nlyr + do nt = 1, ntracer + mtsum(nt) = mtsum(nt) + parallel_global_sum(tracer(:,:,nt,k)*thck_layer(:,:,k), parallel) + enddo + enddo + endif end subroutine glissade_sum_mass_and_tracers diff --git a/libglissade/glissade_utils.F90 b/libglissade/glissade_utils.F90 index e7e3431f..eb5b629f 100644 --- a/libglissade/glissade_utils.F90 +++ b/libglissade/glissade_utils.F90 @@ -516,8 +516,10 @@ end subroutine glissade_adjust_topography !**************************************************** + !TODO - Calls to this subroutine could be replaced by inline calls to parallel_global_sum_patch subroutine glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & rmask, & field_2d, & @@ -527,11 +529,14 @@ subroutine glissade_basin_sum(& ! The sum is taken over grid cells with mask = 1. ! All cells are weighted equally. - use cism_parallel, only: parallel_reduce_sum, nhalo + use cism_parallel, only: parallel_global_sum_patch integer, intent(in) :: & nx, ny !> number of grid cells in each dimension + type(parallel_type), intent(in) :: & + parallel !> info for parallel communication + integer, intent(in) :: & nbasin !> number of basins @@ -546,29 +551,10 @@ subroutine glissade_basin_sum(& real(dp), dimension(nbasin), intent(out) :: & field_basin_sum !> basin-sum output field - ! local variables - - integer :: i, j, nb - !TODO - Replace sumcell with sumarea, and pass in cell area. ! Current algorithm assumes all cells with mask = 1 have equal weight. - real(dp), dimension(nbasin) :: & - sumfield_local ! sum of field on local task - - sumfield_local(:) = 0.0d0 - - ! 
loop over locally owned cells - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - nb = basin_number(i,j) - if (nb >= 1) then - sumfield_local(nb) = sumfield_local(nb) + rmask(i,j)*field_2d(i,j) - endif - enddo - enddo - - field_basin_sum(:) = parallel_reduce_sum(sumfield_local(:)) + field_basin_sum = parallel_global_sum_patch(rmask*field_2d, nbasin, basin_number, parallel) end subroutine glissade_basin_sum @@ -576,6 +562,7 @@ end subroutine glissade_basin_sum subroutine glissade_basin_average(& nx, ny, & + parallel, & nbasin, basin_number, & rmask, & field_2d, & @@ -586,11 +573,14 @@ subroutine glissade_basin_average(& ! All cells are weighted equally. ! Note: This subroutine assumes an input field located at cell centers - use cism_parallel, only: parallel_reduce_sum, nhalo + use cism_parallel, only: parallel_global_sum_patch integer, intent(in) :: & nx, ny !> number of grid cells in each dimension + type(parallel_type), intent(in) :: & + parallel !> info for parallel communication + integer, intent(in) :: & nbasin !> number of basins @@ -607,33 +597,17 @@ subroutine glissade_basin_average(& ! local variables - integer :: i, j, nb + integer :: nb !TODO - Replace sumcell with sumarea, and pass in cell area. ! Current algorithm assumes all cells with mask = 1 have equal weight. real(dp), dimension(nbasin) :: & - summask_local, & ! sum of mask in each basin on local task summask_global, & ! sum of mask in each basin on full domain - sumfield_local, & ! sum of field on local task sumfield_global ! sum of field over full domain - summask_local(:) = 0.0d0 - sumfield_local(:) = 0.0d0 - - ! 
loop over locally owned cells only - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - nb = basin_number(i,j) - if (nb >= 1) then - summask_local(nb) = summask_local(nb) + rmask(i,j) - sumfield_local(nb) = sumfield_local(nb) + rmask(i,j)*field_2d(i,j) - endif - enddo - enddo - - summask_global(:) = parallel_reduce_sum(summask_local(:)) - sumfield_global(:) = parallel_reduce_sum(sumfield_local(:)) + summask_global = parallel_global_sum_patch(rmask, nbasin, basin_number, parallel) + sumfield_global = parallel_global_sum_patch(rmask*field_2d, nbasin, basin_number, parallel) do nb = 1, nbasin if (summask_global(nb) > tiny(0.0d0)) then diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 3b146e44..6ea89c0a 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -239,6 +239,8 @@ module glissade_velo_higher ! logical :: verbose_L1L2 = .true. logical :: verbose_diva = .false. ! logical :: verbose_diva = .true. + logical :: verbose_bp = .false. +! logical :: verbose_bp = .true. logical :: verbose_glp = .false. ! logical :: verbose_glp = .true. logical :: verbose_picard = .false. @@ -3591,7 +3593,7 @@ subroutine glissade_velo_higher_solve(model, & !WHL - debug !TODO - One diagnostic to write out column velocities for any approximation - if (whichapprox == HO_APPROX_BP .and. this_rank==rtest) then + if (verbose_bp .and. whichapprox == HO_APPROX_BP .and. this_rank==rtest) then write(iulog,*) ' ' i = itest j = jtest @@ -9424,7 +9426,7 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & if ( (k+kA >= 1 .and. k+kA <= nz) & .and. & (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & + .and. & (j+jA >= 1 .and. j+jA <= ny-1) ) then m = indxA_3d(iA,jA,kA) @@ -9475,6 +9477,11 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & L2_norm = parallel_reduce_sum(L2_norm) L2_norm = sqrt(L2_norm) +!! sum_resid_u_sq = parallel_global_sum(bu*bu, parallel, active_vertex) +!! 
sum_resid_v_sq = parallel_global_sum(bv*bv, parallel, active_vertex) +!! L2_norm = parallel_global_sum(resid_sq, parallel) +!! L2_norm = sqrt(L2_norm) + if (verbose_residual) then if (this_rank==rtest) then diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 23c9ceaf..1d4df22a 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -48,8 +48,7 @@ module glissade_velo_higher_pcg use glimmer_log use profile, only: t_startf, t_stopf use cism_parallel, only: this_rank, main_task, & - parallel_type, staggered_parallel_halo, parallel_reduce_sum, & - parallel_global_sum_staggered + parallel_type, staggered_parallel_halo, parallel_global_sum_stagger implicit none @@ -60,8 +59,6 @@ module glissade_velo_higher_pcg logical, parameter :: verbose_pcg = .false. logical, parameter :: verbose_tridiag = .false. -!! logical, parameter :: verbose_pcg = .true. -!! logical, parameter :: verbose_tridiag = .true. contains @@ -322,11 +319,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - L2_rhs, & - work0u, work0v) + L2_rhs = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_init") ! 
take square root @@ -387,11 +380,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - eta1, & - work0u, work0v) + eta1 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") !WHL - If the SIA solver has failed due to singular matrices, @@ -447,11 +436,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - eta2, & - work0u, work0v) + eta2 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") ! Compute alpha @@ -519,11 +504,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - L2_resid, & - work0u, work0v) + L2_resid = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_resid") ! take square root @@ -753,11 +734,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - L2_rhs, & - work0u, work0v) + L2_rhs = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_init") ! 
take square root @@ -810,11 +787,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - eta1, & - work0u, work0v) + eta1 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") !WHL - If the SIA solver has failed due to singular matrices, @@ -870,11 +843,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - eta2, & - work0u, work0v) + eta2 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") ! Compute alpha @@ -942,11 +911,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - L2_resid, & - work0u, work0v) + L2_resid = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_resid") ! take square root @@ -1428,11 +1393,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - bb, & - worku, workv) + bb = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_init") ! take square root @@ -1626,11 +1587,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Find global sums of (r,z) and (d,q) call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) call t_stopf("pcg_glbsum_iter") !---- Halo update for q @@ -1805,11 +1762,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! this is the one MPI global reduction per iteration. 
call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) call t_stopf("pcg_glbsum_iter") !---- Halo update for Az @@ -1911,11 +1864,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - rr, & - worku, workv) + rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") L2_resid = sqrt(rr) ! L2 norm of residual @@ -2255,6 +2204,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & denom_u, denom_v) elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + !TODO - Figure out why these calculations depend on the number of cores. Halo bug? ! Allocate tridiagonal matrices ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. @@ -2365,11 +2315,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - bb, & - worku, workv) + bb = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_init") ! take square root @@ -2450,7 +2396,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !WHL - debug if (verbose_pcg .and. this_rank == rtest) then - i = itest +! i = itest ! write(iulog,*) ' ' ! write(iulog,*) 'zv solve with diagonal precond, this_rank, i =', this_rank, i ! write(iulog,*) 'j, active, Adiagv, rv, zv, xv:' @@ -2496,14 +2442,14 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & zu, zv) ! solution !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) - enddo - endif + if (verbose_pcg .and. 
this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? @@ -2627,16 +2573,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & qu, qv) call t_stopf("pcg_matmult_iter") - !WHL - debug - usum = sum(qu(staggered_ilo:staggered_ihi,staggered_jlo:staggered_jhi)) - usum_global = parallel_reduce_sum(usum) - vsum = sum(qv(staggered_ilo:staggered_ihi,staggered_jlo:staggered_jhi)) - vsum_global = parallel_reduce_sum(vsum) - - if (verbose_pcg .and. this_rank == rtest) then -!! write(iulog,*) 'Prep: sum(qu), sum(qv) =', usum_global, vsum_global - endif - !---- Compute intermediate result for dot product (d,q) = (d,Ad) call t_startf("pcg_dotprod") @@ -2647,17 +2583,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Find global sums of (r,z) and (d,q) call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! nflds = 2 call t_stopf("pcg_glbsum_iter") - if (verbose_pcg .and. this_rank == rtest) then -!! write(iulog,*) 'Prep: gsum(1), gsum(2) =', gsum(1), gsum(2) - endif - !---- Halo update for q call t_startf("pcg_halo_iter") @@ -2686,16 +2614,16 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & rv(:,:) = rv(:,:) - alpha*qv(:,:) call t_stopf("pcg_vecupdate") - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' !! 
do i = itest-3, itest+3 - do i = staggered_ilo, staggered_ihi - write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) - enddo ! i - endif + do i = staggered_ilo, staggered_ihi + write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) + enddo ! i + endif !--------------------------------------------------------------- ! Iterate to solution @@ -2934,11 +2862,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! this is the one MPI global reduction per iteration. call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! nflds = 2 call t_stopf("pcg_glbsum_iter") !---- Halo update for Az @@ -3038,11 +2962,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - rr, & - worku, workv) + rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") L2_resid = sqrt(rr) ! L2 norm of residual From 37df31dac277793384aadda9ffd3e6c11ac5a69c Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Fri, 5 Dec 2025 12:51:55 -0700 Subject: [PATCH 03/21] Shortened some subroutine names in the cism_parallel module This commit changes subroutines distributed_gather_var_* to gather_var_*, and similarly changes distributed_scatter_var_* to scatter_var_*. This prevents some compilers from complaining about long subroutine names. The 'distributed' prefix in subroutines names goes back a decade or so to when CISM was first parallelized. Might at some point want to remove it from other subroutine names. 
--- libglide/isostasy_elastic.F90 | 12 +- libglimmer/parallel_mpi.F90 | 198 +++++++++++------------ libglint/glint_initialise.F90 | 4 +- libglint/glint_interp.F90 | 40 ++--- libglissade/glissade.F90 | 3 +- libglissade/glissade_basal_traction.F90 | 10 +- libglissade/glissade_glacier.F90 | 10 +- libglissade/glissade_velo_higher_pcg.F90 | 17 +- 8 files changed, 146 insertions(+), 148 deletions(-) diff --git a/libglide/isostasy_elastic.F90 b/libglide/isostasy_elastic.F90 index 60f2f55e..d309137d 100644 --- a/libglide/isostasy_elastic.F90 +++ b/libglide/isostasy_elastic.F90 @@ -136,7 +136,7 @@ subroutine calc_elastic(& !> the load for simulations on more than one task. use cism_parallel, only: this_rank, main_task, & - parallel_type, distributed_gather_var, distributed_scatter_var, parallel_halo + parallel_type, gather_var, scatter_var, parallel_halo implicit none @@ -182,8 +182,8 @@ subroutine calc_elastic(& ! Gather the local arrays onto the main task ! Note: global arrays are allocated in the subroutine - call distributed_gather_var(load_factors, load_factors_global, parallel) - call distributed_gather_var(load, load_global, parallel) + call gather_var(load_factors, load_factors_global, parallel) + call gather_var(load, load_global, parallel) if (main_task) then do j = 1, global_nsn @@ -207,9 +207,9 @@ subroutine calc_elastic(& ! Scatter the load values back to local arrays ! Note: The global array is deallocated in the subroutine - call distributed_scatter_var(load, load_global, parallel) + call scatter_var(load, load_global, parallel) - ! distributed_scatter_var does not update the halo, so do an update here + ! scatter_var does not update the halo, so do an update here call parallel_halo(load, parallel) ! 
Deallocate the other global array (which is intent(in) and does not need to be scattered) @@ -251,7 +251,7 @@ subroutine init_rbel(rbel, a) use isostasy_kelvin implicit none type(isos_elastic) :: rbel !> structure holding elastic litho data - real(dp), intent(in) :: a !> radius of disk + real(dp), intent(in) :: a !> radius of disk real(dp) :: dummy_a diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index 972b8b32..6afb1d9f 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -196,29 +196,29 @@ module cism_parallel module procedure broadcast_real8_1d end interface - interface distributed_gather_var - module procedure distributed_gather_var_integer_2d - module procedure distributed_gather_var_logical_2d - module procedure distributed_gather_var_real4_2d - module procedure distributed_gather_var_real4_3d - module procedure distributed_gather_var_real8_2d - module procedure distributed_gather_var_real8_3d + interface gather_var + module procedure gather_var_integer_2d + module procedure gather_var_logical_2d + module procedure gather_var_real4_2d + module procedure gather_var_real4_3d + module procedure gather_var_real8_2d + module procedure gather_var_real8_3d end interface - interface distributed_gather_var_row - module procedure distributed_gather_var_row_real8_2d + interface gather_var_row + module procedure gather_var_row_real8_2d end interface - interface distributed_gather_all_var_row - module procedure distributed_gather_all_var_row_real8_2d + interface gather_all_var_row + module procedure gather_all_var_row_real8_2d end interface - interface distributed_gather_var_col - module procedure distributed_gather_var_col_real8_2d + interface gather_var_col + module procedure gather_var_col_real8_2d end interface - interface distributed_gather_all_var_col - module procedure distributed_gather_all_var_col_real8_2d + interface gather_all_var_col + module procedure gather_all_var_col_real8_2d end interface interface 
distributed_get_var @@ -246,21 +246,21 @@ module cism_parallel module procedure distributed_put_var_real8_3d end interface - interface distributed_scatter_var - module procedure distributed_scatter_var_integer_2d - module procedure distributed_scatter_var_logical_2d - module procedure distributed_scatter_var_real4_2d - module procedure distributed_scatter_var_real4_3d - module procedure distributed_scatter_var_real8_2d - module procedure distributed_scatter_var_real8_3d + interface scatter_var + module procedure scatter_var_integer_2d + module procedure scatter_var_logical_2d + module procedure scatter_var_real4_2d + module procedure scatter_var_real4_3d + module procedure scatter_var_real8_2d + module procedure scatter_var_real8_3d end interface - interface distributed_scatter_var_row - module procedure distributed_scatter_var_row_real8_2d + interface scatter_var_row + module procedure scatter_var_row_real8_2d end interface - interface distributed_scatter_var_col - module procedure distributed_scatter_var_col_real8_2d + interface scatter_var_col + module procedure scatter_var_col_real8_2d end interface interface parallel_boundary_value @@ -631,10 +631,10 @@ end function distributed_execution !======================================================================= - ! subroutines belonging to the distributed_gather_var interface + ! subroutines belonging to the gather_var interface ! WHL, July 2019: - ! There is an issue with allocating the global_values array in the distributed_gather_var_*, + ! There is an issue with allocating the global_values array in the gather_var_*, ! distributed_get_var_*, distributed_print_*, and distributed_put_var_* functions and subroutines ! when computing only on active blocks (compute_blocks = 1). ! This array is allocated based on the max and min of ewlb, ewub, nslb, and nsub over the global domain. @@ -646,7 +646,7 @@ end function distributed_execution ! 
global_minval_nslb, and global_maxval_nsub, which are now computed at initialization ! based on the bounds in all blocks (including inactive blocks), not just active blocks. - subroutine distributed_gather_var_integer_2d(values, global_values, parallel) + subroutine gather_var_integer_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -679,7 +679,7 @@ subroutine distributed_gather_var_integer_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -751,10 +751,10 @@ subroutine distributed_gather_var_integer_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_integer_2d + end subroutine gather_var_integer_2d - subroutine distributed_gather_var_logical_2d(values, global_values, parallel) + subroutine gather_var_logical_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -787,7 +787,7 @@ subroutine distributed_gather_var_logical_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -858,10 +858,10 @@ subroutine distributed_gather_var_logical_2d(values, global_values, parallel) end associate ! 
automatic deallocation - end subroutine distributed_gather_var_logical_2d + end subroutine gather_var_logical_2d - subroutine distributed_gather_var_real4_2d(values, global_values, parallel) + subroutine gather_var_real4_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -894,7 +894,7 @@ subroutine distributed_gather_var_real4_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -965,10 +965,10 @@ subroutine distributed_gather_var_real4_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_real4_2d + end subroutine gather_var_real4_2d - subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, ud1) + subroutine gather_var_real4_3d(values, global_values, parallel, ld1, ud1) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -1002,7 +1002,7 @@ subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -1091,10 +1091,10 @@ subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, end associate ! 
automatic deallocation - end subroutine distributed_gather_var_real4_3d + end subroutine gather_var_real4_3d - subroutine distributed_gather_var_real8_2d(values, global_values, parallel) + subroutine gather_var_real8_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -1127,7 +1127,7 @@ subroutine distributed_gather_var_real8_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -1199,10 +1199,10 @@ subroutine distributed_gather_var_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_real8_2d + end subroutine gather_var_real8_2d - subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, ud1) + subroutine gather_var_real8_3d(values, global_values, parallel, ld1, ud1) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -1236,7 +1236,7 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -1325,16 +1325,16 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, end associate ! 
automatic deallocation - end subroutine distributed_gather_var_real8_3d + end subroutine gather_var_real8_3d !======================================================================= - ! subroutines belonging to the distributed_gather_var_row interface + ! subroutines belonging to the gather_var_row interface - subroutine distributed_gather_var_row_real8_2d(values, global_values, parallel) + subroutine gather_var_row_real8_2d(values, global_values, parallel) ! Gather data along a row of tasks onto the main task for that row. - ! Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_ewn. ! The second index represents the north-south dimension, and is assumed @@ -1369,7 +1369,7 @@ subroutine distributed_gather_var_row_real8_2d(values, global_values, parallel) if (size(values,2) /= own_nsn) then ! Note: Removing this restriction would require some recoding below. - write(*,*) "ERROR: distributed_gather_var_row requires N-S array size of own_nsn" + write(*,*) "ERROR: gather_var_row requires N-S array size of own_nsn" write(*,*) 'rank, own_nsn, size(values,2) =', this_rank, own_nsn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1464,16 +1464,16 @@ subroutine distributed_gather_var_row_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_row_real8_2d + end subroutine gather_var_row_real8_2d !======================================================================= - ! subroutines belonging to the distributed_gather_all_var_row interface + ! subroutines belonging to the gather_all_var_row interface - subroutine distributed_gather_all_var_row_real8_2d(values, global_values, parallel) + subroutine gather_all_var_row_real8_2d(values, global_values, parallel) ! Gather global data along a row of tasks onto each task for that row. - ! 
Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_ewn. ! The second index represents the north-south dimension, and is assumed @@ -1511,7 +1511,7 @@ subroutine distributed_gather_all_var_row_real8_2d(values, global_values, parall ! TODO: Do this recoding. This subroutine currently fails with outflow BC, because ! the southern and western rows of tasks have an extra locally owned vertex, ! giving size(values,2) = own_nsn + 1 - write(*,*) "ERROR: distributed_gather_var_row requires N-S array size of own_nsn" + write(*,*) "ERROR: gather_var_row requires N-S array size of own_nsn" write(*,*) 'rank, own_nsn, size(values,2) =', this_rank, own_nsn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1586,16 +1586,16 @@ subroutine distributed_gather_all_var_row_real8_2d(values, global_values, parall end associate ! automatic deallocation - end subroutine distributed_gather_all_var_row_real8_2d + end subroutine gather_all_var_row_real8_2d !======================================================================= - ! subroutines belonging to the distributed_gather_var_col interface + ! subroutines belonging to the gather_var_col interface - subroutine distributed_gather_var_col_real8_2d(values, global_values, parallel) + subroutine gather_var_col_real8_2d(values, global_values, parallel) ! Gather data along a column of tasks onto the main task for that column. - ! Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_nsn. ! The second index represents the east-west dimension, and is assumed @@ -1630,7 +1630,7 @@ subroutine distributed_gather_var_col_real8_2d(values, global_values, parallel) if (size(values,2) /= own_ewn) then ! 
Note: Removing this restriction would require some recoding below. - write(*,*) "ERROR: distributed_gather_var_row requires E-W array size of own_ewn" + write(*,*) "ERROR: gather_var_row requires E-W array size of own_ewn" write(*,*) 'rank, own_ewn, size(values,2) =', this_rank, own_ewn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1726,16 +1726,16 @@ subroutine distributed_gather_var_col_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_col_real8_2d + end subroutine gather_var_col_real8_2d !======================================================================= - ! subroutines belonging to the distributed_gather_all_var_col interface + ! subroutines belonging to the gather_all_var_col interface - subroutine distributed_gather_all_var_col_real8_2d(values, global_values, parallel) + subroutine gather_all_var_col_real8_2d(values, global_values, parallel) ! Gather global data along a column of tasks onto each task for that column. - ! Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_nsn. ! The second index represents the east-west dimension, and is assumed @@ -1770,7 +1770,7 @@ subroutine distributed_gather_all_var_col_real8_2d(values, global_values, parall if (size(values,2) /= own_ewn) then ! Note: Removing this restriction would require some recoding below. - write(*,*) "ERROR: distributed_gather_var_row requires E-W array size of own_ewn" + write(*,*) "ERROR: gather_var_row requires E-W array size of own_ewn" write(*,*) 'rank, own_ewn, size(values,2) =', this_rank, own_ewn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1845,7 +1845,7 @@ subroutine distributed_gather_all_var_col_real8_2d(values, global_values, parall end associate ! 
automatic deallocation - end subroutine distributed_gather_all_var_col_real8_2d + end subroutine gather_all_var_col_real8_2d !======================================================================= @@ -4334,9 +4334,9 @@ end function distributed_put_var_real8_3d !======================================================================= - ! subroutines belonging to the distributed_scatter_var interface + ! subroutines belonging to the scatter_var interface - subroutine distributed_scatter_var_integer_2d(values, global_values, parallel) + subroutine scatter_var_integer_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4366,7 +4366,7 @@ subroutine distributed_scatter_var_integer_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4421,10 +4421,10 @@ subroutine distributed_scatter_var_integer_2d(values, global_values, parallel) deallocate(global_values) ! TODO - Is this deallocation necessary, here and below? ! automatic deallocation - end subroutine distributed_scatter_var_integer_2d + end subroutine scatter_var_integer_2d - subroutine distributed_scatter_var_logical_2d(values, global_values, parallel) + subroutine scatter_var_logical_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4454,7 +4454,7 @@ subroutine distributed_scatter_var_logical_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! 
Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4509,10 +4509,10 @@ subroutine distributed_scatter_var_logical_2d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_logical_2d + end subroutine scatter_var_logical_2d - subroutine distributed_scatter_var_real4_2d(values, global_values, parallel) + subroutine scatter_var_real4_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4542,7 +4542,7 @@ subroutine distributed_scatter_var_real4_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4597,10 +4597,10 @@ subroutine distributed_scatter_var_real4_2d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real4_2d + end subroutine scatter_var_real4_2d - subroutine distributed_scatter_var_real4_3d(values, global_values, parallel) + subroutine scatter_var_real4_3d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4630,7 +4630,7 @@ subroutine distributed_scatter_var_real4_3d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! 
Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4687,10 +4687,10 @@ subroutine distributed_scatter_var_real4_3d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real4_3d + end subroutine scatter_var_real4_3d - subroutine distributed_scatter_var_real8_2d(values, global_values, parallel) + subroutine scatter_var_real8_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4720,7 +4720,7 @@ subroutine distributed_scatter_var_real8_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4775,10 +4775,10 @@ subroutine distributed_scatter_var_real8_2d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real8_2d + end subroutine scatter_var_real8_2d - subroutine distributed_scatter_var_real8_3d(values, global_values, parallel, deallocflag) + subroutine scatter_var_real8_3d(values, global_values, parallel, deallocflag) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4810,7 +4810,7 @@ subroutine distributed_scatter_var_real8_3d(values, global_values, parallel, dea if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! 
Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4874,16 +4874,16 @@ subroutine distributed_scatter_var_real8_3d(values, global_values, parallel, dea if (deallocmem) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real8_3d + end subroutine scatter_var_real8_3d !======================================================================= - ! subroutines belonging to the distributed_scatter_var_row interface + ! subroutines belonging to the scatter_var_row interface - subroutine distributed_scatter_var_row_real8_2d(values, global_values, parallel) + subroutine scatter_var_row_real8_2d(values, global_values, parallel) ! Scatter data to a row of tasks from the main task for that row. - ! Based on distributed_scatter_var_real8_2d. + ! Based on scatter_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_ewn. ! The second index represents the north-south dimension, and is assumed @@ -4917,7 +4917,7 @@ subroutine distributed_scatter_var_row_real8_2d(values, global_values, parallel) if (size(values,2) /= own_nsn) then ! Note: Removing this restriction would require some recoding below. - write(iulog,*) "ERROR: distributed_scatter_var_row requires N-S array size of own_nsn" + write(iulog,*) "ERROR: scatter_var_row requires N-S array size of own_nsn" call parallel_stop(__FILE__, __LINE__) end if @@ -4974,16 +4974,16 @@ subroutine distributed_scatter_var_row_real8_2d(values, global_values, parallel) end associate ! 
automatic deallocation - end subroutine distributed_scatter_var_row_real8_2d + end subroutine scatter_var_row_real8_2d !======================================================================= - ! subroutines belonging to the distributed_scatter_var_col interface + ! subroutines belonging to the scatter_var_col interface - subroutine distributed_scatter_var_col_real8_2d(values, global_values, parallel) + subroutine scatter_var_col_real8_2d(values, global_values, parallel) ! Scatter data to a column of tasks from the main task for that column - ! Based on distributed_scatter_var_real8_2d. + ! Based on scatter_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_nsn. ! The second index represents the east-west dimension, and is assumed @@ -5016,7 +5016,7 @@ subroutine distributed_scatter_var_col_real8_2d(values, global_values, parallel) if (size(values,2) /= own_ewn) then ! Note: Removing this restriction would require some recoding below. - write(iulog,*) "ERROR: distributed_scatter_var_col requires E-W array size of own_nsn" + write(iulog,*) "ERROR: scatter_var_col requires E-W array size of own_nsn" call parallel_stop(__FILE__, __LINE__) end if @@ -5073,7 +5073,7 @@ subroutine distributed_scatter_var_col_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_scatter_var_col_real8_2d + end subroutine scatter_var_col_real8_2d !======================================================================= @@ -9402,7 +9402,7 @@ subroutine parallel_test_comm_row_col(parallel) enddo endif ! this_rank - call distributed_gather_var_row(test_array, global_test_array, parallel) + call gather_var_row(test_array, global_test_array, parallel) !! if (parallel%main_task_row) then if (parallel%main_task_row .and. 
this_rank == 0) then @@ -9416,7 +9416,7 @@ subroutine parallel_test_comm_row_col(parallel) write(iulog,*) ' ' endif - call distributed_scatter_var_row(test_array, global_test_array, parallel) + call scatter_var_row(test_array, global_test_array, parallel) if (this_rank == 0) then write(iulog,*) ' ' @@ -9456,7 +9456,7 @@ subroutine parallel_test_comm_row_col(parallel) enddo endif ! this_rank - call distributed_gather_var_col(test_array, global_test_array, parallel) + call gather_var_col(test_array, global_test_array, parallel) !! if (parallel%main_task_col) then if (parallel%main_task_col .and. this_rank == 0) then @@ -9470,7 +9470,7 @@ subroutine parallel_test_comm_row_col(parallel) write(iulog,*) ' ' endif - call distributed_scatter_var_col(test_array, global_test_array, parallel) + call scatter_var_col(test_array, global_test_array, parallel) if (this_rank == 0) then write(iulog,*) ' ' diff --git a/libglint/glint_initialise.F90 b/libglint/glint_initialise.F90 index a94a105e..79cd9038 100644 --- a/libglint/glint_initialise.F90 +++ b/libglint/glint_initialise.F90 @@ -607,7 +607,7 @@ subroutine setup_lgrid_fulldomain(instance, grid, grid_orog) use glint_global_grid , only : global_grid use glimmer_coordinates, only : coordsystem_new use glide_types , only : get_dew, get_dns - use cism_parallel , only : parallel_type, distributed_gather_var + use cism_parallel , only : parallel_type, gather_var implicit none @@ -630,7 +630,7 @@ subroutine setup_lgrid_fulldomain(instance, grid, grid_orog) global_ewn = instance%model%parallel%global_ewn global_nsn = instance%model%parallel%global_nsn - call distributed_gather_var(instance%out_mask, out_mask_fulldomain, parallel) + call gather_var(instance%out_mask, out_mask_fulldomain, parallel) if (main_task) then diff --git a/libglint/glint_interp.F90 b/libglint/glint_interp.F90 index 83491b99..5ef84eb0 100644 --- a/libglint/glint_interp.F90 +++ b/libglint/glint_interp.F90 @@ -248,7 +248,7 @@ subroutine interp_to_local 
(lgrid_fulldomain, global, & use glimmer_utils use glimmer_coordinates use glimmer_log - use cism_parallel, only : main_task, parallel_type, distributed_scatter_var, parallel_halo + use cism_parallel, only : main_task, parallel_type, scatter_var, parallel_halo !TODO - Not sure we need localsp now that the code is fully double precision @@ -299,9 +299,9 @@ subroutine interp_to_local (lgrid_fulldomain, global, & ! Allocate variables to hold result of interpolation ! We allocate size 0 arrays on non-main task (rather than leaving variables - ! unallocated there), because distributed_scatter_var tries to do a deallocate on all tasks + ! unallocated there), because scatter_var tries to do a deallocate on all tasks ! Note that coordsystem_allocate can't be used here because it only works on pointer - ! variables, and the *_fulldomain variables are non-pointers (as is required for distributed_scatter_var) + ! variables, and the *_fulldomain variables are non-pointers (as is required for scatter_var) if (present(localsp)) then if (main_task) then @@ -450,25 +450,25 @@ subroutine interp_to_local (lgrid_fulldomain, global, & end if ! main_task ! Main task scatters interpolated data from the full domain to the task owning each point - ! Note that distributed_scatter_var doesn't set halo values, so we need to do a halo + ! Note that scatter_var doesn't set halo values, so we need to do a halo ! update if it's important to have correct values in the halo cells. ! Although it's not strictly necessary to have the halo values, we compute them just in ! case another part of the code (e.g., glissade_temp) assumes they are available. 
if (present(localsp)) then localsp(:,:) = 0.d0 - call distributed_scatter_var(localsp, localsp_fulldomain, parallel) + call scatter_var(localsp, localsp_fulldomain, parallel) call parallel_halo(localsp, parallel) endif if (present(localdp)) then localdp(:,:) = 0.d0 - call distributed_scatter_var(localdp, localdp_fulldomain, parallel) + call scatter_var(localdp, localdp_fulldomain, parallel) call parallel_halo(localdp, parallel) endif ! We do NOT deallocate the local*_fulldomain variables here, because the - ! distributed_scatter_var routines do this deallocation + ! scatter_var routines do this deallocation end subroutine interp_to_local @@ -489,7 +489,7 @@ subroutine copy_to_local (lgrid_fulldomain, global, & ! on the main task. use glimmer_coordinates - use cism_parallel, only : main_task, parallel_type, distributed_scatter_var, parallel_halo + use cism_parallel, only : main_task, parallel_type, scatter_var, parallel_halo ! Argument declarations @@ -524,16 +524,16 @@ subroutine copy_to_local (lgrid_fulldomain, global, & end if ! Main task scatters interpolated data from the full domain to the task owning each point - ! Note that distributed_scatter_var doesn't set halo values, so we need to do a halo + ! Note that scatter_var doesn't set halo values, so we need to do a halo ! update if it's important to have correct values in the halo cells. ! Although it's not strictly necessary to have the halo values, we compute them just in ! case another part of the code (e.g., glissade_temp) assumes they are available. local(:,:) = 0.d0 - call distributed_scatter_var(local, local_fulldomain, parallel) + call scatter_var(local, local_fulldomain, parallel) call parallel_halo(local, parallel) - ! We do NOT deallocate local_fulldomain here, because the distributed_scatter_var + ! We do NOT deallocate local_fulldomain here, because the scatter_var ! 
routine does this deallocation end subroutine copy_to_local @@ -695,7 +695,7 @@ subroutine local_to_global_avg(ups, parallel, local, global, mask) !> \texttt{interp\_to\_local} routine. !> \end{itemize} - use cism_parallel, only : main_task, parallel_type, distributed_gather_var + use cism_parallel, only : main_task, parallel_type, gather_var ! Arguments @@ -729,8 +729,8 @@ subroutine local_to_global_avg(ups, parallel, local, global, mask) ! Gather 'local' and 'tempmask' onto main task, which is the only one that does the regridding - call distributed_gather_var(local, local_fulldomain, parallel) - call distributed_gather_var(tempmask, tempmask_fulldomain, parallel) + call gather_var(local, local_fulldomain, parallel) + call gather_var(tempmask, tempmask_fulldomain, parallel) ! Main task does regridding @@ -785,7 +785,7 @@ subroutine local_to_global_sum(ups, parallel, local, global, mask) !> \item \texttt{gboxn} is the same size as \texttt{global} !> \end{itemize} - use cism_parallel, only : main_task, parallel_type, distributed_gather_var + use cism_parallel, only : main_task, parallel_type, gather_var ! Arguments @@ -816,8 +816,8 @@ subroutine local_to_global_sum(ups, parallel, local, global, mask) ! Gather 'local' and 'tempmask' onto main task, which is the only one that does the regridding - call distributed_gather_var(local, local_fulldomain, parallel) - call distributed_gather_var(tempmask, tempmask_fulldomain, parallel) + call gather_var(local, local_fulldomain, parallel) + call gather_var(tempmask, tempmask_fulldomain, parallel) ! Main task does regridding if (main_task) then @@ -854,7 +854,7 @@ subroutine local_to_global_min(ups, parallel, local, global, mask) !> \item \texttt{gboxn} is the same size as \texttt{global} !> \end{itemize} - use cism_parallel, only : main_task, parallel_type, distributed_gather_var + use cism_parallel, only : main_task, parallel_type, gather_var ! 
Arguments @@ -885,8 +885,8 @@ subroutine local_to_global_min(ups, parallel, local, global, mask) ! Gather 'local' and 'tempmask' onto main task, which is the only one that does the regridding - call distributed_gather_var(local, local_fulldomain, parallel) - call distributed_gather_var(tempmask, tempmask_fulldomain, parallel) + call gather_var(local, local_fulldomain, parallel) + call gather_var(tempmask, tempmask_fulldomain, parallel) ! Main task does regridding if (main_task) then diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index 6a11c5ac..72bab6f4 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -92,8 +92,7 @@ subroutine glissade_initialise(model, evolve_ice) ! initialise Glissade model instance - use cism_parallel, only: parallel_type, distributed_gather_var, & - distributed_scatter_var, parallel_finalise, & + use cism_parallel, only: parallel_type, parallel_finalise, & distributed_grid, distributed_grid_active_blocks, parallel_global_edge_mask, & parallel_halo, parallel_halo_extrapolate, parallel_reduce_max, & staggered_parallel_halo_extrapolate, staggered_no_penetration_mask, & diff --git a/libglissade/glissade_basal_traction.F90 b/libglissade/glissade_basal_traction.F90 index a5142dab..3ffcca40 100644 --- a/libglissade/glissade_basal_traction.F90 +++ b/libglissade/glissade_basal_traction.F90 @@ -52,7 +52,7 @@ module glissade_basal_traction use glimmer_utils, only: point_diag use glide_types use cism_parallel, only : this_rank, main_task, parallel_type, & - parallel_halo, staggered_parallel_halo, parallel_globalindex, distributed_scatter_var + parallel_halo, staggered_parallel_halo, parallel_globalindex, scatter_var implicit none @@ -347,7 +347,7 @@ subroutine glissade_calcbeta (& ! The following code sets beta on the full grid as prescribed by Pattyn et al. (2008). ! Allocate a global array on the main task only. - ! 
On other tasks, allocate a size 0 array, since distributed_scatter_var wants to deallocate on all tasks. + ! On other tasks, allocate a size 0 array, since scatter_var wants to deallocate on all tasks. if (main_task) then allocate(beta_global(parallel%global_ewn, parallel%global_nsn)) else @@ -377,9 +377,9 @@ subroutine glissade_calcbeta (& ! Note: beta_extend has dimensions (ewn,nsn), so it can receive scattered data from beta_global. allocate(beta_extend(ewn, nsn)) beta_extend(:,:) = 0.d0 - call distributed_scatter_var(beta_extend, beta_global, parallel) + call scatter_var(beta_extend, beta_global, parallel) - ! distributed_scatter_var does not update the halo, so do an update here + ! scatter_var does not update the halo, so do an update here call parallel_halo(beta_extend, parallel) ! Copy beta_extend to beta on the local processor. @@ -391,7 +391,7 @@ subroutine glissade_calcbeta (& enddo enddo - ! beta_extend is no longer needed (beta_global is deallocated in distributed_scatter_var) + ! beta_extend is no longer needed (beta_global is deallocated in scatter_var) deallocate(beta_extend) case(HO_BABC_BETA_EXTERNAL) ! use beta value from external file diff --git a/libglissade/glissade_glacier.F90 b/libglissade/glissade_glacier.F90 index b896beb6..67744610 100644 --- a/libglissade/glissade_glacier.F90 +++ b/libglissade/glissade_glacier.F90 @@ -74,7 +74,7 @@ subroutine glissade_glacier_init(model, glacier) ! Another array, cism_to_rgi_glacier_id, identifies the RGI ID associated with each CISM ID. ! The CISM input file contains the RGI IDs. - use cism_parallel, only: distributed_gather_var, distributed_scatter_var, & + use cism_parallel, only: gather_var, scatter_var, & parallel_global_sum, parallel_reduce_max, parallel_reduce_min, parallel_is_zero, & broadcast, parallel_halo, staggered_parallel_halo, parallel_globalindex @@ -228,10 +228,10 @@ subroutine glissade_glacier_init(model, glacier) ! 
Gather the RGI glacier IDs to the main task if (main_task) allocate(rgi_glacier_id_global(global_ewn, global_nsn)) - call distributed_gather_var(glacier%rgi_glacier_id, rgi_glacier_id_global, parallel) + call gather_var(glacier%rgi_glacier_id, rgi_glacier_id_global, parallel) ! Allocate a global array for the CISM glacier IDs on the main task. - ! Allocate a size 0 array on other tasks; distributed_scatter_var wants arrays allocated on all tasks. + ! Allocate a size 0 array on other tasks; scatter_var wants arrays allocated on all tasks. if (main_task) then allocate(cism_glacier_id_global(global_ewn,global_nsn)) else @@ -377,8 +377,8 @@ subroutine glissade_glacier_init(model, glacier) endif ! main_task ! Scatter cism_glacier_id_global to all processors - ! Note: This global array is deallocated in the distributed_scatter_var subroutine - call distributed_scatter_var(glacier%cism_glacier_id, cism_glacier_id_global, parallel) + ! Note: This global array is deallocated in the scatter_var subroutine + call scatter_var(glacier%cism_glacier_id, cism_glacier_id_global, parallel) call parallel_halo(glacier%cism_glacier_id, parallel) diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 1d4df22a..8d31e098 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -4557,9 +4557,8 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & first_time, gather_data) use glimmer_utils, only: tridiag - use cism_parallel, only: distributed_gather_var_row, distributed_gather_var_col, & - distributed_gather_all_var_row, distributed_gather_all_var_col, & - distributed_scatter_var_row, distributed_scatter_var_col + use cism_parallel, only: gather_var_row, gather_var_col, & + gather_all_var_row, gather_all_var_col, scatter_var_row, scatter_var_col integer, intent(in) :: & ilocal, jlocal ! 
size of input/output arrays; number of locally owned vertices in each direction @@ -4784,11 +4783,11 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & if (tridiag_solver_flag == 'row') then call t_startf("pcg_tridiag_gather_row") - call distributed_gather_all_var_row(outdata, gather_data2, parallel) + call gather_all_var_row(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_row") elseif (tridiag_solver_flag == 'col') then call t_startf("pcg_tridiag_gather_col") - call distributed_gather_all_var_col(outdata, gather_data2, parallel) + call gather_all_var_col(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_col") endif @@ -4799,11 +4798,11 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & if (tridiag_solver_flag == 'row') then call t_startf("pcg_tridiag_gather_row") - call distributed_gather_var_row(outdata, gather_data2, parallel) + call gather_var_row(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_row") elseif (tridiag_solver_flag == 'col') then call t_startf("pcg_tridiag_gather_col") - call distributed_gather_var_col(outdata, gather_data2, parallel) + call gather_var_col(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_col") endif @@ -4918,11 +4917,11 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & if (tridiag_solver_flag == 'row') then call t_startf("pcg_tridiag_scatter_row") - call distributed_scatter_var_row(local_coeffs, global_coeffs, parallel) + call scatter_var_row(local_coeffs, global_coeffs, parallel) call t_stopf ("pcg_tridiag_scatter_row") elseif (tridiag_solver_flag == 'col') then call t_startf("pcg_tridiag_scatter_col") - call distributed_scatter_var_col(local_coeffs, global_coeffs, parallel) + call scatter_var_col(local_coeffs, global_coeffs, parallel) call t_stopf ("pcg_tridiag_scatter_col") endif From 0aae0b7816593a975feb824aecc49ac8efca53cb Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Fri, 19 Dec 2025 16:34:12 -0700 Subject: [PATCH 04/21] More 
an option to call subroutine parallel_reduce_reprosum in lieu of parallel_reduce_sum.
These subroutines take an input array of dp variables, convert each variable to a character string corresponding to the 64-bit binary representation, and write that string to a file. This can be useful for debugging if we suspect that the values of some array elements depend on processor count, but we don't know which elements. The real8_3d version has an optional argument to make it work for either (k,i,j) arrays or (i,j,k) arrays. --- libglide/glide_diagnostics.F90 | 26 +- libglide/glide_setup.F90 | 3 + libglimmer/cism_reprosum_mod.F90 | 131 +----- libglimmer/glimmer_utils.F90 | 85 ++++ libglimmer/parallel_mpi.F90 | 570 +++++++++++++++++++---- libglissade/glissade.F90 | 28 +- libglissade/glissade_basal_water.F90 | 13 + libglissade/glissade_utils.F90 | 115 ++++- libglissade/glissade_velo_higher.F90 | 447 +++++++++++++----- libglissade/glissade_velo_higher_pcg.F90 | 176 +++---- utils/build/generate_ncvars.py | 8 +- 11 files changed, 1142 insertions(+), 460 deletions(-) diff --git a/libglide/glide_diagnostics.F90 b/libglide/glide_diagnostics.F90 index cd72557c..8aec8052 100644 --- a/libglide/glide_diagnostics.F90 +++ b/libglide/glide_diagnostics.F90 @@ -45,6 +45,8 @@ module glide_diagnostics implicit none + logical, parameter :: verbose_diagnostics = .false. + contains subroutine glide_write_diagnostics (model, time, & @@ -64,8 +66,6 @@ subroutine glide_write_diagnostics (model, time, & ! local arguments - logical, parameter :: verbose_diagnostics = .false. - ! debug if (main_task .and. verbose_diagnostics) then write(iulog,*) ' ' @@ -234,6 +234,9 @@ subroutine glide_write_diag (model, time) grounded_mask, & ! = 1 where ice is present and grounded, else = 0 glacier_ice_mask ! = 1 where glacier ice is present, initially and/or currently + integer, dimension(model%general%ewn-1,model%general%nsn-1) :: & + stag_ice_mask ! staggered mask; = 1 if ice_mask = 1 for any of the four neighbors + real(dp), dimension(model%general%upn) :: & temp_diag, & ! 
Note: sfc temp not included if temps are staggered ! (use artm instead) @@ -275,9 +278,11 @@ subroutine glide_write_diag (model, time) real(dp), dimension(model%general%ewn, model%general%nsn) :: & mass_above_flotation,& ! ice mass above flotation (kg) - velo_sfc, & ! surface ice speed (m/s) thck_obs ! observed ice thickness (m), derived from usrf_obs and topg + real(dp), dimension(model%general%ewn-1, model%general%nsn-1) :: & + velo_sfc ! surface ice speed (m/s) + real(dp), dimension(:,:,:), allocatable :: & local_energy ! internal energy (J) per layer in a column @@ -355,6 +360,17 @@ subroutine glide_write_diag (model, time) enddo enddo + do j = 1, nsn-1 + do i = 1, ewn-1 + if (ice_mask(i,j+1) == 1 .or. ice_mask(i+1,j+1) == 1 .or. & + ice_mask(i,j) == 1 .or. ice_mask(i+1,j) == 1) then + stag_ice_mask(i,j) = 1 + else + stag_ice_mask(i,j) = 0 + endif + enddo + enddo + !----------------------------------------------------------------- ! Compute and write global diagnostics !----------------------------------------------------------------- @@ -897,8 +913,8 @@ subroutine glide_write_diag (model, time) + model%velocity%vvel(1,:,:)**2) call glissade_rms_error(& - ewn, nsn, & - ice_mask, & + ewn-1, nsn-1, & + stag_ice_mask, & parallel, & velo_sfc * scyr, & ! m/yr model%velocity%velo_sfc_obs * scyr, & ! 
m/yr diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90 index 3b2aa1e4..d28088be 100644 --- a/libglide/glide_setup.F90 +++ b/libglide/glide_setup.F90 @@ -2699,6 +2699,9 @@ subroutine print_parameters(model) if (model%options%which_ho_babc == HO_BABC_BETA_CONSTANT) then write(message,*) 'uniform beta (Pa yr/m) : ',model%basal_physics%ho_beta_const call write_log(message) + elseif (model%options%which_ho_babc == HO_BABC_BETA_LARGE) then + write(message,*) 'large beta (Pa yr/m) : ',model%basal_physics%ho_beta_large + call write_log(message) elseif (model%options%which_ho_babc == HO_BABC_BETA_BPMP) then write(message,*) 'large (frozen) beta (Pa yr/m) : ',model%basal_physics%ho_beta_large call write_log(message) diff --git a/libglimmer/cism_reprosum_mod.F90 b/libglimmer/cism_reprosum_mod.F90 index 55a034c1..2f071f82 100644 --- a/libglimmer/cism_reprosum_mod.F90 +++ b/libglimmer/cism_reprosum_mod.F90 @@ -42,16 +42,16 @@ module cism_reprosum_mod use glimmer_global, only: r8 => dp use glimmer_global, only: i8 use glimmer_paramets, only: iulog -!! use cism_parallel, only: main_task + use glimmer_utils, only: double_to_binary ! use shr_log_mod, only: s_loglev => shr_log_Level ! use shr_log_mod, only: s_logunit => shr_log_Unit ! use shr_sys_mod, only: shr_sys_abort - use cism_infnan_mod,only: cism_infnan_inf_type, assignment(=), & cism_infnan_posinf, cism_infnan_neginf, & cism_infnan_nan, & cism_infnan_isnan, cism_infnan_isinf, & cism_infnan_isposinf, cism_infnan_isneginf + #ifdef TIMING use perf_mod #endif @@ -456,21 +456,14 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & real(r8) :: abs_diff ! absolute difference between ! fixed and floating point ! sums + character(len=64) :: binary_str ! string to represent 64 bits of i8 integer + integer :: n #ifdef _OPENMP integer omp_get_max_threads external omp_get_max_threads #endif - !WHL - debug - logical :: test_sum = .false. 
- real(r8), dimension(dsummands,nflds) :: arr_test - arr_test(:,:) = 0.0d0 - call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) 'In cism_reprosum_mod' - endif -! !----------------------------------------------------------------------- ! ! initialize local statistics variables @@ -505,10 +498,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & !TODO - Remove the inf_nan option; assume abort_inf_nan = T, as in CICE if (abort_inf_nan) then - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) ' abort_inf_nan check' - endif - ! check whether input contains NaNs or INFs, and abort if so nan_check = any(cism_infnan_isnan(arr)) inf_check = any(cism_infnan_isinf(arr)) @@ -587,9 +576,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & ! get number of MPI tasks call mpi_comm_size(mpi_comm, tasks, ierr) - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) 'Starting reprosum, tasks =', tasks - endif ! get number of OpenMP threads #ifdef _OPENMP omp_nthreads = omp_get_max_threads() @@ -847,12 +833,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & enddo endif - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) ' max_nsummands =', max_nsummands - write(iulog,*) ' max_levels(1), max_level =', max_levels(1), max_level - write(iulog,*) ' call reprosum_int, pid =', mypid - endif - ! calculate sum validate = .false. call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & @@ -944,13 +924,20 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & repro_sum_stats(6) = repro_sum_stats(6) + gbl_lor_red endif - !WHL - debug - if (verbose_reprosum .and. 
mypid == 0) then - write(iulog,*) 'Exiting cism_reprosum_calc, arr_gsum =', arr_gsum(:) + if (verbose_reprosum) then + call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) + if (mypid == 0) then + write(iulog,*) 'Exit reprosum, nflds =', nflds + write(iulog,*) ' n, arr_gsum, binary_str:' +!! do n = 1, nflds + do n = 1, min(2,nflds) + call double_to_binary(arr_gsum(n), binary_str) + write(iulog,*) n, arr_gsum(n), binary_str + enddo + endif endif end subroutine cism_reprosum_calc - ! !======================================================================== ! @@ -1079,13 +1066,6 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & !WHL - debug integer :: mypid, k call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) - - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) 'In cism_reprosum_int, pid, arr =', mypid, arr(:,:) - write(iulog,*) 'dsummands, nsummands =', dsummands, nsummands - write(iulog,*) 'size(arr) =', size(arr,1), size(arr,2) - endif - ! !----------------------------------------------------------------------- ! Save radix of i8 variables in an i8 variable @@ -1108,20 +1088,8 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & veclth = offset(nflds) + max_levels(nflds) ! split summand index range over OpenMP threads -! WHL - Should this be dsummands instead of nsummands? -!! call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) call split_indices(dsummands, omp_nthreads, isum_beg, isum_end) - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) 'pid, i8_radix, voffset, veclth, nflds =', mypid, i8_radix, voffset, veclth, nflds - write(iulog,*) ' max_levels =', max_levels(:) - write(iulog,*) ' dsummands, omp_nthreads =', dsummands, omp_nthreads - write(iulog,*) ' isum_beg/end =', isum_beg, isum_end -! write(iulog,*) ' size(i8_arr_lsum_level) = ', size(i8_arr_lsum_level) -! write(iulog,*) ' size(i8_arr_gsum_level) = ', size(i8_arr_gsum_level) - endif - - ! 
convert local summands to vector of integers and sum ! (Using scale instead of set_exponent because arr_remainder may not be ! "normal" after level 1 calculation) @@ -1152,27 +1120,12 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & do isum=isum_beg(ithread),isum_end(ithread) - if (verbose_reprosum .and. mypid == 1) then - write (iulog,*) ' pid, isum, ifld, arr(isum,ifld) =', mypid, isum, ifld, arr(isum,ifld) - endif - arr_remainder = 0.0_r8 - !WHL - debug - ! If isum_end > size(arr,1) = dsummands, then the next 'if' will try to access nonexistent memory. - ! Adding some logic to exit the do loop first. - ! Not necessary if split_indices is called with dsummands - !TODO: Check the split_indices logic for isum_end -!!!! if (isum > size(arr,1)) exit - if (arr(isum,ifld) .ne. 0.0_r8) then arr_exp = exponent(arr(isum,ifld)) arr_frac = fraction(arr(isum,ifld)) - if (verbose_reprosum) then -!!! write(iulog,*) ' pid, isum, arr_exp, arr_frac:', mypid, isum, arr_exp, arr_frac - endif - ! test that global maximum upper bound is an upper bound if (arr_exp > arr_gmax_exp(ifld)) then max_error(ifld,ithread) = 1 @@ -1197,10 +1150,6 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & ilevel = 1 endif - if (verbose_reprosum .and. mypid == 1) then - write (iulog,*) ' pid, ilevel, max_levels =', mypid, ilevel, max_levels(ifld) - endif - if (ilevel .le. max_levels(ifld)) then ! apply first shift/truncate, add it to the relevant running ! sum, and calculate the remainder. @@ -1209,30 +1158,17 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & i8_arr_tlsum_level(ilevel,ifld,ithread) = & i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level arr_remainder = arr_remainder - i8_arr_level - - if (verbose_reprosum .and. mypid <= 1) then - write(iulog,*) ' pid, arr_old, i8_arr_level, arr_new:', & - mypid, scale(arr_frac,arr_shift), i8_arr_level, arr_remainder - endif ! 
while the remainder is non-zero, continue to shift, truncate, ! sum, and calculate new remainder do while ((arr_remainder .ne. 0.0_r8) & .and. (ilevel < max_levels(ifld))) ilevel = ilevel + 1 - if (verbose_reprosum .and. mypid <= 1) then - write(iulog,*) ' pid, ilevel, arr_old =', mypid, ilevel, arr_remainder - endif arr_remainder = scale(arr_remainder,arr_max_shift) i8_arr_level = int(arr_remainder,i8) i8_arr_tlsum_level(ilevel,ifld,ithread) = & i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level - if (verbose_reprosum .and. mypid <= 1) then - write(iulog,*) ' pid, arr_scaled, i8_arr_level:', & - mypid, arr_remainder, i8_arr_level - endif - arr_remainder = arr_remainder - i8_arr_level enddo ! arr_remainder /= 0.0_r8 @@ -1272,22 +1208,12 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & ! sum contributions from different threads do ifld=1,nflds - if (verbose_reprosum .and. mypid == 0) then -!! write(iulog,*) 'ifld =', ifld - endif - ioffset = offset(ifld) do ithread = 1,omp_nthreads do ilevel = 0,max_levels(ifld) i8_arr_lsum_level(ioffset+ilevel) = & i8_arr_lsum_level(ioffset+ilevel) & + i8_arr_tlsum_level(ilevel,ifld,ithread) - - if (verbose_reprosum) then -!! write(iulog,*) ' pid, ilevel, ioffset, i8_arr_lsum_level =', & -!! mypid, ilevel, i8_arr_lsum_level(ioffset+ilevel) - endif - enddo enddo enddo @@ -1322,22 +1248,6 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr) - if (verbose_reprosum .and. mypid < 5) then - do k = 1, size(i8_arr_lsum_level) - if (i8_arr_lsum_level(k) /= 0) then - write(iulog,*) 'pid, k, i8_arr_lsum_level =', mypid, k, i8_arr_lsum_level(k) - endif - enddo - endif - - if (verbose_reprosum .and. 
mypid == 0) then - do k = 1, size(i8_arr_gsum_level) - if (i8_arr_gsum_level(k) /= 0) then - write(iulog,*) 'pid, k, i8_arr_gsum_level =', mypid, k, i8_arr_gsum_level(k) - endif - enddo - endif - #ifdef TIMING call t_stopf("repro_sum_allr_i8") #endif @@ -1477,10 +1387,6 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8)) endif - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) ' ifld, arr_gsum =', ifld, arr_gsum(ifld) - endif - ! if validate is .true. and some precision lost, test whether 'too much' ! was lost, due to too loose an upper bound, too stringent a limit on number ! of levels of expansion, cancellation, .... Calculated by comparing lower @@ -1513,13 +1419,6 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & enddo - if (verbose_reprosum .and. mypid == 0) then - write(iulog,*) ' Done in cism_reprosum_int, pid =', mypid - k = 1 - write(iulog,*) ' arr_gsum, exp, frac =', arr_gsum(k), & - exponent(arr_gsum(k)), fraction(arr_gsum(k)) - endif - end subroutine cism_reprosum_int ! diff --git a/libglimmer/glimmer_utils.F90 b/libglimmer/glimmer_utils.F90 index c21b05c9..00a3859e 100644 --- a/libglimmer/glimmer_utils.F90 +++ b/libglimmer/glimmer_utils.F90 @@ -589,6 +589,91 @@ subroutine point_diag_real8_2d(& end subroutine point_diag_real8_2d +!-------------------------------------------------------------------------- + + subroutine double_to_binary(& + x, binary_str, binary_full, binary_sign, binary_exponent, binary_mantissa) + + ! Find the internal binary representation of a double-precision floating point number + ! Based on the IEEE-754 standard + + use glimmer_global, only: dp, i8 + implicit none + + real(dp), intent(in) :: x + character(len=64), intent(out) :: binary_str ! string representation of the binary number + + integer(i8), intent(out), optional :: binary_full ! 64 bits + integer, intent(out), optional :: binary_sign ! 
integer :: sign_bit
+ integer :: exponent_bits
+ integer(i8) :: mantissa_bits
+
+ logical :: verbose_binary = .false.
+
+ ! Transfer the double value into a 64-bit integer
+ binary_number = transfer(x, binary_number)
+
+ ! Get the sign bit (bit 64); use standard iand, not the vendor-extension integer .and.
+ sign_bit = int(iand(ishft(binary_number, -63), 1_i8))
+
+ ! Get the exponent bits (bits 63–53)
+ exponent_bits = int(iand(ishft(binary_number, -52), int(Z'7FF', i8)))
+
+ ! Extract mantissa (fraction) bits (bits 52–1); needs i8 to hold all 52 bits
+ mantissa_bits = iand(binary_number, int(Z'FFFFFFFFFFFFF', i8))
Turn a character array into a string + + character(len=*), intent(in) :: arr(:) + character(len=size(arr)) :: str + integer :: k + + do k = 1, size(arr) + str(k:k) = arr(k) + end do + + end function concat + !**************************************************************************** end module glimmer_utils diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index 6afb1d9f..bab339cd 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -348,6 +348,8 @@ module cism_parallel end interface interface parallel_is_zero + module procedure parallel_is_zero_integer_1d + module procedure parallel_is_zero_real8_1d module procedure parallel_is_zero_integer_2d module procedure parallel_is_zero_real8_2d module procedure parallel_is_zero_real8_3d @@ -703,9 +705,7 @@ subroutine gather_var_integer_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -724,9 +724,7 @@ subroutine gather_var_integer_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -811,9 +809,7 @@ subroutine gather_var_logical_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! 
minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -832,9 +828,7 @@ subroutine gather_var_logical_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -918,9 +912,7 @@ subroutine gather_var_real4_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -939,9 +931,7 @@ subroutine gather_var_real4_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -1026,9 +1016,7 @@ subroutine gather_var_real4_3d(values, global_values, parallel, ld1, ud1) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) if (present(ld1)) then d1l = ld1 else @@ -1062,9 +1050,7 @@ subroutine gather_var_real4_3d(values, global_values, parallel, ld1, ud1) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1,1)) ! This prevents a problem with NULL pointers later. 
allocate(displs(1)) allocate(recvcounts(1)) @@ -1151,9 +1137,7 @@ subroutine gather_var_real8_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -1172,9 +1156,7 @@ subroutine gather_var_real8_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -1211,7 +1193,7 @@ subroutine gather_var_real8_3d(values, global_values, parallel, ld1, ud1) use mpi_mod implicit none - real(dp),dimension(:,:,:),intent(in) :: values + real(dp),dimension(:,:,:),intent(in) :: values ! i and j are indices 2 and 3 real(dp),dimension(:,:,:),allocatable,intent(inout) :: global_values integer,optional,intent(in) :: ld1, ud1 type(parallel_type) :: parallel @@ -1234,7 +1216,7 @@ subroutine gather_var_real8_3d(values, global_values, parallel, ld1, ud1) global_maxval_nsub => parallel%global_maxval_nsub & ) - if (uhalo==0 .and. size(values,1)==local_ewn-1) then + if (uhalo==0 .and. size(values,2)==local_ewn-1) then ! 
Fixing this would require some generalization as is done for distributed_put_var write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" @@ -1260,9 +1242,7 @@ subroutine gather_var_real8_3d(values, global_values, parallel, ld1, ud1) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) if (present(ld1)) then d1l = ld1 else @@ -1296,9 +1276,7 @@ subroutine gather_var_real8_3d(values, global_values, parallel, ld1, ud1) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -6112,7 +6090,7 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d) integer, dimension(:,:,:), intent(in), optional :: mask_3d integer :: i, j, k - integer :: kmax + integer :: nz integer, dimension(size(a,1),parallel%local_ewn,parallel%local_nsn) :: mask integer :: local_sum integer :: parallel_global_sum_integer_3d @@ -6121,7 +6099,7 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d) local_ewn => parallel%local_ewn, & local_nsn => parallel%local_nsn) - kmax = size(a,1) + nz = size(a,1) if (present(mask_3d)) then mask = mask_3d @@ -6132,7 +6110,7 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d) local_sum = 0 do j = nhalo+1, local_nsn-nhalo do i = nhalo+1, local_ewn-nhalo - do k = 1, kmax + do k = 1, nz if (mask(k,i,j) == 1) then local_sum = local_sum + a(k,i,j) endif @@ -6160,9 +6138,23 @@ function parallel_global_sum_real8_2d(a, parallel, mask_2d) real(dp) :: local_sum real(dp) :: parallel_global_sum_real8_2d + ! variables for computing reproductible sums + integer :: nsummands, nflds ! 
dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + associate( & local_ewn => parallel%local_ewn, & - local_nsn => parallel%local_nsn) + local_nsn => parallel%local_nsn, & + own_ewn => parallel%own_ewn, & + own_nsn => parallel%own_nsn) + + !WHL - debug + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'In parallel_global_sum_real8_2d, reprosum =', parallel%reprosum +! write(iulog,*) 'nhalo, local ewn, local_nsn =', nhalo, local_ewn, local_nsn + endif if (present(mask_2d)) then mask = mask_2d @@ -6170,10 +6162,44 @@ function parallel_global_sum_real8_2d(a, parallel, mask_2d) mask = 1 endif - if (parallel%reprosum) then ! compute using cism_reprosum_calc + if (parallel%reprosum) then ! compute using parallel_reduce_reprosum - !TODO - Add the code here - call parallel_stop(__FILE__,__LINE__) + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = own_ewn*own_nsn + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + count = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + count = count + 1 + if (mask(i,j) == 1) then + arr(count,1) = a(i,j) + else + arr(count,1) = 0.0d0 + endif + enddo + enddo + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_real8_2d = arr_gsum(1) + + !WHL - debug + if (verbose_reprosum .and. main_task) then +!! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) else ! 
compute using parallel_reduce_sum (not reproducible) @@ -6207,16 +6233,24 @@ function parallel_global_sum_real8_3d(a, parallel, mask_2d) integer, dimension(:,:), intent(in), optional :: mask_2d integer :: i, j, k - integer :: kmax + integer :: nz integer, dimension(parallel%local_ewn,parallel%local_nsn) :: mask real(dp) :: local_sum real(dp) :: parallel_global_sum_real8_3d + ! variables for computing reproductible sums + integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + associate( & local_ewn => parallel%local_ewn, & - local_nsn => parallel%local_nsn) + local_nsn => parallel%local_nsn, & + own_ewn => parallel%own_ewn, & + own_nsn => parallel%own_nsn) - kmax = size(a,1) + nz = size(a,1) ! Note: The mask is 2D, since typically all layers in a column are either masked in or masked out if (present(mask_2d)) then @@ -6227,8 +6261,47 @@ function parallel_global_sum_real8_3d(a, parallel, mask_2d) if (parallel%reprosum) then ! compute using cism_reprosum_calc - !TODO - Add the code here - call parallel_stop(__FILE__,__LINE__) + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = own_ewn*own_nsn*nz + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + count = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + do k = 1, nz + count = count + 1 + arr(count,1) = a(k,i,j) + enddo + else + do k = 1, nz + count = count + 1 + arr(count,1) = 0.0d0 + enddo + endif + enddo + enddo + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_real8_3d = arr_gsum(1) + + !WHL - debug + if (verbose_reprosum .and. 
main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) else ! compute using parallel_reduce_sum (not reproducible) @@ -6236,7 +6309,7 @@ function parallel_global_sum_real8_3d(a, parallel, mask_2d) do j = nhalo+1, local_nsn-nhalo do i = nhalo+1, local_ewn-nhalo if (mask(i,j) == 1) then - do k = 1, kmax + do k = 1, nz local_sum = local_sum + a(k,i,j) enddo endif @@ -6311,22 +6384,73 @@ function parallel_global_sum_patch_real8_2d(a, npatch, patch_id, parallel) real(dp), dimension(npatch) :: local_patch_sum real(dp), dimension(npatch) :: parallel_global_sum_patch_real8_2d + ! variables for computing reproductible sums + integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + associate( & local_ewn => parallel%local_ewn, & - local_nsn => parallel%local_nsn) + local_nsn => parallel%local_nsn, & + own_ewn => parallel%own_ewn, & + own_nsn => parallel%own_nsn) - local_patch_sum = 0.0d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc - do j = nhalo+1, local_nsn-nhalo - do i = nhalo+1, local_ewn-nhalo - np = patch_id(i,j) - if (np > 0) then - local_patch_sum(np) = local_patch_sum(np) + a(i,j) - endif + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = own_ewn*own_nsn + nflds = npatch + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + count = 0 + arr(:,:) = 0.0d0 + + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + count = count + 1 + np = patch_id(i,j) + if (np > 0) then + arr(count,np) = a(i,j) + endif + enddo enddo - enddo - parallel_global_sum_patch_real8_2d = parallel_reduce_sum(local_patch_sum) + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! 
Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_patch_real8_2d(:) = arr_gsum(:) + + if (verbose_reprosum .and. main_task) then +!! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_patch_sum = 0.0d0 + + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + np = patch_id(i,j) + if (np > 0) then + local_patch_sum(np) = local_patch_sum(np) + a(i,j) + endif + enddo + enddo + + parallel_global_sum_patch_real8_2d = parallel_reduce_sum(local_patch_sum) + + endif ! reprosum end associate @@ -6352,6 +6476,13 @@ function parallel_global_sum_stagger_real8_2d(arr1, parallel, arr2) staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + ! variables for computing reproductible sums + integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + + !TODO - associate staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo @@ -6359,8 +6490,54 @@ function parallel_global_sum_stagger_real8_2d(arr1, parallel, arr2) if (parallel%reprosum) then ! compute using cism_reprosum_calc - !TODO - Add the code here - call parallel_stop(__FILE__,__LINE__) + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + if (present(arr2)) then ! compute global sum of arr1 + arr2 + + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,1) = arr1(i,j) + arr2(i,j) + enddo + enddo + + else ! 
compute global sum of arr1 + + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,1) = arr1(i,j) + enddo + enddo + + endif + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_2d = arr_gsum(1) + + !WHL - debug + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) else ! compute using parallel_reduce_sum (not reproducible) @@ -6405,10 +6582,17 @@ function parallel_global_sum_stagger_real8_3d(arr1, parallel, arr2) real(dp) :: local_sum real(dp) :: parallel_global_sum_stagger_real8_3d + ! variables for computing reproductible sums + integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + !TODO - Associate these variables (and not the ones above) staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo @@ -6418,8 +6602,58 @@ function parallel_global_sum_stagger_real8_3d(arr1, parallel, arr2) if (parallel%reprosum) then ! compute using cism_reprosum_calc - !TODO - Add the code here - call parallel_stop(__FILE__,__LINE__) + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) * nz + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + if (present(arr2)) then ! 
compute global sum of arr1 + arr2 + + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + count = count + 1 + arr(count,1) = arr1(k,i,j) + arr2(k,i,j) + enddo + enddo + enddo + + else ! compute global sum of arr1 + + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + count = count + 1 + arr(count,1) = arr1(k,i,j) + enddo + enddo + enddo + + endif + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_3d = arr_gsum(1) + + !WHL - debug + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) else ! compute using parallel_reduce_sum (not reproducible) @@ -6459,6 +6693,7 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) ! Sum one or two local arrays on the staggered grid, then take the global sum. ! The final index is equal to the number of independent fields to be summed. + !TODO - Don't have to pass in nflds, since it equals size(a,3)? real(dp), dimension(:,:,:), intent(in) :: arr1 @@ -6479,6 +6714,13 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + ! variables for computing reproductible sums +!! integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: nsummands ! 
dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo @@ -6486,8 +6728,54 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) if (parallel%reprosum) then ! compute using cism_reprosum_calc - !TODO - Add the code here - call parallel_stop(__FILE__,__LINE__) + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) +! nflds = size(a,3) + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + do n = 1, nflds + + if (present(arr2)) then ! compute global sum of arr1 + arr2 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,n) = arr1(i,j,n) + arr2(i,j,n) + enddo + enddo + else ! compute global sum of arr1 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,n) = arr1(i,j,n) + enddo + enddo + endif + + enddo ! nflds + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_2d_nflds = arr_gsum(:) + + !WHL - debug + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) else ! compute using parallel_reduce_sum (not reproducible) @@ -6513,10 +6801,10 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) enddo ! nflds - endif ! reprosum + ! 
take the global sum + parallel_global_sum_stagger_real8_2d_nflds = parallel_reduce_sum(local_sum(:)) - ! take the global sum - parallel_global_sum_stagger_real8_2d_nflds = parallel_reduce_sum(local_sum(:)) + endif ! reprosum end function parallel_global_sum_stagger_real8_2d_nflds @@ -6547,6 +6835,14 @@ function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + ! variables for computing reproductible sums +!! integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: nsummands ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + + !TODO - Associate these variables staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo @@ -6556,8 +6852,63 @@ function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) if (parallel%reprosum) then ! compute using cism_reprosum_calc - !TODO - Add the code here - call parallel_stop(__FILE__,__LINE__) + !WHL - debug + if (verbose_reprosum .and. main_task) then + write(iulog,*) ' In global_sum_stagger_real8_3d_nflds' + endif + + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) * nz +! nflds = size(a,3) + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + do n = 1, nflds + + if (present(arr2)) then ! compute global sum of arr1 + arr2 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + count = count + 1 + arr(count,n) = arr1(k,i,j,n) + arr2(k,i,j,n) + enddo + enddo + enddo + else ! 
compute global sum of arr1 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + count = count + 1 + arr(count,n) = arr1(k,i,j,n) + enddo + enddo + enddo + endif + + enddo ! nflds + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_3d_nflds = arr_gsum(:) + + !WHL - debug + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) else ! compute using parallel_reduce_sum (not reproducible) @@ -6597,6 +6948,48 @@ end function parallel_global_sum_stagger_real8_3d_nflds !======================================================================= ! functions belonging to the parallel_is_zero interface + function parallel_is_zero_integer_1d(a) + + ! returns .true. if the field has all zero values, else returns .false. + + integer, dimension(:), intent(in) :: a + logical :: parallel_is_zero_integer_1d + + integer :: maxval_a + + maxval_a = maxval(abs(a)) + maxval_a = parallel_reduce_max(maxval_a) + if (maxval_a > 0) then + parallel_is_zero_integer_1d = .false. + else + parallel_is_zero_integer_1d = .true. + endif + + end function parallel_is_zero_integer_1d + +!======================================================================= + + function parallel_is_zero_real8_1d(a) + + ! returns .true. if the field has all zero values, else returns .false. + + real(dp), dimension(:), intent(in) :: a + logical :: parallel_is_zero_real8_1d + + real(dp) :: maxval_a + + maxval_a = maxval(abs(a)) + maxval_a = parallel_reduce_max(maxval_a) + if (maxval_a > 0.0d0) then + parallel_is_zero_real8_1d = .false. + else + parallel_is_zero_real8_1d = .true. 
+ endif + + end function parallel_is_zero_real8_1d + +!======================================================================= + function parallel_is_zero_integer_2d(a) ! returns .true. if the field has all zero values, else returns .false. @@ -6604,7 +6997,7 @@ function parallel_is_zero_integer_2d(a) integer, dimension(:,:), intent(in) :: a logical :: parallel_is_zero_integer_2d - real(dp) :: maxval_a + integer :: maxval_a maxval_a = maxval(abs(a)) maxval_a = parallel_reduce_max(maxval_a) @@ -8905,7 +9298,6 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) ! Compute a reproducible global sum for a floating-point variable or array. ! Can be called from parallel_global_sum, parallel_global_sum_patch, or ! parallel_global_sum_stagger. - ! Still under construction implicit none @@ -8965,21 +9357,11 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) repro_sum_validate ! flag enabling/disabling testing that gmax and max_levels ! are accurate/sufficient. Default is enabled. - ! Set parameters and allocate arrays dsummands = size(arr,1) nflds = size(arr,2) nsummands = dsummands - !WHL - debug - if (verbose_reprosum .and. main_task) then - write(iulog,*) 'In parallel_reduce_reprosum' - write(iulog,*) 'dsummands, nflds:', dsummands, nflds - endif - if (verbose_reprosum) then -!! write(iulog,*) 'rank, arr:', this_rank, arr(:,:) - endif - allocate (arr_gbl_max(nflds)) allocate (arr_gbl_max_out(nflds)) allocate (arr_max_levels(nflds)) @@ -8990,12 +9372,6 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) ddpdd_sum = .false. repro_sum_validate = .true. -#ifdef CCSMCOUPLED - -!! call shr_reprosum_calc - -#else - ! The following subroutine is adapted from shr_reprosum_calc in CESM shared code. call cism_reprosum_calc(& @@ -9013,15 +9389,13 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) repro_sum_stats = repro_sum_stats, & rel_diff = rel_diff) -#endif - if (verbose_reprosum .and. 
main_task) then - write(iulog,*) 'arr_gbl_max_out =', arr_gbl_max_out - write(iulog,*) 'arr_max_levels_out =', arr_max_levels_out - write(iulog,*) 'gbl_max_nsummands_out =', gbl_max_nsummands_out - write(iulog,*) 'rel diff =', rel_diff(1,:) - write(iulog,*) 'abs diff =', rel_diff(2,:) - write(iulog,*) 'stats =', repro_sum_stats(:) +! write(iulog,*) 'arr_gbl_max_out =', arr_gbl_max_out +! write(iulog,*) 'arr_max_levels_out =', arr_max_levels_out +! write(iulog,*) 'gbl_max_nsummands_out =', gbl_max_nsummands_out +! write(iulog,*) 'rel diff =', rel_diff(1,:) +! write(iulog,*) 'abs diff =', rel_diff(2,:) +! write(iulog,*) 'stats =', repro_sum_stats(:) endif deallocate(arr_gbl_max, arr_gbl_max_out) diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index 72bab6f4..6c020822 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -85,7 +85,7 @@ module glissade !======================================================================= ! Note: There is no glissade_config subroutine; glide_config works for all dycores. - +! glide_config is called from cism_init_dycore before glissade_initialise. !======================================================================= subroutine glissade_initialise(model, evolve_ice) @@ -366,6 +366,32 @@ subroutine glissade_initialise(model, evolve_ice) itest = model%numerics%idiag_local jtest = model%numerics%jdiag_local + ! Check whether x0 and y0 were read in. If not, then compute them from x1 and y1. + if (parallel_is_zero(model%general%x1)) then + if (main_task) write(iulog,*) 'Warning: model%general%x1 = 0' + else +! if (main_task) write(iulog,*) 'x1_global:', model%general%x1_global(:) +! if (main_task) write(iulog,*) 'y1_global:', model%general%y1_global(:) +! if (main_task) write(iulog,*) 'x1:', model%general%x1(:) +! 
if (main_task) write(iulog,*) 'y1:', model%general%y1(:) + endif + + if (parallel_is_zero(model%general%x0)) then + if (main_task) write(iulog,*) 'Initialize x0' + do i = 1, model%general%ewn-1 + model%general%x0(i) = 0.5d0 * (model%general%x1(i) + model%general%x1(i+1)) + enddo + endif + if (main_task) write(iulog,*) 'x0:', model%general%x0(:) + + if (parallel_is_zero(model%general%y0)) then + if (main_task) write(iulog,*) 'Initialize y0' + do j = 1, model%general%nsn-1 + model%general%y0(j) = 0.5d0 * (model%general%y1(j) + model%general%y1(j+1)) + enddo + endif + if (main_task) write(iulog,*) 'y0:', model%general%y0(:) + ! Check that lat and lon fields were read in, if desired !TODO - Use the parallel_is_nonzero function instead, here and below if (model%options%read_lat_lon) then diff --git a/libglissade/glissade_basal_water.F90 b/libglissade/glissade_basal_water.F90 index 18b03292..593b33b2 100644 --- a/libglissade/glissade_basal_water.F90 +++ b/libglissade/glissade_basal_water.F90 @@ -34,12 +34,16 @@ module glissade_basal_water use glide_types use cism_parallel, only: main_task, this_rank, nhalo, parallel_type, parallel_halo + !WHL - debug + use glimmer_utils, only: double_to_binary + implicit none private public :: glissade_basal_water_init, glissade_calcbwat, glissade_bwat_flux_routing logical, parameter :: verbose_bwat = .false. +!! logical, parameter :: verbose_bwat = .true. contains @@ -535,6 +539,9 @@ subroutine route_basal_water(& character(len=100) :: message + !WHL - debug + character(len=64) :: binary_str + ! Allocate the sorted_ij array nlocal = parallel%own_ewn * parallel%own_nsn @@ -842,8 +849,14 @@ subroutine route_basal_water(& if (verbose_bwat .and. 
this_rank == rtest) then write(iulog,*) 'Total bwatflx at margin (m^3/s):', total_flux_margin + call double_to_binary(total_flux_margin, binary_str) + write(iulog,*) ' ', binary_str write(iulog,*) 'Total bwatflx_refreeze (m^3/s)=', total_flux_refreeze + call double_to_binary(total_flux_refreeze, binary_str) + write(iulog,*) ' ', binary_str write(iulog,*) 'Total bwatflx (m^3/s)=', total_flux_out + call double_to_binary(total_flux_out, binary_str) + write(iulog,*) ' ', binary_str write(iulog,*) 'Difference between output and input =', total_flux_out - total_flux_in endif diff --git a/libglissade/glissade_utils.F90 b/libglissade/glissade_utils.F90 index eb5b629f..0a959f1e 100644 --- a/libglissade/glissade_utils.F90 +++ b/libglissade/glissade_utils.F90 @@ -44,7 +44,12 @@ module glissade_utils glissade_basin_sum, glissade_basin_average, & glissade_usrf_to_thck, glissade_thck_to_usrf, & glissade_edge_fluxes, glissade_input_fluxes, & - glissade_rms_error + glissade_rms_error, write_array_to_file + + interface write_array_to_file + module procedure write_array_to_file_real8_2d + module procedure write_array_to_file_real8_3d + end interface contains @@ -945,10 +950,116 @@ subroutine glissade_input_fluxes(& end subroutine glissade_input_fluxes + + ! subroutines belonging to the write_array_to_file interface + subroutine write_array_to_file_real8_2d(arr, fileunit, filename, parallel) + + ! Copy the input array into a global array and write all values to an output file. + ! This can be useful for debugging, if we want to find differences between two fields + ! (e.g., in two different runs). + ! This version writes out 64-bit character strings corresponding to the binary representation + ! of each floating-point variable. This can be useful for BFB comparisons. + ! Sometimes, two floating-point variables appear to have the same values in base 10, + ! when the last few bits actually vary. 
+ !TODO - Allow either float or binary output + + use glimmer_utils, only: double_to_binary + use cism_parallel, only: gather_var + + real(dp), dimension(:,:), intent(in) :: arr + integer, intent(in) :: fileunit + character(len=*), intent(in) :: filename + type(parallel_type), intent(in) :: parallel + + integer :: i, j + character(len=64) :: binary_str + real(dp), dimension(:,:), allocatable :: arr_global + + call gather_var(arr, arr_global, parallel) + if (main_task) then + open(unit=fileunit, file=trim(filename), status='replace', position='append') + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + call double_to_binary(arr_global(i,j), binary_str) + write (fileunit, '(2i6,a4,a64)') i, j, ' ', binary_str + enddo + enddo + close(unit=fileunit) + deallocate(arr_global) + endif + + end subroutine write_array_to_file_real8_2d + + + subroutine write_array_to_file_real8_3d(arr, fileunit, filename, parallel, cycle_indices) + + ! Copy the input array into a global array and write all values to an output file. + ! This can be useful for debugging, if we want to find differences between two fields + ! (e.g., in two different runs). + ! This version writes out 64-bit character strings corresponding to the binary representation + ! of each floating-point variable. This can be useful for BFB comparisons. + ! Sometimes, two floating-point variables appear to have the same values in base 10, + ! when the last few bits actually vary. + !TODO - Allow either float or binary output + + use glimmer_utils, only: double_to_binary + use cism_parallel, only: gather_var + + real(dp), dimension(:,:,:), intent(in) :: arr ! first two indices are i and j + integer, intent(in) :: fileunit + character(len=*), intent(in) :: filename + type(parallel_type), intent(in) :: parallel + logical, intent(in), optional :: cycle_indices ! 
if true, then index 3->1, 1->2, 2->3 + + integer :: i, j, k, kmax + character(len=64) :: binary_str + real(dp), dimension(:,:,:), allocatable :: arr_global + real(dp), dimension(:,:,:), allocatable :: arr_cycle + logical :: cycle_ind + + if (present(cycle_indices)) then + cycle_ind = cycle_indices + else + cycle_ind = .false. + endif + + if (cycle_ind) then + allocate(arr_cycle(size(arr,3), size(arr,1), size(arr,2))) + kmax = size(arr,3) + do j = 1, size(arr,2) + do i = 1, size(arr,1) + do k = 1, kmax + arr_cycle(k,i,j) = arr(i,j,k) + enddo + enddo + enddo + call gather_var(arr_cycle, arr_global, parallel) + deallocate(arr_cycle) + else + kmax = size(arr,1) + call gather_var(arr, arr_global, parallel) + endif + + if (main_task) then + open(unit=fileunit, file=trim(filename), status='unknown') + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + do k = 1, kmax + call double_to_binary(arr_global(k,i,j), binary_str) + write (fileunit, '(3i6,a4,a64)') i, j, k, ' ', binary_str + enddo + enddo + enddo + close(unit=fileunit) + deallocate(arr_global) + endif + + end subroutine write_array_to_file_real8_3d + !**************************************************************************** !TODO - Other utility subroutines to add here? -! E.g., tridiag; calclsrf; subroutines to zero out tracers +! E.g., calclsrf; subroutines to zero out tracers !**************************************************************************** diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 6ea89c0a..3b3ec31b 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -56,11 +56,11 @@ module glissade_velo_higher - use glimmer_global, only: dp + use glimmer_global, only: dp, i8 use glimmer_physcon, only: n_glen, rhoi, rhoo, grav, scyr, pi - use glimmer_paramets, only: iulog, eps08, eps10, eps11 + use glimmer_paramets, only: iulog, eps11, eps10 use glimmer_paramets, only: velo_scale, len_scale ! 
used for whichefvs = HO_EFVS_FLOWFACT - use glimmer_utils, only: point_diag + use glimmer_utils, only: point_diag, double_to_binary use glimmer_log use glimmer_sparse_type use glimmer_sparse @@ -92,7 +92,9 @@ module glissade_velo_higher use cism_parallel, only: this_rank, main_task, nhalo, tasks, & parallel_type, parallel_halo, staggered_parallel_halo, parallel_globalindex, & parallel_reduce_max, parallel_reduce_sum, not_parallel - + !WHL - debug + use cism_parallel, only: parallel_global_sum_stagger + use cism_reprosum_mod, only: verbose_reprosum implicit none private @@ -225,8 +227,8 @@ module glissade_velo_higher ! logical :: verbose_bfric = .true. logical :: verbose_trilinos = .false. ! logical :: verbose_trilinos = .true. - logical :: verbose_beta = .false. -! logical :: verbose_beta = .true. +! logical :: verbose_beta = .false. + logical :: verbose_beta = .true. logical :: verbose_efvs = .false. ! logical :: verbose_efvs = .true. logical :: verbose_tau = .false. @@ -261,6 +263,10 @@ module glissade_velo_higher dphi_dyr_3d_vav, &! vertical avg of dphi_dyr_3d dphi_dzr_3d_vav ! vertical avg of dphi_dzr_3d + !WHL - debug for reprosum + character(len=64) :: binary_str ! string representation of binary number (chain of 0's and 1's) + character(len=64) :: binary_str1, binary_str2 + contains !**************************************************************************** @@ -671,6 +677,9 @@ subroutine glissade_velo_higher_solve(model, & use glide_thck, only: glide_calclsrf use profile, only: t_startf, t_stopf + !WHL - debug + use glissade_utils, only: write_array_to_file + !---------------------------------------------------------------- ! Input-output arguments !---------------------------------------------------------------- @@ -1068,6 +1077,15 @@ subroutine glissade_velo_higher_solve(model, & integer :: itest, jtest ! coordinates of diagnostic point integer :: rtest ! 
task number for processor containing diagnostic point + !WHL - debug + real(dp), dimension(nNodeNeighbors_2d) :: & + sum_Auu, sum_Auv, sum_Avu, sum_Avv + real(dp) :: sum_uvel, sum_vvel + real(dp) :: sum_bu, sum_bv + real(dp) :: sum_flwa, sum_flwafact, sum_btrx, sum_btry, sum_stagusrf, sum_stagthck + real(dp) :: sum_betax, sum_betay, sum_omega, sum_stag_omega + real(dp), dimension(:,:), allocatable :: arr_global ! temporary global array + call t_startf('glissade_vhs_init') rtest = -999 itest = 1 @@ -1587,19 +1605,21 @@ subroutine glissade_velo_higher_solve(model, & ! unique local ID to each such node. !------------------------------------------------------------------------------ -!pw call t_startf('glissade_get_vertex_geom') - call get_vertex_geometry(nx, ny, & - nz, nhalo, & - parallel, & - dx, dy, & - itest, jtest, rtest, & - ice_mask, & - xVertex, yVertex, & - active_cell, active_vertex, & - nNodesSolve, nVerticesSolve, & - nodeID, vertexID, & - iNodeIndex, jNodeIndex, kNodeIndex, & - iVertexIndex, jVertexIndex) + !pw call t_startf('glissade_get_vertex_geom') + call get_vertex_geometry(& + nx, ny, & + nz, nhalo, & + parallel, & + model%general%x0, model%general%y0, & + dx, dy, & + itest, jtest, rtest, & + ice_mask, & + xVertex, yVertex, & + active_cell, active_vertex, & + nNodesSolve, nVerticesSolve, & + nodeID, vertexID, & + iNodeIndex, jNodeIndex, kNodeIndex, & + iVertexIndex, jVertexIndex) !pw call t_stopf('glissade_get_vertex_geom') ! Zero out the velocity for inactive vertices @@ -2302,6 +2322,10 @@ subroutine glissade_velo_higher_solve(model, & whichbeta_limit, & itest = itest, jtest = jtest, rtest = rtest) + if (verbose_basal) then + call point_diag(beta_internal, 'After calcbeta, beta', itest, jtest, rtest, 7, 7, '(f12.0)') + endif + ! if (verbose_beta) then ! maxbeta = maxval(beta_internal(:,:)) ! 
maxbeta = parallel_reduce_max(maxbeta) @@ -2329,6 +2353,36 @@ subroutine glissade_velo_higher_solve(model, & usav_2d(:,:) = uvel_2d(:,:) vsav_2d(:,:) = vvel_2d(:,:) + !WHL - debug - BFB check + sum_uvel = parallel_global_sum_stagger(uvel_2d, parallel) + sum_vvel = parallel_global_sum_stagger(vvel_2d, parallel) + sum_flwa = parallel_global_sum_stagger(flwa, parallel) + sum_flwafact = parallel_global_sum_stagger(flwafact, parallel) + sum_btrx = parallel_global_sum_stagger(btractx, parallel) + sum_btry = parallel_global_sum_stagger(btracty, parallel) + sum_stagusrf = parallel_global_sum_stagger(stagusrf, parallel) + sum_stagthck = parallel_global_sum_stagger(stagthck, parallel) +!! if (this_rank == rtest) then + if (0 == 1) then + write(iulog,*) ' ' + call double_to_binary(sum_uvel, binary_str) + write(iulog,*) 'Before assembly: sum_uvel, binary_str:', sum_uvel, binary_str + call double_to_binary(sum_vvel, binary_str) + write(iulog,*) 'Before assembly: sum_vvel, binary_str:', sum_vvel, binary_str + call double_to_binary(sum_flwa, binary_str) + write(iulog,*) 'Before assembly: sum_flwa, binary_str:', sum_flwa, binary_str + call double_to_binary(sum_flwafact, binary_str) + write(iulog,*) 'Before assembly: sum_flwafact, binary_str:', sum_flwafact, binary_str + call double_to_binary(sum_btrx, binary_str) + write(iulog,*) 'Before assembly: sum_btrx, binary_str:', sum_btrx, binary_str + call double_to_binary(sum_btry, binary_str) + write(iulog,*) 'Before assembly: sum_btry, binary_str:', sum_btry, binary_str + call double_to_binary(sum_stagusrf, binary_str) + write(iulog,*) 'Before assembly: sum_stagusrf, binary_str:', sum_stagusrf, binary_str + call double_to_binary(sum_stagthck, binary_str) + write(iulog,*) 'Before assembly: sum_stagthck, binary_str:', sum_stagthck, binary_str + endif + ! 
Assemble the matrix call assemble_stiffness_matrix_2d(nx, ny, & @@ -2353,6 +2407,53 @@ subroutine glissade_velo_higher_solve(model, & omega_k, omega, & efvs_qp_3d) + !WHL - debug - BFB check + if (0 == 1) then +!! if (verbose_reprosum .and. counter == 1) then + if (main_task) write(iulog,*) 'Write out matrices after assemble_stiffness_matrix' + call write_array_to_file(Auu_2d, 21, 'global_Auu1', parallel, cycle_indices = .true.) + call write_array_to_file(Auv_2d, 22, 'global_Auv1', parallel, cycle_indices = .true.) + call write_array_to_file(Avu_2d, 23, 'global_Avu1', parallel, cycle_indices = .true.) + call write_array_to_file(Avv_2d, 24, 'global_Avv1', parallel, cycle_indices = .true.) + call write_array_to_file(bu_2d, 25, 'global_bu1', parallel) + call write_array_to_file(bv_2d, 26, 'global_bv1', parallel) + endif + + !WHL - debug - BFB check +!! if (0 == 1) then + if (verbose_reprosum) then + sum_Auu(:) = parallel_global_sum_stagger(Auu_2d, nNodeNeighbors_2d, parallel) + sum_Auv(:) = parallel_global_sum_stagger(Auv_2d, nNodeNeighbors_2d, parallel) + sum_Avu(:) = parallel_global_sum_stagger(Avu_2d, nNodeNeighbors_2d, parallel) + sum_Avv(:) = parallel_global_sum_stagger(Avv_2d, nNodeNeighbors_2d, parallel) + if (main_task) then + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auu(n), binary_str) + write(iulog,*) n, sum_Auu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auv(n), binary_str) + write(iulog,*) n, sum_Auv(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avu(n), binary_str) + write(iulog,*) n, sum_Avu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avv(n), binary_str:' + do n = 1, nNodeNeighbors_2d 
+ call double_to_binary(sum_Avv(n), binary_str) + write(iulog,*) n, sum_Avv(n), binary_str + enddo + endif + endif ! verbose_reprosum + if (whichapprox == HO_APPROX_DIVA) then ! Halo update for omega @@ -2411,8 +2512,8 @@ subroutine glissade_velo_higher_solve(model, & beta_eff_x(:,:) = 0.d0 beta_eff_y(:,:) = 0.d0 - !Note: The 'if' is not strictly needed, since the corrected beta_eff is equal - ! to the uncorrected beta_eff whe slope_factor = 1.0 and theta_slope = 0.0 + !Note: The 'if diva_slope_correction' is not strictly needed, since the more complicated + ! equations reduce to the simpler ones when slope_factor = 1.0 and theta_slope = 0.0. if (diva_slope_correction) then ! compute a larger beta_eff at each vertex based on the slope if (whichbabc == HO_BABC_NO_SLIP) then @@ -2435,8 +2536,8 @@ subroutine glissade_velo_higher_solve(model, & beta_eff_y = 1.d0 / stag_omega endwhere else ! slip allowed at bed - beta_eff_x = beta_internal(:,:) / (1.d0 + beta_internal*stag_omega) - beta_eff_y = beta_internal(:,:) / (1.d0 + beta_internal*stag_omega) + beta_eff_x = beta_internal / (1.d0 + beta_internal*stag_omega) + beta_eff_y = beta_internal / (1.d0 + beta_internal*stag_omega) endif endif @@ -2454,6 +2555,36 @@ subroutine glissade_velo_higher_solve(model, & call point_diag(beta_eff_y, 'beta_eff_y', itest, jtest, rtest, 7, 7, '(e10.3)') endif + if (verbose_reprosum) then + sum_omega = parallel_global_sum_stagger(omega, parallel) + sum_stag_omega = parallel_global_sum_stagger(stag_omega, parallel) + sum_betax = parallel_global_sum_stagger(beta_eff_x, parallel) + sum_betay = parallel_global_sum_stagger(beta_eff_y, parallel) + if (main_task) then + call double_to_binary(sum_omega, binary_str) + write(iulog,*) 'Before bc_2d: sum_omega, binary_str:', sum_omega, binary_str + call double_to_binary(sum_stag_omega, binary_str) + write(iulog,*) ' sum_stag_omega, binary_str:', sum_stag_omega, binary_str + call double_to_binary(sum_betax, binary_str) + write(iulog,*) ' sum_betax, 
binary_str:', sum_betax, binary_str + call double_to_binary(sum_betay, binary_str) + write(iulog,*) ' sum_betay, binary_str:', sum_betay, binary_str + endif + if (this_rank == rtest) then + do j = jtest-2, jtest+2 + do i = itest-2, itest+2 + if (i >= staggered_ilo .and. i <= staggered_ihi .and. & + j >= staggered_jlo .and. j <= staggered_jhi) then + call double_to_binary(stag_omega(i,j), binary_str) + write(iulog,*) 'i, j, stag_omega:', i, j, binary_str + call double_to_binary(beta_eff_x(i,j), binary_str) + write(iulog,*) ' beta_eff_x:', i, j, binary_str + endif + enddo + enddo + endif + endif ! verbose_reprosum + if (diva_slope_correction) then ! Incorporate basal sliding boundary conditions with basal curvature, @@ -2568,6 +2699,18 @@ subroutine glissade_velo_higher_solve(model, & call staggered_parallel_halo(bv_2d(:,:), parallel) call t_stopf('glissade_halo_bxxs') + !WHL - debug - Write all the matrix elements and rhs elements (in binary form) to files +!! if (verbose_reprosum .and. counter == 1) then + if (0 == 1) then + if (main_task) write(iulog,*) 'Write out matrices after adding BC' + call write_array_to_file(Auu_2d, 21, 'global_Auu3', parallel, cycle_indices = .true.) + call write_array_to_file(Auv_2d, 22, 'global_Auv3', parallel, cycle_indices = .true.) + call write_array_to_file(Avu_2d, 23, 'global_Avu3', parallel, cycle_indices = .true.) + call write_array_to_file(Avv_2d, 24, 'global_Avv3', parallel, cycle_indices = .true.) + call write_array_to_file(bu_2d, 25, 'global_bu3', parallel) + call write_array_to_file(bv_2d, 26, 'global_bv3', parallel) + endif + !--------------------------------------------------------------------------- ! Check symmetry of assembled matrix ! 
@@ -2597,6 +2740,41 @@ subroutine glissade_velo_higher_solve(model, & active_vertex, & nNonzeros) + if (verbose_reprosum) then + sum_Auu(:) = parallel_global_sum_stagger(Auu_2d, nNodeNeighbors_2d, parallel) + sum_Auv(:) = parallel_global_sum_stagger(Auv_2d, nNodeNeighbors_2d, parallel) + sum_Avu(:) = parallel_global_sum_stagger(Avu_2d, nNodeNeighbors_2d, parallel) + sum_Avv(:) = parallel_global_sum_stagger(Avv_2d, nNodeNeighbors_2d, parallel) + sum_bu = parallel_global_sum_stagger(bu_2d, parallel) + sum_bv = parallel_global_sum_stagger(bv_2d, parallel) + if (main_task) then + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auu(n), binary_str) + write(iulog,*) n, sum_Auu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auv(n), binary_str) + write(iulog,*) n, sum_Auv(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avu(n), binary_str) + write(iulog,*) n, sum_Avu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avv(n), binary_str) + write(iulog,*) n, sum_Avv(n), binary_str + enddo + endif + endif ! verbose_reprosum + if (write_matrix) then if (counter == 1) then ! first outer iteration only call t_startf('glissade_wrt_mat') @@ -2610,7 +2788,7 @@ subroutine glissade_velo_higher_solve(model, & endif endif ! write_matrix - if (verbose_matrix .and. this_rank==rtest) then + if (verbose_matrix .and. 
main_task) then i = itest j = jtest write(iulog,*) ' ' @@ -3886,6 +4064,7 @@ end subroutine glissade_velo_higher_scale_output subroutine get_vertex_geometry(nx, ny, & nz, nhalo, & parallel, & + x0, y0, & dx, dy, & itest, jtest, rtest, & ice_mask, & @@ -3916,11 +4095,17 @@ subroutine get_vertex_geometry(nx, ny, & nhalo ! number of halo layers type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + parallel ! info for parallel communication + + real(dp), dimension(nx-1), intent(in) :: & + x0 ! x coordinates of vertices + + real(dp), dimension(ny-1), intent(in) :: & + y0 ! y coordinates of vertices real(dp), intent(in) :: & - dx, dy ! grid cell length and width (m) - ! assumed to have the same value for each grid cell + dx, dy ! grid cell length and width (m) + ! assumed to have the same value for each grid cell integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point @@ -3976,13 +4161,11 @@ subroutine get_vertex_geometry(nx, ny, & ! By convention, vertex (i,j) lies at the NE corner of cell(i,j). !---------------------------------------------------------------- - xVertex(:,:) = 0.d0 - yVertex(:,:) = 0.d0 do j = 1, ny-1 - do i = 1, nx-1 - xVertex(i,j) = dx * i - yVertex(i,j) = dy * j - enddo + do i = 1, nx-1 + xVertex(i,j) = x0(i) + yVertex(i,j) = y0(j) + enddo enddo ! Identify the active cells. @@ -4254,7 +4437,8 @@ subroutine load_vector_gravity(nx, ny, & loadv(kNode,iNode,jNode) = loadv(kNode,iNode,jNode) - & rhoi*grav * wqp_3d(p) * detJ/vol0 * dsdy_qp * phi_3d(n,p) - if (verbose_load .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest .and. p==ptest) then + if (verbose_load .and. this_rank==rtest .and. & + i==itest .and. j==jtest .and. k==ktest .and. p==ptest) then write(iulog,*) ' ' write(iulog,*) 'n, phi_3d(n), delta(loadu), delta(loadv):', n, phi_3d(n,p), & rhoi*grav*wqp_3d(p)*detJ/vol0 * dsdx_qp * phi_3d(n,p), & @@ -5122,7 +5306,7 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & thck ! 
ice thickness (m) real(dp), dimension(nx-1,ny-1), intent(in), optional :: & - btractx, btracty ! components of basal traction (Pa) + btractx, btracty ! components of basal traction (Pa) real(dp), dimension(nz,nx,ny), intent(out), optional :: & omega_k ! single integral, defined by Goldberg (2011) eq. 32 @@ -5881,6 +6065,11 @@ subroutine compute_3d_velocity_L1L2(nx, ny, & uvel(1:nz-1,:,:) = 0.d0 vvel(1:nz-1,:,:) = 0.d0 + !WHL - debug + call point_diag(xVertex, 'xVertex', itest, jtest, rtest, 7, 7) + call point_diag(yVertex, 'yVertex', itest, jtest, rtest, 7, 7) + call point_diag(active_cell, 'active_cell', itest, jtest, rtest, 7, 7) + ! Compute viscosity integral and strain rates in elements. ! Loop over all cells that border locally owned vertices. @@ -6280,9 +6469,9 @@ subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, prod = matmul(Jac,Jinv) do col = 1, 3 do row = 1, 3 - if (abs(prod(row,col) - identity3(row,col)) > 1.d-11) then - write(iulog,*) 'stopping, Jac * Jinv /= identity' - write(iulog,*) 'i, j, k, p:', i, j, k, p + if (abs(prod(row,col) - identity3(row,col)) > eps10) then + write(iulog,*) '3d Jacobian, stopping, Jac * Jinv /= identity' + write(iulog,*) 'rank, i, j, k, p:', this_rank, i, j, k, p write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) @@ -6478,8 +6667,8 @@ subroutine get_basis_function_derivatives_2d(xNode, yNode, & do col = 1, 2 do row = 1, 2 if (abs(prod(row,col) - identity3(row,col)) > 1.d-12) then - write(iulog,*) 'stopping, Jac * Jinv /= identity' - write(iulog,*) 'i, j, p:', i, j, p + write(iulog,*) '2d Jacobian, stopping, Jac * Jinv /= identity' + write(iulog,*) 'rank, i, j, p:', this_rank, i, j, p write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) @@ -6936,7 +7125,7 @@ subroutine compute_internal_stress (nx, ny, & integer :: i, j, k, n, p integer :: iNode, jNode, kNode - + ! 
initialize stresses tau_xz (:,:,:) = 0.d0 tau_yz (:,:,:) = 0.d0 @@ -7545,7 +7734,7 @@ subroutine compute_effective_viscosity_diva(whichefvs, efvs, & itest, jtest, rtest, & i, j, p ) - + ! Compute the effective viscosity at each layer of an ice column corresponding ! to a particular quadrature point, based on the depth-integrated formulation. ! See Goldberg(2011) for details. @@ -7917,7 +8106,7 @@ subroutine compute_element_matrix(whichapprox, nNodesPerElement, & (2.d0*dphi_dy(nr)*dphi_dx(nc) + dphi_dx(nr)*dphi_dy(nc)) Kvv(nr,nc) = Kvv(nr,nc) + efvs_factor * & - ( 4.d0*dphi_dy(nr)*dphi_dy(nc) + dphi_dx(nr)*dphi_dx(nc) & + ( 4.d0*dphi_dy(nr)*dphi_dy(nc) + dphi_dx(nr)*dphi_dx(nc) & + dphi_dz(nr)*dphi_dz(nc) ) enddo ! nr (rows) @@ -8050,7 +8239,7 @@ subroutine element_to_global_matrix_2d(nx, ny, & do nc = 1, nNodesPerElement_2d ! columns of K ! Determine column of A to be incremented - iA = ishift(nr,nc) ! similarly for i and j indices + iA = ishift(nr,nc) ! similarly for i and j indices jA = jshift(nr,nc) ! these indices can take values -1, 0 and 1 m = indxA_2d(iA,jA) @@ -8063,6 +8252,8 @@ subroutine element_to_global_matrix_2d(nx, ny, & if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest) then write(iulog,*) 'Increment Auu, element i, j, nr, nc =', iElement, jElement, nr, nc write(iulog,*) ' i, j, m, Kuu, new Auu:', i, j, m, Kuu(nr,nc), Auu(i,j,m) +!! write(iulog,*) 'Increment Avv, element i, j, nr, nc =', iElement, jElement, nr, nc +!! write(iulog,*) ' i, j, m, Kvv, new Avv:', i, j, m, Kvv(nr,nc), Avv(i,j,m) endif enddo ! nc @@ -8080,7 +8271,7 @@ end subroutine element_to_global_matrix_2d ! Set diva_slope_correction = F to reproduce older results. !TODO - Call this subroutine for both 2D and 3D solvers. - ! First need to switch the index order for 3D matrices. + ! First would need to switch the index order for 3D matrices. 
subroutine basal_sliding_bc_2d(nx, ny, & nNeighbors, nhalo, & parallel, & @@ -8093,7 +8284,6 @@ subroutine basal_sliding_bc_2d(nx, ny, & whichassemble_beta, & Auu, Avv) - !------------------------------------------------------------------------ ! Increment the Auu and Avv matrices with basal traction terms. ! Do a surface integral over all basal faces that contain at least one node with a stress BC. @@ -8166,6 +8356,7 @@ subroutine basal_sliding_bc_2d(nx, ny, & dphi_dx_2d, dphi_dy_2d, dphi_dz_2d ! derivatives of basis functions, evaluated at quad pts real(dp) :: & + increment, & ! incremental change in matrix element beta_qp, & ! beta evaluated at quadrature point detJ ! determinant of Jacobian for the transformation ! between the reference element and true element @@ -8173,24 +8364,23 @@ subroutine basal_sliding_bc_2d(nx, ny, & real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) :: & Kuu, Kvv ! components of element matrix associated with basal sliding - if (verbose_basal) then - call point_diag(beta, 'beta', itest, jtest, rtest, 7, 7, '(f10.0)') - endif - if (whichassemble_beta == HO_ASSEMBLE_BETA_LOCAL) then - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(0,0,0) - else ! 2D problem - m = indxA_2d(0,0) - endif - + m = indxA_2d(0,0) ! Sum over active vertices + !WHL, 12/13/25: The following minor change makes the results independent of processor count. + ! Without the commented code, the 1-core result differs from the 4-core result + ! for some cells near the 4-core processor boundary (i = 211 in a GrIS run). + ! The difference is in the last of 64 bits. + ! I don't know why the original results differ or why the change fixes it. do j = 1, ny-1 do i = 1, nx-1 if (active_vertex(i,j)) then - Auu(i,j,m) = Auu(i,j,m) + dx*dy/vol0 * beta(i,j) - Avv(i,j,m) = Avv(i,j,m) + dx*dy/vol0 * beta(i,j) +! Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) +! 
Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) + increment = (dx*dy/vol0) * beta(i,j) + Auu(i,j,m) = Auu(i,j,m) + increment + Avv(i,j,m) = Avv(i,j,m) + increment endif ! active_vertex enddo ! i enddo ! j @@ -8331,20 +8521,6 @@ subroutine basal_sliding_bc_2d(nx, ny, & endif ! whichassemble_beta - if (verbose_basal .and. this_rank==rtest) then - i = itest - j = jtest - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(0,0,0) - else - m = indxA_2d(0,0) - endif - write(iulog,*) ' ' - write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m - write(iulog,*) 'New Auu diagonal:', Auu(i,j,m) - write(iulog,*) 'New Avv diagonal:', Avv(i,j,m) - endif - end subroutine basal_sliding_bc_2d !**************************************************************************** @@ -10407,7 +10583,7 @@ subroutine check_symmetry_element_matrix(nNodesPerElement, & do j = 1, nNodesPerElement do i = j, nNodesPerElement - if (abs(Kuu(i,j) - Kuu(j,i)) > eps10) then + if (abs(Kuu(i,j) - Kuu(j,i)) > eps11) then write(iulog,*) 'Kuu is not symmetric' write(iulog,*) 'i, j, Kuu(i,j), Kuu(j,i):', i, j, Kuu(i,j), Kuu(j,i) stop @@ -10419,7 +10595,7 @@ subroutine check_symmetry_element_matrix(nNodesPerElement, & do j = 1, nNodesPerElement do i = j, nNodesPerElement - if (abs(Kvv(i,j) - Kvv(j,i)) > eps10) then + if (abs(Kvv(i,j) - Kvv(j,i)) > eps11) then write(iulog,*) 'Kvv is not symmetric' write(iulog,*) 'i, j, Kvv(i,j), Kvv(j,i):', i, j, Kvv(i,j), Kvv(j,i) stop @@ -10431,7 +10607,7 @@ subroutine check_symmetry_element_matrix(nNodesPerElement, & do j = 1, nNodesPerElement do i = 1, nNodesPerElement - if (abs(Kuv(i,j) - Kvu(j,i)) > eps10) then + if (abs(Kuv(i,j) - Kvu(j,i)) > eps11) then write(iulog,*) 'Kuv /= (Kvu)^T' write(iulog,*) 'i, j, Kuv(i,j), Kvu(j,i):', i, j, Kuv(i,j), Kvu(j,i) stop @@ -10483,12 +10659,14 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & real(dp) :: val1, val2 ! 
values of matrix coefficients - real(dp) :: maxdiff, diag_entry, avg_val + real(dp) :: maxdiff, global_maxdiff, diag_entry, avg_val integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + integer :: rmax, imax, jmax, kmax, mmax + staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo @@ -10503,6 +10681,7 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & ! and/or aborts. maxdiff = 0.d0 + rmax = 0; imax = 0; jmax = 0; kmax = 0; mmax = 0 ! Loop over locally owned vertices. ! Each active vertex is associate with 2*nz matrix rows belonging to this processor. @@ -10541,12 +10720,15 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Auu( m, k, i, j ) = avg_val Auu(mm, k+kA,i+iA,j+jA) = avg_val @@ -10568,12 +10750,15 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! 
else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Auv( m, k, i, j ) = avg_val Avu(mm, k+kA,i+iA,j+jA) = avg_val @@ -10623,12 +10808,15 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Avv( m, k, i, j ) = avg_val Avv(mm, k+kA,i+iA,j+jA) = avg_val @@ -10648,14 +10836,17 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & val1 = Avu( m, k, i, j) ! value of Avu(row,col) val2 = Auv(mm, k+kA, i+iA, j+jA) ! value of Auv(col,row) - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif if (val2 /= val1) then ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Avu( m, k, i, j ) = avg_val Auv(mm, k+kA,i+iA,j+jA) = avg_val @@ -10681,11 +10872,14 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & enddo ! i enddo ! j - if (verbose_matrix) maxdiff = parallel_reduce_max(maxdiff) - - if (verbose_matrix .and. main_task) then - write(iulog,*) ' ' - write(iulog,*) 'Max difference from symmetry =', maxdiff + if (verbose_matrix) then + global_maxdiff = parallel_reduce_max(maxdiff) + if (maxdiff == global_maxdiff) then + ! 
maxdiff is on this processor; compute and broadcast the global index + call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) + write(iulog,*) 'Max asymmetry =', global_maxdiff + write(iulog,*) ' i, j, ig, jg, k, m =', imax, jmax, iglobal, jglobal, kmax, mmax + endif endif end subroutine check_symmetry_assembled_matrix_3d @@ -10730,7 +10924,9 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & real(dp) :: val1, val2 ! values of matrix coefficients - real(dp) :: maxdiff, diag_entry, avg_val + real(dp) :: maxdiff, global_maxdiff, diag_entry, avg_val + + integer :: rmax, imax, jmax, mmax integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -10750,6 +10946,7 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & ! and/or aborts. maxdiff = 0.d0 + rmax = 0; imax = 0; jmax = 0; mmax = 0 ! Loop over locally owned vertices. ! Each active vertex is associate with 2*nz matrix rows belonging to this processor. @@ -10778,13 +10975,18 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & val1 = Auu(i, j, m ) ! value of Auu(row,col) val2 = Auu(i+iA, j+jA, mm) ! value of Auu(col,row) if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values + !WHL - Here and below, I commented out the code to average asymmetric values. + ! The hope is that the asymmetries are too small to matter. ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Auu(i, j, m ) = avg_val - Auu(i+iA, j+jA, mm) = avg_val + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Auu(i, j, m ) = avg_val +! 
Auu(i+iA, j+jA, mm) = avg_val else write(iulog,*) 'WARNING: Auu is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA write(iulog,*) 'Auu(row,col), Auu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry @@ -10798,13 +11000,16 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & val1 = Auv(i, j, m ) ! value of Auv(row,col) val2 = Avu(i+iA, j+jA, mm) ! value of Avu(col,row) if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Auv(i, j, m ) = avg_val - Avu(i+iA, j+jA, mm) = avg_val + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Auv(i, j, m ) = avg_val +! Avu(i+iA, j+jA, mm) = avg_val else write(iulog,*) 'WARNING: Auv is not equal to (Avu)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA write(iulog,*) 'Auv(row,col), Avu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry @@ -10822,14 +11027,16 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & val2 = Avv(i+iA, j+jA, mm) ! value of Avv(col,row) if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) - + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Avv(i, j, m ) = avg_val - Avv(i+iA, j+jA, mm) = avg_val + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Avv(i, j, m ) = avg_val +! 
Avv(i+iA, j+jA, mm) = avg_val else write(iulog,*) 'WARNING: Avv is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA write(iulog,*) 'Avv(row,col), Avv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry @@ -10844,16 +11051,17 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & val1 = Avu(i, j, m ) ! value of Avu(row,col) val2 = Auv(i+iA, j+jA, mm) ! value of Auv(col,row) - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) - if (val2 /= val1) then - + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Avu(i, j, m ) = avg_val - Auv(i+iA, j+jA, mm) = avg_val + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Avu(i, j, m ) = avg_val +! Auv(i+iA, j+jA, mm) = avg_val else write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry @@ -10870,11 +11078,14 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & enddo ! iA enddo ! jA - if (verbose_matrix) maxdiff = parallel_reduce_max(maxdiff) - - if (verbose_matrix .and. main_task) then - write(iulog,*) ' ' - write(iulog,*) 'Max difference from symmetry =', maxdiff + if (verbose_matrix) then + global_maxdiff = parallel_reduce_max(maxdiff) + if (maxdiff == global_maxdiff) then + ! 
maxdiff is on this processor; compute and broadcast the global index + call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) + write(iulog,*) 'Max asymmetry =', global_maxdiff + write(iulog,*) ' ig, jg, m =', iglobal, jglobal, mmax + endif endif end subroutine check_symmetry_assembled_matrix_2d diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 8d31e098..6d162af7 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -531,7 +531,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & if (niters == maxiters) then if (verbose_pcg .and. main_task) then - write(iulog,*) 'Glissade PCG solver not converged' + write(iulog,*) 'Glissade PCG solver not yet converged' write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance endif endif @@ -746,7 +746,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & iter_loop: do iter = 1, maxiters if (verbose_pcg .and. main_task) then - write(iulog,*) 'iter =', iter +! write(iulog,*) 'iter =', iter endif call t_startf("pcg_precond") @@ -933,7 +933,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & if (niters == maxiters) then if (verbose_pcg .and. main_task) then - write(iulog,*) 'Glissade PCG solver not converged' + write(iulog,*) 'Glissade PCG solver not yet converged' write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance endif endif @@ -1233,12 +1233,6 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ilocal = staggered_ihi - staggered_ilo + 1 jlocal = staggered_jhi - staggered_jlo + 1 - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) 'stag_ihi, stag_ilo, ilocal:', staggered_ihi, staggered_ilo, ilocal - write(iulog,*) 'stag_jhi, stag_jlo, jlocal:', staggered_jhi, staggered_jlo, jlocal - endif - !---- Set up matrices for preconditioning call t_startf("pcg_precond_init") @@ -1483,28 +1477,6 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & active_vertex, & Mvv, rv, zv) ! 
solve Mvv*zv = rv for zv - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) 'Standard SIA PC:' - write(iulog,*) ' ' - write(iulog,*) 'i, zu_sia(1), zu_sia(nz):' - do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(i4, 2f16.10)') i, zu(1,i,j), zu(nz,i,j) - enddo - enddo ! i - write(iulog,*) ' ' - write(iulog,*) 'i, zv_sia(1), zv_sia(nz):' - do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(i4, 2f16.10)') i, zv(1,i,j), zv(nz,i,j) - enddo - enddo ! i - endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! Use a local tridiagonal solver to find an approximate solution of A*z = r @@ -1618,30 +1590,14 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) call t_stopf("pcg_vecupdate") - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'alpha =', alpha - write(iulog,*) 'iter = 1: i, k, xu, xv, ru, rv:' - do i = staggered_ilo, staggered_ihi -!! do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) - enddo - enddo - endif - !--------------------------------------------------------------- ! Iterate to solution !--------------------------------------------------------------- iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'iter =', iter + if (verbose_pcg .and. main_task) then +! write(iulog,*) 'iter =', iter endif !---- Compute PC(r) = solution z of Mz = r @@ -1727,9 +1683,6 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & endif ! precond - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) write(iulog,*) 'L1' - call t_stopf("pcg_precond_iter") !---- Compute Az = A*z @@ -1832,7 +1785,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (mod(iter, linear_solve_ncheck) == 0 .or. 
iter == 5) then - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) ' ' write(iulog,*) 'Check convergence, iter =', iter endif @@ -1870,7 +1823,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & L2_resid = sqrt(rr) ! L2 norm of residual err = L2_resid/L2_rhs ! normalized error - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err endif @@ -1903,14 +1856,14 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (err < tolerance) then niters = iter - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Glissade PCG solver has converged, iter =', niters write(iulog,*) ' ' endif exit iter_loop elseif (niters == maxiters_chrongear) then - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) 'Glissade PCG solver not converged' + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver not yet converged' write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance write(iulog,*) ' ' endif @@ -2114,6 +2067,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max + real(dp) :: sum_temp staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi @@ -2133,7 +2087,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & maxiters_chrongear = maxiters endif - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (Chronopoulos-Gear)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters_chrongear, precond endif @@ -2148,7 +2102,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (precond == HO_PRECOND_NONE) then ! no preconditioner - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. 
main_task) then write(iulog,*) 'Using no preconditioner' endif @@ -2170,15 +2124,15 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - !WHL - debug - if (verbose_tridiag .and. this_rank==rtest) then - i = itest - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'r, i, j =', this_rank, i, j - write(iulog,*) 'Auu =', Auu(i,j,:) - write(iulog,*) 'Avv =', Avv(i,j,:) - endif + !WHL - debug + if (verbose_tridiag .and. this_rank==rtest) then + i = itest + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'r, i, j =', this_rank, i, j + write(iulog,*) 'Auu =', Auu(i,j,:) + write(iulog,*) 'Avv =', Avv(i,j,:) + endif allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) @@ -2282,7 +2236,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & endif ! precond !WHL - debug - if (verbose_pcg .and. this_rank == rtest) write(iulog,*) 'Done in PC setup' + if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' call t_stopf("pcg_precond_init") @@ -2421,7 +2375,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & enddo endif - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'call tridiag_solver_local_2d' endif @@ -2617,11 +2571,10 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest - write(iulog,*) ' ' - write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' -!! do i = itest-3, itest+3 - do i = staggered_ilo, staggered_ihi - write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) +!! write(iulog,*) ' ' +!! write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' + do i = itest-3, itest+3 +!! write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) enddo ! i endif @@ -2631,9 +2584,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - if (verbose_pcg .and. 
this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'iter =', iter + if (verbose_pcg .and. main_task) then +! write(iulog,*) 'iter =', iter endif !---- Compute PC(r) = solution z of Mz = r @@ -2740,17 +2692,16 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & enddo !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'Before global tridiag PC u solve, r, j =', rtest, jtest - write(iulog,*) ' ' - write(iulog,*) 'i, Adiag_u, Asubdiag_u, Asupdiag_u, b_u:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4e16.8)') i, Adiag_u(i,j), Asubdiag_u(i,j), Asupdiag_u(i,j), b_u(i,j) - enddo - endif - + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'Before global tridiag PC u solve, r, j =', rtest, jtest + write(iulog,*) ' ' + write(iulog,*) 'i, Adiag_u, Asubdiag_u, Asupdiag_u, b_u:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4e16.8)') i, Adiag_u(i,j), Asubdiag_u(i,j), Asupdiag_u(i,j), b_u(i,j) + enddo + endif call tridiag_solver_global_2d(ilocal, jlocal, & parallel, tasks_row, & @@ -2787,17 +2738,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & enddo enddo - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'Before global tridiag PC v solve, r, j =', rtest, jtest - write(iulog,*) ' ' - write(iulog,*) 'i, Adiag_v, Asubdiag_v, Asupdiag_v, b_v:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4e16.8)') i, Adiag_v(i,j), Asubdiag_v(i,j), Asupdiag_v(i,j), b_v(i,j) - enddo - endif - call tridiag_solver_global_2d(jlocal, ilocal, & parallel, tasks_col, & 'col', & ! tridiagonal solve for each column @@ -2911,16 +2851,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & call t_stopf("pcg_vecupdate") - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) 'i, xu, xv, ru, rv:' -!! 
do i = staggered_ihi, staggered_ilo, -1 - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) - enddo - endif - ! Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -2928,15 +2858,18 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! For convergence check, use r = b - Ax if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then -!! if (mod(iter, linear_solve_ncheck) == 0 .or. iter == linear_solve_ncheck/2) then - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) ' ' write(iulog,*) ' check convergence, iter =', iter endif !---- Compute z = A*x (use z as a temp vector for A*x) + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(xu, parallel) +!! call staggered_parallel_halo(xv, parallel) + call t_startf("pcg_matmult_resid") call matvec_multiply_structured_2d(nx, ny, & parallel, & @@ -2949,11 +2882,22 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute residual r = b - A*x + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(bu, parallel) +!! call staggered_parallel_halo(bv, parallel) + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(zu, parallel) +!! call staggered_parallel_halo(zv, parallel) + call t_startf("pcg_vecupdate") ru(:,:) = bu(:,:) - zu(:,:) rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(ru, parallel) +!! call staggered_parallel_halo(rv, parallel) + !---- Compute dot product (r, r) call t_startf("pcg_dotprod") @@ -2968,7 +2912,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & L2_resid = sqrt(rr) ! 
L2 norm of residual err = L2_resid/L2_rhs ! normalized error - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err endif @@ -3001,13 +2945,13 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (err < tolerance) then niters = iter - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Glissade PCG solver has converged, iter =', niters endif exit iter_loop elseif (niters == maxiters_chrongear) then - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) 'Glissade PCG solver not converged' + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver not yet converged' write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance endif endif diff --git a/utils/build/generate_ncvars.py b/utils/build/generate_ncvars.py index 993c2c0a..e791670d 100755 --- a/utils/build/generate_ncvars.py +++ b/utils/build/generate_ncvars.py @@ -193,9 +193,9 @@ def write(self,vars): self.print_warning() for l in self.infile.readlines(): for token in self.handletoken: - if l.find(token) is not -1: + if l.find(token) != -1: break - if l.find(token) is not -1: + if l.find(token) != -1: for v in vars.keys(): self.handletoken[token](vars[v]) else: @@ -267,9 +267,9 @@ def write(self,vars): for k in module.keys(): l = l.replace(k.upper(),module[k]) for token in self.handletoken: - if l.find(token) is not -1: + if l.find(token) != -1: break - if l.find(token) is not -1: + if l.find(token) != -1: for v in vars.keys(): self.handletoken[token](vars[v]) elif '!GENVAR_DIMS!' in l: From deb5384fcd189b3f6869d6921f04cfa7902a3927 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Mon, 5 Jan 2026 15:03:40 -0700 Subject: [PATCH 05/21] Change for reprosum testing This commit makes a very small change in glissade_velo_higher.F90, subroutine basal_sliding_bc_2d. 
A recent commit swaps the following lines: Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) for these: increment = (dx*dy/vol0) * beta(i,j) Auu(i,j,m) = Auu(i,j,m) + increment Avv(i,j,m) = Avv(i,j,m) + increment Before the swap, the reprosum capability was not working for a Greenland test case. The results differ for a 1-year run with 1 task v. 4 tasks. After the swap, the results for 1 task v. 4 tasks are BFB. I discovered this more or less by accident. I don't see why such a change would matter for the reprosum algorithm. This commit backs out the change above. With the change backed out, the 1-task and 4-task results differ. A working hypothesis is that the differences are due to a logic error in the original reprosum code by Pat Worley. In 2023, Pat introduced some corrections. The next step is to implement Pat's corrections and see if they fix the reprosum capability. --- libglissade/glissade_velo_higher.F90 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 3b3ec31b..73b45f52 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -8376,11 +8376,11 @@ subroutine basal_sliding_bc_2d(nx, ny, & do j = 1, ny-1 do i = 1, nx-1 if (active_vertex(i,j)) then -! Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) -! Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) - increment = (dx*dy/vol0) * beta(i,j) - Auu(i,j,m) = Auu(i,j,m) + increment - Avv(i,j,m) = Avv(i,j,m) + increment + Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) + Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) +!! increment = (dx*dy/vol0) * beta(i,j) +!! Auu(i,j,m) = Auu(i,j,m) + increment +!! Avv(i,j,m) = Avv(i,j,m) + increment endif ! active_vertex enddo ! i enddo ! 
j From b9d44df3aab71ccd30b722a15ed534cd1cac0d54 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Mon, 5 Jan 2026 17:14:02 -0700 Subject: [PATCH 06/21] Replaced the reprosum code with Pat Worley's 2023 version This commit replaces the old cism_reprosum_mod - which was based on the current copy of shr_reprosum_mod in the CESM repo - with a new cism_reprosum_mod which is based on a current copy of shr_reprosum_mod in the E3SM repo. The E3SM version includes changes made by Pat Worley in 2023. Pat's changes are contained in three PRs described here: * https://github.com/E3SM-Project/E3SM/pull/5534 * https://github.com/E3SM-Project/E3SM/pull/5549 * https://github.com/E3SM-Project/E3SM/pull/5560 The new copy of shr_reprosum_mod is here: * https://github.com/E3SM-Project/E3SM/blob/master/share/util/shr_reprosum_mod.F90 Many of Pat's changes were cosmetic, but he also changed some logic to prevent potential failures of reproducibility. I had hoped that these changes would fix the problem described in the previous commit ('Change for reprosum testing'). However, I am getting the same answers as with the previous reprosum code. The runs with 1 and 4 tasks still fail the BFB test with the unswapped code in glissade_velo_higher.F90, while passing the BFB test with the swapped code. This commit restores the code swap. I still don't understand why this swap is needed. Thanks to Bill Sacks for pointing me to Pat's changes. --- libglimmer/cism_reprosum_mod.F90 | 1613 +++++++++++++++++--------- libglissade/glissade_velo_higher.F90 | 10 +- 2 files changed, 1088 insertions(+), 535 deletions(-) diff --git a/libglimmer/cism_reprosum_mod.F90 b/libglimmer/cism_reprosum_mod.F90 index 2f071f82..0740f459 100644 --- a/libglimmer/cism_reprosum_mod.F90 +++ b/libglimmer/cism_reprosum_mod.F90 @@ -1,8 +1,5 @@ -!WHL, Nov. 2025: Adapted from shr_reprosum_mod.F90, part of CESM shared code -! 
Changed 'shr' to 'cism' to avoid name conflicts with shared code - module cism_reprosum_mod -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Purpose: ! Compute reproducible global sums of a set of arrays across an MPI @@ -12,105 +9,136 @@ module cism_reprosum_mod ! Compute using either or both a scalable, reproducible algorithm and a ! scalable, nonreproducible algorithm: ! * Reproducible (scalable): -! Convert to fixed point (integer vector representation) to enable -! reproducibility when using MPI_Allreduce +! Convert each floating point summand to an integer vector +! representation, to enable reproducibility when using +! MPI_Allreduce, then convert the resulting global sum back to a +! floating point representation locally; ! * Alternative usually reproducible (scalable): ! Use parallel double-double algorithm due to Helen He and -! Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm +! Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm; ! * Nonreproducible (scalable): ! Floating point and MPI_Allreduce based. ! If computing both reproducible and nonreproducible sums, compare ! these and report relative difference (if absolute difference ! less than sum) or absolute difference back to calling routine. ! -! Author: P. Worley (based on suggestions from J. White for fixed -! point algorithm and on He/Ding paper for ddpdd +! Author: P. Worley (based on suggestions from J. White for integer +! vector algorithm and on He/Ding paper for DDPDD ! algorithm) ! -!----------------------------------------------------------------------- - -!----------------------------------------------------------------------- -!- use statements ------------------------------------------------------ -!----------------------------------------------------------------------- -!!#if ( defined noI8 ) -!! ! Workaround for when shr_kind_i8 is not supported. -!! 
use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i4 -!!#else -!! use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i8 -!!#endif - !WHL - Use CISM modules instead of shared modules if possible +! William Lipscomb, Jan. 2026: +! Renamed this module to cism_reprosum_mod. Modified as follows to build +! without using CESM shared code: +! * Replaced some use statements +! * Changed 'shr' to 'cism' to avoid name conflicts with shared code +! * Output directed to iulog (declared in glimmer_paramets) +! * Added some optional debugging statements +! I started from a version that includes some logic fixes and code cleanup +! done by Pat Worley in 2023. Pat's revised version differs from the code +! in the CESM repo as of Jan. 2026. +!------------------------------------------------------------------------ + +!------------------------------------------------------------------------ +!- use statements ------------------------------------------------------- +!------------------------------------------------------------------------ + !WHL - Use CISM modules instead of shared modules +!#if ( defined noI8 ) +! ! Workaround for when shr_kind_i8 is not supported. +! use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i4 +!#else +! use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i8 +!#endif +! use shr_log_mod, only: s_loglev => shr_log_Level +! use shr_log_mod, only: s_logunit => shr_log_Unit +! use shr_sys_mod, only: shr_sys_abort +! use shr_infnan_mod,only: shr_infnan_inf_type, assignment(=), & +! shr_infnan_posinf, shr_infnan_neginf, & +! shr_infnan_nan, & +! shr_infnan_isnan, shr_infnan_isinf, & +! shr_infnan_isposinf, shr_infnan_isneginf +!#ifndef EAMXX_STANDALONE +! use perf_mod +!#endif use glimmer_global, only: r8 => dp use glimmer_global, only: i8 use glimmer_paramets, only: iulog use glimmer_utils, only: double_to_binary -! use shr_log_mod, only: s_loglev => shr_log_Level -! use shr_log_mod, only: s_logunit => shr_log_Unit -! 
use shr_sys_mod, only: shr_sys_abort use cism_infnan_mod,only: cism_infnan_inf_type, assignment(=), & - cism_infnan_posinf, cism_infnan_neginf, & - cism_infnan_nan, & - cism_infnan_isnan, cism_infnan_isinf, & - cism_infnan_isposinf, cism_infnan_isneginf - -#ifdef TIMING - use perf_mod -#endif -!----------------------------------------------------------------------- -!- module boilerplate -------------------------------------------------- -!----------------------------------------------------------------------- + cism_infnan_posinf, cism_infnan_neginf, & + cism_infnan_nan, & + cism_infnan_isnan, cism_infnan_isinf, & + cism_infnan_isposinf, cism_infnan_isneginf + use profile, only: t_startf, t_stopf !WHL - replace with perf_mod? + ! end WHL mods + + ! Import MPI fcns/types + use mpi + +!------------------------------------------------------------------------ +!- module boilerplate --------------------------------------------------- +!------------------------------------------------------------------------ implicit none private -!----------------------------------------------------------------------- -!- include statements -------------------------------------------------- -!----------------------------------------------------------------------- -#include - save -!----------------------------------------------------------------------- -! Public interfaces ---------------------------------------------------- -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ +! Public interfaces ----------------------------------------------------- +!------------------------------------------------------------------------ public :: & -!! shr_reprosum_setopts, &! set runtime options -!! shr_reprosum_calc, &! calculate distributed sum -!! shr_reprosum_tolExceeded ! utility function to check relative -!! ! differences against the tolerance cism_reprosum_setopts, &! 
set runtime options cism_reprosum_calc, &! calculate distributed sum cism_reprosum_tolExceeded ! utility function to check relative ! differences against the tolerance -!----------------------------------------------------------------------- -! Public data ---------------------------------------------------------- -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ +! Public data ----------------------------------------------------------- +!------------------------------------------------------------------------ logical, public :: cism_reprosum_recompute = .false. real(r8), public :: cism_reprosum_reldiffmax = -1.0_r8 - logical, parameter, public :: verbose_reprosum = .false. -!! logical, parameter, public :: verbose_reprosum = .true. + !WHL mod +!! logical, parameter, public :: verbose_reprosum = .false. + logical, parameter, public :: verbose_reprosum = .true. + ! end WHL mod -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! Private interfaces ---------------------------------------------------- -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ private :: & ddpdd, &! double-double sum routine split_indices ! split indices among OMP threads -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! Private data ---------------------------------------------------------- -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ - !---------------------------------------------------------------------------- + !--------------------------------------------------------------------- ! 
cism_reprosum_mod options - !---------------------------------------------------------------------------- + !--------------------------------------------------------------------- logical :: repro_sum_use_ddpdd = .false. logical :: repro_sum_allow_infnan = .false. - !WHLmod -!! integer :: s_logunit = 6 + !WHL - Should this code be declared? + ! Not sure what EAMXX_STANDALONE means +#ifdef EAMXX_STANDALONE + ! Declare the C function interface + interface + subroutine cism_reprosumx86_fix_start(arg) bind(c) + use iso_c_binding + integer, intent(out) :: arg + end subroutine cism_reprosumx86_fix_start + end interface + + interface + subroutine cism_reprosumx86_fix_end(arg) bind(c) + use iso_c_binding + integer, intent(in) :: arg + end subroutine cism_reprosumx86_fix_end + end interface +#endif CONTAINS @@ -124,55 +152,53 @@ subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & repro_sum_master, & repro_sum_logunit ) -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! Purpose: Set runtime options ! Author: P. Worley -!----------------------------------------------------------------------- -!------------------------------Arguments-------------------------------- - ! Use DDPDD algorithm instead of fixed precision algorithm +!------------------------------------------------------------------------ +!------------------------------Arguments--------------------------------- + ! Use DDPDD algorithm instead of integer vector algorithm logical, intent(in), optional :: repro_sum_use_ddpdd_in - ! Allow INF or NaN in summands logical, intent(in), optional :: repro_sum_allow_infnan_in - ! maximum permissible difference between reproducible and ! nonreproducible sums real(r8), intent(in), optional :: repro_sum_rel_diff_max_in - ! recompute using different algorithm when difference between ! 
reproducible and nonreproducible sums is too great logical, intent(in), optional :: repro_sum_recompute_in - - ! flag indicating whether this process should output log messages + ! flag indicating whether this MPI task should output + ! log messages logical, intent(in), optional :: repro_sum_master - ! unit number for log messages integer, intent(in), optional :: repro_sum_logunit - -!---------------------------Local Workspace----------------------------- +!---------------------------Local Workspace------------------------------ integer logunit ! unit number for log messages logical master ! local master? logical,save :: firstcall = .true. ! first call integer :: ierr ! MPI error return -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ if ( present(repro_sum_master) ) then master = repro_sum_master else - master = .false. !WHL master = main_task? + master = .false. endif if ( present(repro_sum_logunit) ) then logunit = repro_sum_logunit else !! logunit = s_logunit - logunit = iulog ! CISM default + logunit = iulog endif if (.not. firstcall) then -!! call shr_sys_abort('shr_reprosum_setopts ERROR: multiple calls') + !WHL mod +! write(logunit,*) 'shr_reprosum_setopts: ERROR can only be called once' +! call shr_sys_abort('shr_reprosum_setopts ERROR: multiple calls') write(logunit,*) 'cism_reprosum_setopts: ERROR can only be called once' call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + ! end WHL mod endif firstcall = .false. 
@@ -190,34 +216,34 @@ subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & endif if (master) then if ( repro_sum_use_ddpdd ) then - write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& 'Using double-double-based (scalable) usually reproducible ', & 'distributed sum algorithm' else - write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& - 'Using fixed-point-based (scalable) reproducible ', & + write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& + 'Using integer-vector-based (scalable) reproducible ', & 'distributed sum algorithm' endif if ( repro_sum_allow_infnan ) then - write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& 'Will calculate sum when INF or NaN are included in summands' else - write(logunit,*) 'CISM_REPROSUM_SETOPTS: ',& + write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& 'Will abort if INF or NaN are included in summands' endif - if (cism_reprosum_reldiffmax >= 0._r8) then + if (cism_reprosum_reldiffmax >= 0.0_r8) then write(logunit,*) ' ',& 'with a maximum relative error tolerance of ', & - cism_reprosum_reldiffmax + cism_reprosum_reldiffmax if (cism_reprosum_recompute) then write(logunit,*) ' ',& 'If tolerance exceeded, sum is recomputed using ', & 'a serial algorithm.' else write(logunit,*) ' ',& - 'If tolerance exceeded, fixed-precision is sum used ', & + 'If tolerance exceeded, integer-vector-based sum is used ', & 'but a warning is output.' endif else @@ -239,48 +265,192 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & gbl_max_nsummands, gbl_max_nsummands_out,& gbl_count, repro_sum_validate, & repro_sum_stats, rel_diff, commid ) -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Purpose: -! Compute the global sum of each field in "arr" using the indicated +! Compute the global sum of each field in 'arr' using the indicated ! 
communicator with a reproducible yet scalable implementation based -! on a fixed point algorithm. An alternative is to use an "almost -! always reproducible" floating point algorithm, as described below. -! -! The accuracy of the fixed point algorithm is controlled by the -! number of "levels" of integer expansion. The algorithm will calculate -! the number of levels that is required for the sum to be essentially -! exact. (The sum as represented by the integer expansion will be exact, -! but roundoff may perturb the least significant digit of the returned -! real*8 representation of the sum.) The optional parameter arr_max_levels -! can be used to override the calculated value. The optional parameter -! arr_max_levels_out can be used to return the values used. -! -! The algorithm also requires an upper bound on -! the maximum summand (in absolute value) for each field, and will -! calculate this internally. However, if the optional parameters +! on first converting each floating point summand into an equivalent +! representation using a vector of integers, summing the integer +! vectors, then converting the resulting sum back to a floating point +! representation. An alternative is to use an 'almost always +! reproducible' floating point algorithm (DDPDD), as described below. +! +! Description of integer vector algorithm: +!----------------------------------------- +! The basic idea is to represent the mantissa of each floating point +! value as an integer, add these integers, and then convert back to a +! floating point value. For a real*8 value, there are enough digits in +! an integer*8 variable to not lose any information (in the +! mantissa). However, each of these integers would have a different +! implicit exponent if done in a naive way, and so the sum would not +! be accurate. Also, even with the same 'normalization', the sum might +! exceed the maximum value representable by an integer*8, causing +! an overflow. 
Instead, a vector of integers is generated, where a +! given element (or level using the terminology used in the code) of +! the vector is associated with a particular exponent. The mantissa +! for a given floating point value is then converted to some number of +! integer values, depending on the exponent of the floating point +! value, the normalization of its mantissa, the maximum number of +! summands, the number of participating MPI tasks and of OpenMP +! threads, and the exponents associated with the levels of the integer +! vector, and added into the appropriate levels of the integer +! vector. Each MPI task has its own integer vector representing the +! local sum. This is then summed across all participating MPI tasks +! using an MPI_Allreduce, and, lastly, converted back to a floating +! point value. Note that the same approach works for a vector of +! integer*4 variables, simply requiring more levels, both for the full +! summation vector and for each individual real*8 summand. This is a +! compile time option in the code, in support of systems for which the +! compiler or MPI library has issues when using integer*8. As +! implemented, this algorithm should work for any floating point and +! integer type as long as they share the same base. The code is +! written as if for real*8 and integer*8 variables, but the only +! dependence is on the types 'r8' and 'i8', which are defined in the +! code, currently with reference to the corresponding types in +! shr_kind_mod. This is how integer*4 support is implemented, by +! defining i8 to be shr_kind_i4 instead of shr_kind_i8. + ! !WHL - The CISM types are declared in glimmer_global. +! For this to work, each MPI task must have the same number of levels +! and same implicit exponent for each level. These levels must be +! sufficient to represent the smallest and largest nonzero individual +! summands (in absolute value) and the largest possible intermediate +! sum, including the final sum. 
Most of the complexity in the +! algorithm is in identifying the number of levels, the exponent +! associated with each level, and the appropriate levels to target +! when converting a floating point value into its integer vector +! representation. There are also some subtleties in reconstructing the +! final sum from the integer vector, as described below. For each +! floating point value, the exponent and mantissa are extracted using +! the fortran intrinsics 'exponent' and 'fraction'. The mantissa is +! then 'shifted' to match the exponent for a target level in the +! integer vector using the 'scale' intrinsic. 'int(X,i8)' is used +! for the conversion for the given level, and subtraction between +! this integer and the original 'shifted' value identifies the +! remainder that will be converted to an integer for the next level +! in the vector. The logic continues until the remainder is zero. As +! mentioned above, the only requirement, due to the implementation +! using these fortran intrinsics, is that floating point and integer +! models use the same base, e.g. +! radix(1.0_r8) == radix(1_i8) +! for real*8 and integer*8. If not, then the alternative algorithm +! DDPDD mentioned above and described below is used instead. The +! integer representation must also have enough digits for the +! potential growth of the sum for each level, so could conceivably be +! too small for a large number of summands. +! +! Upper bounds on the total number of summands and on all intermediate +! sums are calculated as +! * +! and +! * +! * +! respectively. The maximum number of summands per MPI task and the +! maximum absolute value over all nonzero summands are global +! information that need to be determined with additional MPI +! collectives. The minimum nonzero absolute value summand is also +! global information. Fortunately, all of these can be determined with +! a single MPI_Allreduce call, so only one more than that required for +! the sum itself. 
(Note that, in actuality, the exponents of max and +! min summands are determined, and these are used to calculate bounds +! on the maximum and minimum, allowing the use of an MPI_INTEGER +! vector in the MPI_Allreduce call.) +! +! The actual code is made a little messier by (a) supporting summation +! of multiple fields without increasing the number of MPI_Allreduce +! calls, (b) supporting OpenMP threading of the local arithmetic, (c) +! allowing the user to specify estimates for the global information +! (to avoid the additional MPI_Allreduce), (d) including a check of +! whether user specified bounds were sufficient and, if not, +! determining the actual bounds and recomputing the sum, and (e) +! allowing the user to specify the maximum number of levels to use, +! potentially losing accuracy but still preserving reproducibility and +! being somewhat cheaper to compute. +! +! The conversion of the local summands to vectors of integers, the +! summation of the local vectors of integers, and the summation of the +! distributed vectors of integers will be exact (if optional parameters +! are not used to decrease the accuracy - see below). However, the +! conversion of the vector of integer representation to a floating +! point value may be subject to rounding errors. Before the +! conversion, the vector of integers is adjusted so that all elements +! have the same sign, and so that the value, in absolute value, at a +! given level is strictly less than what can be represented at the +! next lower level (larger exponent) and strictly greater than what +! can be represented at the next higher level (smaller exponent). Since +! all elements have the same sign, the sign is set to positive +! temporarily and then restored when the conversion to floating point +! is complete. These are all integer operations, so no accuracy is +! lost. These adjustments eliminate the possibility of catastrophic +! cancellation. Also, when converting the individual elements to +! 
floating point values and summing them, the summation is now +! equivalent to concatenating the digits in the mantissas for the +! component summands. In consequence, in the final step when each +! element of this modified vector of integers is converted to a +! floating point value and added into the intermediate sum, any +! rounding is limited to the least significant digit representable +! in the final floating point sum. +! +! Any such rounding error will be sensitive to the particular floating +! values generated from the integer vector, and so will be +! sensitive to the number of levels in the vector and the implicit +! exponent associated with each level, which are themselves functions +! of the numbers of MPI tasks and OpenMP threads and the number of +! digits representable in an integer. To avoid this sensitivity, +! (effectively) generate a new integer vector in which each component +! integer has a fixed number of significant digits (e.g., +! digits(1.0_r8)) and generate the floating point values from these +! before summing. (See comments in code for more details.) This +! creates a sequence of floating point values to be summed that are +! independent of, for example, the numbers of MPI tasks and OpenMP +! threads or whether using integer*8 or integer*4 internal +! representations in the integer vector, and thus ensure +! reproducibility with respect to these options. +! +! Description of optional parameters for integer vector algorithm: +!----------------------------------------------------------------- +! The accuracy of the integer vector algorithm is controlled by the +! total number of levels of integer expansion. The algorithm +! calculates the number of levels that is required for the sum to be +! essentially exact. (The sum as represented by the integer expansion +! is exact, but roundoff may perturb the least significant digit of +! the returned floating point representation of the sum.) The optional +! 
parameter arr_max_levels can be used to override the calculated +! value for each field. The optional parameter arr_max_levels_out can +! be used to return the values used. +! +! The algorithm requires an upper bound on the maximum summand +! (in absolute value) for each field, and will calculate this internally +! using an MPI_Allreduce. However, if the optional parameters ! arr_max_levels and arr_gbl_max are both set, then the algorithm will -! use the values in arr_gbl_max for the upper bounds instead. If these -! are not upper bounds, or if the upper bounds are not tight enough -! to achieve the requisite accuracy, and if the optional parameter -! repro_sum_validate is NOT set to .false., the algorithm will repeat the -! computation with appropriate upper bounds. If only arr_gbl_max is present, -! then the maxima are computed internally (and the specified values are -! ignored). The optional parameter arr_gbl_max_out can be -! used to return the values used. -! -! Finally, the algorithm requires an upper bound on the number of -! local summands across all processes. This will be calculated internally, -! using an MPI collective, but the value in the optional argument -! gbl_max_nsummands will be used instead if (1) it is present, (2) -! it is > 0, and (3) the maximum value and required number of levels -! are also specified. (If the maximum value is calculated, the same -! MPI collective is used to determine the maximum number of local -! summands.) The accuracy of the user-specified value is not checked. -! However, if set to < 1, the value will instead be calculated. If the -! optional parameter gbl_max_nsummands_out is present, then the value -! used (gbl_max_nsummands if >= 1; calculated otherwise) will be -! returned. +! use the values in arr_gbl_max for the upper bounds instead. If only +! arr_gbl_max is present, then the maxima are computed internally +! (and the specified values are ignored). The optional parameter +! 
arr_gbl_max_out can be used to return the values used. +! +! The algorithm also requires an upper bound on the number of +! local summands across all MPI tasks. (By definition, the number of +! local summands is the same for each field on a given MPI task, i.e., +! the input parameter nsummands.) This will be calculated internally, +! using an MPI_Allreduce, but the value in the optional argument +! gbl_max_nsummands will be used instead if (1) it is present, +! (2) the value is > 0, and (3) the maximum values and required number +! of levels are also specified. (If the maximum values are calculated, +! then the same MPI_Allreduce is used to determine the maximum numbers +! of local summands.) The accuracy of the user-specified value is not +! checked. However, if set to < 1, the value will instead be calculated. +! If the optional parameter gbl_max_nsummands_out is present, +! then the value used (gbl_max_nsummands if >= 1; calculated otherwise) +! will be returned. +! +! If the user-specified upper bounds on maximum summands are +! inaccurate or if the user-specified upper bounds (maximum summands +! and number of local summands) and numbers of levels causes +! any of the global sums to have fewer than the expected +! number of significant digits, and if the optional parameter +! repro_sum_validate is NOT set to .false., then the algorithm will +! repeat the computations with internally calculated values for +! arr_max_levels, arr_gbl_max, and gbl_max_nsummands. ! ! If requested (by setting cism_reprosum_reldiffmax >= 0.0 and passing in ! the optional rel_diff parameter), results are compared with a @@ -288,32 +458,40 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & ! ! Note that the cost of the algorithm is not strongly correlated with ! the number of levels, which primarily shows up as a (modest) increase -! in cost of the MPI_Allreduce as a function of vector length. Rather the -! 
cost is more a function of (a) the number of integers required to -! represent an individual summand and (b) the number of MPI_Allreduce -! calls. The number of integers required to represent an individual -! summand is 1 or 2 when using 8-byte integers for 8-byte real summands -! when the number of local summands is not too large. As the number of -! local summands increases, the number of integers required increases. -! The number of MPI_Allreduce calls is either 2 (specifying nothing) or -! 1 (specifying gbl_max_nsummands, arr_max_levels, and arr_gbl_max -! correctly). When specifying arr_max_levels and arr_gbl_max +! in the cost of the MPI_Allreduce as a function of vector length. +! Rather the cost is more a function of (a) the number of integers +! required to represent an individual summand and (b) the number of +! MPI_Allreduce calls. The number of integers required to represent an +! individual summand is 1 or 2 when using 8-byte integers for 8-byte +! real summands when the number of local summands and number of MPI +! tasks are not too large. As the magnitude of either of these increases, +! the number of integers required increases. The number of +! MPI_Allreduce calls is either 2 (specifying nothing or just +! arr_max_levels and arr_gbl_max correctly) or 1 (specifying +! gbl_max_nsummands, arr_max_levels, and arr_gbl_max correctly). +! When specifying gbl_max_nsummands, arr_max_levels, or arr_gbl_max ! incorrectly, 3 or 4 MPI_Allreduce calls will be required. ! +! Description of alternative (DDPDD) algorithm: +!---------------------------------------------- ! The alternative algorithm is a minor modification of a parallel ! implementation of David Bailey's routine DDPDD by Helen He -! and Chris Ding. Bailey uses the Knuth trick to implement quadruple -! precision summation of double precision values with 10 double -! precision operations. The advantage of this algorithm is that +! and Chris Ding. See, for example, +! Y. He, and C. 
Ding, 'Using Accurate Arithmetics to Improve +! Numerical Reproducibility and Stability in Parallel Applications,' +! J. Supercomputing, vol. 18, no. 3, 2001, pp. 259–277 +! and the citations therein. Bailey uses the Knuth trick to implement +! quadruple precision summation of double precision values with 10 +! double precision operations. The advantage of this algorithm is that ! it requires a single MPI_Allreduce and is less expensive per summand -! than is the fixed precision algorithm. The disadvantage is that it +! than is the integer vector algorithm. The disadvantage is that it ! is not guaranteed to be reproducible (though it is reproducible -! much more often than is the standard algorithm). This alternative -! is used when the optional parameter ddpdd_sum is set to .true. It is -! also used if the fixed precision algorithm radix assumption does not -! hold. +! much more often than is the standard floating point algorithm). +! This alternative is used when the optional parameter ddpdd_sum is +! set to .true. It is also used if the integer vector algorithm radix +! assumption does not hold. ! -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Arguments ! @@ -328,7 +506,7 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & logical, intent(in), optional :: ddpdd_sum ! use ddpdd algorithm instead - ! of fixed precision algorithm + ! of integer vector algorithm logical, intent(in), optional :: allow_infnan ! if .true., allow INF or NaN input values. @@ -351,11 +529,11 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & integer, intent(in), optional :: gbl_max_nsummands ! maximum of nsummand over all - ! processes + ! MPI tasks integer, intent(out), optional :: gbl_max_nsummands_out ! calculated maximum nsummands - ! over all processes + ! over all MPI tasks integer, intent(in), optional :: gbl_count ! 
was total number of summands; @@ -377,8 +555,8 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & real(r8), intent(out), optional :: rel_diff(2,nflds) ! relative and absolute - ! differences between fixed - ! and floating point sums + ! differences between integer + ! vector and floating point sums integer, intent(in), optional :: commid ! MPI communicator @@ -418,11 +596,11 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & ! input array integer :: omp_nthreads ! number of OpenMP threads integer :: mpi_comm ! MPI subcommunicator - integer :: mypid ! MPI process ID (COMM_WORLD) - integer :: tasks ! number of MPI processes + integer :: mypid ! MPI task ID (COMM_WORLD) + integer :: tasks ! number of MPI tasks integer :: ierr ! MPI error return integer :: ifld, isum, ithread ! loop variables - integer :: max_nsummands ! max nsummands over all processes + integer :: max_nsummands ! max nsummands over all MPI tasks ! or threads (used in both ways) integer, allocatable :: isum_beg(:), isum_end(:) @@ -447,26 +625,30 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & integer :: max_levels(nflds) ! maximum number of levels of ! integer expansion to use integer :: max_level ! maximum value in max_levels + integer :: extra_levels ! number of extra levels needed + ! to guarantee that sum over threads + ! or tasks does not cause overflow - real(r8) :: xmax_nsummands ! dble of max_nsummands + real(r8) :: xmax_nsummands ! real(max_nsummands,r8) real(r8) :: arr_lsum(nflds) ! local sums real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using ! fast, nonreproducible, ! floating point alg. real(r8) :: abs_diff ! absolute difference between - ! fixed and floating point + ! integer vector and floating point ! sums + !WHL mod character(len=64) :: binary_str ! 
string to represent 64 bits of i8 integer integer :: n - + !end WHL mod #ifdef _OPENMP integer omp_get_max_threads external omp_get_max_threads #endif - -!----------------------------------------------------------------------- ! -! initialize local statistics variables +!------------------------------------------------------------------------ +! +! Initialize local statistics variables gbl_lor_red = 0 gbl_max_red = 0 repro_sum_fast = 0 @@ -474,31 +656,40 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & repro_sum_both = 0 nonrepro_sum = 0 -! set MPI communicator +! Set MPI communicator if ( present(commid) ) then mpi_comm = commid else mpi_comm = MPI_COMM_WORLD endif -#ifdef TIMING - call t_barrierf('sync_repro_sum',mpi_comm) +#ifndef EAMXX_STANDALONE +!WHL - commented out since the profile mod does not include tbarrier_f +! call t_barrierf('sync_repro_sum',mpi_comm) #endif -! check whether should abort if input contains NaNs or INFs + +! Check whether should abort if input contains NaNs or INFs abort_inf_nan = .not. repro_sum_allow_infnan if ( present(allow_infnan) ) then abort_inf_nan = .not. allow_infnan endif -#ifdef TIMING + +! With Fujitsu always abort on NaNs or INFs in input +#ifdef CPRFJ + abort_inf_nan = .true. +#endif + +#ifndef EAMXX_STANDALONE call t_startf('cism_reprosum_INF_NaN_Chk') #endif -! initialize flags to indicate that no NaNs or INFs are present in the input data + +! Initialize flags to indicate that no NaNs or INFs are present in the input data inf_nan_gchecks = .false. arr_gsum_infnan = .false. !TODO - Remove the inf_nan option; assume abort_inf_nan = T, as in CICE if (abort_inf_nan) then -! check whether input contains NaNs or INFs, and abort if so +! Check whether input contains NaNs or INFs, and abort if so nan_check = any(cism_infnan_isnan(arr)) inf_check = any(cism_infnan_isinf(arr)) @@ -509,20 +700,23 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & if ((nan_count > 0) .or. 
(inf_count > 0)) then call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) -!! write(s_logunit,37) real(nan_count,r8), real(inf_count,r8), mypid +! write(s_logunit,37) real(nan_count,r8), real(inf_count,r8), mypid write(iulog,37) real(nan_count,r8), real(inf_count,r8), mypid -37 format("CISM_REPROSUM_CALC: Input contains ",e12.5, & - " NaNs and ", e12.5, " INFs on process ", i7) -!! call shr_sys_abort("shr_reprosum_calc ERROR: NaNs or INFs in input") - write(iulog,*) "cism_reprosum_calc ERROR: NaNs or INFs in input" +37 format("cism_REPROSUM_CALC: Input contains ",e12.5, & + " NaNs and ", e12.5, " INFs on MPI task ", i7) + !WHL mod +! call shr_sys_abort("shr_reprosum_calc ERROR: NaNs or INFs in input") + write(iulog,*) 'cism_reprosum_calc ERROR: NaNs or INFs in input' call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + !end WHL mod endif endif +#ifndef CPRFJ else -! determine whether any fields contain NaNs or INFs, and avoid processing them +! Determine whether any fields contain NaNs or INFs, and avoid processing them ! via integer expansions inf_nan_lchecks = .false. @@ -531,69 +725,79 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & inf_nan_lchecks(2,ifld) = any(cism_infnan_isposinf(arr(:,ifld))) inf_nan_lchecks(3,ifld) = any(cism_infnan_isneginf(arr(:,ifld))) end do -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_startf("repro_sum_allr_lor") #endif call mpi_allreduce (inf_nan_lchecks, inf_nan_gchecks, 3*nflds, & MPI_LOGICAL, MPI_LOR, mpi_comm, ierr) gbl_lor_red = 1 -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("repro_sum_allr_lor") #endif + do ifld=1,nflds arr_gsum_infnan(ifld) = any(inf_nan_gchecks(:,ifld)) enddo +#endif endif -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_stopf('cism_reprosum_INF_NaN_Chk') #endif -! check whether should use cism_reprosum_ddpdd algorithm + +! Check whether should use cism_reprosum_ddpdd algorithm use_ddpdd_sum = repro_sum_use_ddpdd if ( present(ddpdd_sum) ) then use_ddpdd_sum = ddpdd_sum endif -! 
check whether intrinsic-based algorithm will work on this system +! Check whether intrinsic-based algorithm will work on this system ! (requires floating point and integer bases to be the same) ! If not, always use ddpdd. - use_ddpdd_sum = use_ddpdd_sum .or. (radix(0._r8) /= radix(0_i8)) + use_ddpdd_sum = use_ddpdd_sum .or. (radix(1.0_r8) /= radix(1_i8)) if ( use_ddpdd_sum ) then -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_startf('cism_reprosum_ddpdd') #endif + call cism_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & - nflds, mpi_comm) + nflds, mpi_comm) repro_sum_fast = 1 -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_stopf('cism_reprosum_ddpdd') #endif + else -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_startf('cism_reprosum_int') #endif -! get number of MPI tasks + +! Get number of MPI tasks call mpi_comm_size(mpi_comm, tasks, ierr) -! get number of OpenMP threads +! Get number of OpenMP threads #ifdef _OPENMP omp_nthreads = omp_get_max_threads() #else omp_nthreads = 1 #endif -! see if have sufficient information to not require max/min allreduce +! See if have sufficient information to not require max/min allreduce recompute = .true. validate = .false. if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then recompute = .false. -! setting lower bound on max_level*nflds to be 64 to improve OpenMP +! Setting lower bound on max_level*nflds to be 64 to improve OpenMP ! performance for loopb in cism_reprosum_int max_level = (64/nflds) + 1 do ifld=1,nflds - if ((arr_gbl_max(ifld) .ge. 0.0_r8) .and. & + if ((arr_gbl_max(ifld) >= 0.0_r8) .and. & (arr_max_levels(ifld) > 0)) then arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld)) @@ -607,9 +811,9 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & if (.not. recompute) then -! determine maximum number of summands in local phases of the +! Determine maximum number of summands in local phases of the ! 
algorithm -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_startf("repro_sum_allr_max") #endif if ( present(gbl_max_nsummands) ) then @@ -625,40 +829,63 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & MPI_INTEGER, MPI_MAX, mpi_comm, ierr) gbl_max_red = 1 endif -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("repro_sum_allr_max") #endif -! determine maximum shift. Shift needs to be small enough that summation -! does not exceed maximum number of digits in i8. -! if requested, return max_nsummands before it is redefined +! Determine maximum shift. Shift needs to be small enough that summation, +! in absolute value, does not exceed maximum value representable by i8. + +! If requested, return max_nsummands before it is redefined if ( present( gbl_max_nsummands_out) ) then gbl_max_nsummands_out = max_nsummands endif -! summing within each thread first +! Summing within each thread first (adding 1 to max_nsummands +! to ensure that integer division rounds up) max_nsummands = (max_nsummands/omp_nthreads) + 1 ! then over threads and tasks max_nsummands = max(max_nsummands, tasks*omp_nthreads) ! A 'max' is used in the above calculation because the partial sum for ! each thread, calculated in cism_reprosum_int, is postprocessed so that ! each integer in the corresponding vector of integers is reduced in -! magnitude to be less than (radix(IX_8)**arr_max_shift). Therefore, +! magnitude to be less than (radix(1_i8)**arr_max_shift). Therefore, ! the maximum shift can be calculated separately for per thread sums ! and sums over threads and tasks, and the smaller value used. This is ! equivalent to using max_nsummands as defined above. - xmax_nsummands = dble(max_nsummands) - arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1) + xmax_nsummands = real(max_nsummands,r8) + arr_max_shift = digits(1_i8) - (exponent(xmax_nsummands) + 1) if (arr_max_shift < 2) then -!! call shr_sys_abort('repro_sum failed: number of summands too '// & -!! 
'large for fixed precision algorithm' ) - write(iulog,*) & - 'repro_sum failed: number of summands too large for fixed precision algorithm' + !WHL mod +! call shr_sys_abort('repro_sum failed: number of summands too '// & +! 'large for integer vector algorithm' ) + write(iulog,*) 'repro_sum failed: number of summands too '// & + 'large for integer vector algorithm' call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + ! end WHL mod endif - -! calculate sum +! Note: by construction, each floating point value will be decomposed +! into a vector of integers each component of which will be strictly +! less than radix(1_i8)**arr_max_shift in absolute value, and the +! summation of max_nsummands of these, again in absolute value, will +! then be less than +! radix(1_i8)**(arr_max_shift + exponent(xmax_nsummands)) +! or radix(1_i8)**(digits(1_i8) - 1). This is more conservative than +! necessary, but it also allows the postprocessing mentioned above +! (and described later) to proceed without danger of introducing +! overflow. + +! Determine additional number of levels needed to support the +! postprocessing that reduces the magnitude of each component +! of the integer vector of the partial sum for each thread +! to be less than (radix(1_i8)**arr_max_shift). + extra_levels = (digits(1_i8) - 1)/arr_max_shift +! Extra levels are indexed by (-(extra_levels-1):0) +! Derivation of this is described in the comments in +! cism_reprosum_int. + +! Calculate sum if (present(repro_sum_validate)) then validate = repro_sum_validate else @@ -666,15 +893,16 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & endif call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & nflds, arr_max_shift, arr_gmax_exp, & - arr_max_levels, max_level, arr_gsum_infnan, & - validate, recompute, omp_nthreads, mpi_comm) + arr_max_levels, max_level, extra_levels, & + arr_gsum_infnan, validate, recompute, & + omp_nthreads, mpi_comm) -! record statistics, etc. +! Record statistics, etc. 
repro_sum_fast = 1 if (recompute) then repro_sum_both = 1 else -! if requested, return specified levels and upper bounds on maxima +! If requested, return specified levels and upper bounds on maxima if ( present(arr_max_levels_out) ) then do ifld=1,nflds arr_max_levels_out(ifld) = arr_max_levels(ifld) @@ -689,40 +917,38 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & endif endif -! do not have sufficient information; calculate global max/min and +! Do not have sufficient information; calculate global max/min and ! use to compute required number of levels if (recompute) then -! record statistic +! Record statistic repro_sum_slow = 1 -! determine maximum and minimum (non-zero) summand values and +! Determine maximum and minimum (non-zero) summand values and ! maximum number of local summands -! allocate thread-specific work space +! Allocate thread-specific work space allocate(arr_tlmax_exp(nflds,omp_nthreads)) allocate(arr_tlmin_exp(nflds,omp_nthreads)) allocate(isum_beg(omp_nthreads)) allocate(isum_end(omp_nthreads)) -! split summand index range over OpenMP threads -!! call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) - !WHL - debug - call split_indices(dsummands, omp_nthreads, isum_beg, isum_end) +! Split summand index range over OpenMP threads + call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) !$omp parallel do & !$omp default(shared) & !$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax) do ithread=1,omp_nthreads -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_startf('repro_sum_loopa') #endif do ifld=1,nflds - arr_exp_tlmin = MAXEXPONENT(1._r8) - arr_exp_tlmax = MINEXPONENT(1._r8) + arr_exp_tlmin = MAXEXPONENT(1.0_r8) + arr_exp_tlmax = MINEXPONENT(1.0_r8) if (.not. arr_gsum_infnan(ifld)) then do isum=isum_beg(ithread),isum_end(ithread) - if (arr(isum,ifld) .ne. 
0.0_r8) then + if (arr(isum,ifld) /= 0.0_r8) then arr_exp = exponent(arr(isum,ifld)) arr_exp_tlmin = min(arr_exp,arr_exp_tlmin) arr_exp_tlmax = max(arr_exp,arr_exp_tlmax) @@ -732,7 +958,7 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax end do -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf('repro_sum_loopa') #endif end do @@ -746,77 +972,100 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & arr_lextremes(0,:) = -nsummands arr_lextremes(1:nflds,1) = -arr_lmax_exp(:) arr_lextremes(1:nflds,2) = arr_lmin_exp(:) -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_startf("repro_sum_allr_minmax") #endif call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), & MPI_INTEGER, MPI_MIN, mpi_comm, ierr) -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("repro_sum_allr_minmax") #endif max_nsummands = -arr_gextremes(0,1) arr_gmax_exp(:) = -arr_gextremes(1:nflds,1) arr_gmin_exp(:) = arr_gextremes(1:nflds,2) -! if a field is identically zero or contains INFs or NaNs, arr_gmin_exp -! still equals MAXEXPONENT and arr_gmax_exp still equals MINEXPONENT. -! In this case, set arr_gmin_exp = arr_gmax_exp = MINEXPONENT +! If a field is identically zero or contains INFs or NaNs, arr_gmin_exp +! still equals MAXEXPONENT and arr_gmax_exp still equals MINEXPONENT. +! In this case, set arr_gmin_exp = arr_gmax_exp = MINEXPONENT do ifld=1,nflds arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld)) enddo -! if requested, return upper bounds on observed maxima +! If requested, return upper bounds on observed maxima if ( present(arr_gbl_max_out) ) then do ifld=1,nflds arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld)) enddo endif -! if requested, return max_nsummands before it is redefined +! If requested, return max_nsummands before it is redefined if ( present( gbl_max_nsummands_out) ) then gbl_max_nsummands_out = max_nsummands endif -! 
determine maximum shift (same as in previous branch, but with calculated -! max_nsummands). Shift needs to be small enough that summation does not -! exceed maximum number of digits in i8. +! Determine maximum shift (same as in previous branch, but with calculated +! max_nsummands). Shift needs to be small enough that summation, in absolute +! value, does not exceed maximum value representable by i8. -! summing within each thread first +! Summing within each thread first (adding 1 to max_nsummands +! to ensure that integer division rounds up) max_nsummands = (max_nsummands/omp_nthreads) + 1 ! then over threads and tasks max_nsummands = max(max_nsummands, tasks*omp_nthreads) ! A 'max' is used in the above calculation because the partial sum for ! each thread, calculated in cism_reprosum_int, is postprocessed so that ! each integer in the corresponding vector of integers is reduced in -! magnitude to be less than (radix(IX_8)**arr_max_shift). Therefore, +! magnitude to be less than (radix(1_i8)**arr_max_shift). Therefore, ! the maximum shift can be calculated separately for per thread sums ! and sums over threads and tasks, and the smaller value used. This is ! equivalent to using max_nsummands as defined above. - xmax_nsummands = dble(max_nsummands) - arr_max_shift = digits(0_i8) - (exponent(xmax_nsummands) + 1) + xmax_nsummands = real(max_nsummands,r8) + arr_max_shift = digits(1_i8) - (exponent(xmax_nsummands) + 1) if (arr_max_shift < 2) then -!! call shr_sys_abort('repro_sum failed: number of summands too '// & -!! 'large for fixed precision algorithm' ) - write(iulog,*) & - 'repro_sum failed: number of summands too large for fixed precision algorithm' + !WHL mod +! call shr_sys_abort('repro_sum failed: number of summands too '// & +! 'large for integer vector algorithm' + write(iulog,*) 'repro_sum failed: number of summands too '// & + 'large for integer vector algorithm' call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + ! end WHL mod endif - -! 
determine maximum number of levels required for each field -! ((digits(0.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift) -! + 1 because first truncation probably does not involve a maximal shift -! + 1 to guarantee that the integer division rounds up (not down) +! Note: by construction, each floating point value will be decomposed +! into a vector of integers each component of which will be strictly +! less than radix(1_i8)**arr_max_shift in absolute value, and the +! summation of max_nsummands of these, again in absolute value, will +! then be less than +! radix(1_i8)**(arr_max_shift + exponent(xmax_nsummands)) +! or radix(1_i8)**(digits(1_i8) - 1). This is more conservative than +! necessary, but it also allows the postprocessing mentioned above +! (and described later) to proceed without danger of introducing +! overflow. + +! Determine maximum number of levels required for each field. +! Need enough levels to represent both the smallest and largest +! nonzero summands (in absolute value), and any values in between. +! The number of digits from the most significant digit in the +! largest summand to the most significant digit in the smallest +! summand is (arr_gmax_exp(ifld)-arr_gmin_exp(ifld)), and the maximum +! number of digits needed to represent the smallest value is +! digits(1.0_r8). Divide this total number of digits by the number of +! digits per level (arr_max_shift) to get the number of levels +! ((digits(1.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift) +! with some tweaks: +! + 1 because first truncation for any given summand probably does +! not involve a maximal shift (but this adds only one to the total) +! + 1 to guarantee that the integer division rounds up (not down) ! (setting lower bound on max_level*nflds to be 64 to improve OpenMP ! 
performance for loopb in cism_reprosum_int) max_level = (64/nflds) + 1 do ifld=1,nflds max_levels(ifld) = 2 + & - ((digits(0.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) & + ((digits(1.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) & / arr_max_shift) if ( present(arr_max_levels) .and. (.not. validate) ) then -! if validate true, then computation with arr_max_levels failed -! previously +! If validate true, then computation with arr_max_levels failed +! previously if ( arr_max_levels(ifld) > 0 ) then max_levels(ifld) = & min(arr_max_levels(ifld),max_levels(ifld)) @@ -826,37 +1075,52 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & max_level = max_levels(ifld) enddo -! if requested, return calculated levels +! If requested, return calculated levels if ( present(arr_max_levels_out) ) then do ifld=1,nflds arr_max_levels_out(ifld) = max_levels(ifld) enddo endif -! calculate sum +! Determine additional number of levels needed to support the +! postprocessing that reduces the magnitude of each component +! of the integer vector of the partial sum for each thread +! to be less than (radix(1_i8)**arr_max_shift). + extra_levels = (digits(1_i8) - 1)/arr_max_shift +! Extra levels are indexed by (-(extra_levels-1):0) +! Derivation of this is described in the comments in +! cism_reprosum_int. + +! Calculate sum validate = .false. call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & nflds, arr_max_shift, arr_gmax_exp, & - max_levels, max_level, arr_gsum_infnan, & - validate, recompute, omp_nthreads, mpi_comm) + max_levels, max_level, extra_levels, & + arr_gsum_infnan, validate, recompute, & + omp_nthreads, mpi_comm) endif -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_stopf('cism_reprosum_int') #endif + endif -! compare fixed and floating point results +! 
Compare integer vector and floating point results if ( present(rel_diff) ) then if (cism_reprosum_reldiffmax >= 0.0_r8) then -#ifdef TIMING - call t_barrierf('sync_nonrepro_sum',mpi_comm) +#ifndef EAMXX_STANDALONE + !WHL - commented out since the profile mod does not include tbarrier_f +! call t_barrierf('sync_nonrepro_sum',mpi_comm) +#endif +#ifndef EAMXX_STANDALONE call t_startf('nonrepro_sum') #endif -! record statistic +! Record statistic nonrepro_sum = 1 -! compute nonreproducible sum - arr_lsum(:) = 0._r8 +! Compute nonreproducible sum + arr_lsum(:) = 0.0_r8 !$omp parallel do & !$omp default(shared) & !$omp private(ifld, isum) @@ -867,17 +1131,21 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & end do endif end do -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_startf("nonrepro_sum_allr_r8") #endif call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, & MPI_REAL8, MPI_SUM, mpi_comm, ierr) -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("nonrepro_sum_allr_r8") +#endif +#ifndef EAMXX_STANDALONE call t_stopf('nonrepro_sum') #endif -! determine differences + +! Determine differences !$omp parallel do & !$omp default(shared) & !$omp private(ifld, abs_diff) @@ -914,7 +1182,7 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & endif end do -! return statistics +! Return statistics if ( present(repro_sum_stats) ) then repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow @@ -924,38 +1192,44 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & repro_sum_stats(6) = repro_sum_stats(6) + gbl_lor_red endif + !WHL mod: optional diagnostics if (verbose_reprosum) then call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) - if (mypid == 0) then - write(iulog,*) 'Exit reprosum, nflds =', nflds - write(iulog,*) ' n, arr_gsum, binary_str:' + if (mypid == 0) then + write(iulog,*) 'Exit reprosum, nflds =', nflds + write(iulog,*) ' n, arr_gsum, binary_str:' !! 
do n = 1, nflds - do n = 1, min(2,nflds) - call double_to_binary(arr_gsum(n), binary_str) - write(iulog,*) n, arr_gsum(n), binary_str - enddo - endif - endif + do n = 1, min(2,nflds) + call double_to_binary(arr_gsum(n), binary_str) + write(iulog,*) n, arr_gsum(n), binary_str + enddo + endif + endif + ! end WHL mod + + end subroutine cism_reprosum_calc - end subroutine cism_reprosum_calc ! !======================================================================== ! - subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & + subroutine cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, & arr_max_shift, arr_gmax_exp, max_levels, & - max_level, skip_field, validate, recompute, & - omp_nthreads, mpi_comm ) -!---------------------------------------------------------------------- + max_level, extra_levels, skip_field, & + validate, recompute, omp_nthreads, mpi_comm ) +!------------------------------------------------------------------------ ! ! Purpose: -! Compute the global sum of each field in "arr" using the indicated +! Compute the global sum of each field in 'arr' using the indicated ! communicator with a reproducible yet scalable implementation based -! on a fixed point algorithm. The accuracy of the fixed point algorithm -! is controlled by the number of "levels" of integer expansion, the -! maximum value of which is specified by max_level. +! on first converting each floating point summand into an equivalent +! representation using a vector of integers, summing the integer +! vectors, then converting the resulting sum back to a floating point +! representation. The accuracy of the integer vector algorithm is +! controlled by the number of 'levels' of integer expansion, the maximum +! value of which is specified by max_level. ! -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Arguments ! 
@@ -972,6 +1246,10 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & ! of integer expansion integer, intent(in) :: max_level ! maximum value in ! max_levels + integer, intent(in) :: extra_levels ! number of extra levels + ! needed to guarantee that + ! sum over threads or tasks + ! does not cause overflow integer, intent(in) :: omp_nthreads ! number of OpenMP threads integer, intent(in) :: mpi_comm ! MPI subcommunicator @@ -996,25 +1274,27 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & ! ! Local workspace ! - integer, parameter :: max_jlevel = & - 1 + (digits(0_i8)/digits(0.0_r8)) + integer, parameter :: max_svlevel_factor = & + 1 + (digits(1_i8)/digits(1.0_r8)) - integer(i8) :: i8_arr_tlsum_level(0:max_level,nflds,omp_nthreads) + integer(i8) :: i8_arr_tlsum_level(-(extra_levels-1):max_level,nflds,omp_nthreads) ! integer vector representing local ! sum (per thread, per field) - integer(i8) :: i8_arr_lsum_level((max_level+3)*nflds) + integer(i8) :: i8_arr_lsum_level((max_level+extra_levels+2)*nflds) ! integer vector representing local ! sum integer(i8) :: i8_arr_level ! integer part of summand for current ! expansion level - integer(i8) :: i8_arr_gsum_level((max_level+3)*nflds) + integer(i8) :: i8_arr_gsum_level((max_level+extra_levels+2)*nflds) ! integer vector representing global ! sum - integer(i8) :: IX_8 ! integer representation of current - ! jlevels of X_8 ('part' of - ! i8_arr_gsum_level) + integer(i8) :: i8_gsum_level(-(extra_levels-1):max_level) + ! integer vector representing global + ! sum for one field + integer(i8) :: IX_8 ! integer representation of r8 value integer(i8) :: i8_sign ! sign global sum - integer(i8) :: i8_radix ! radix for i8 variables + integer(i8) :: i8_radix ! radix for i8 variables (and r8 + ! variables by earlier if-test) integer :: max_error(nflds,omp_nthreads) ! accurate upper bound on data? 
@@ -1024,7 +1304,7 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads) ! range of summand indices for each ! OpenMP thread - integer :: ifld, isum, ithread + integer :: ifld, isum, ithread, jlevel ! loop variables integer :: arr_exp ! exponent of summand integer :: arr_shift ! exponent used to generate integer @@ -1035,14 +1315,31 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & ! expansion of current ifld integer :: voffset ! modification to offset used to ! include validation metrics + integer :: min_level ! index of minimum levels (including + ! extra levels) for i8_arr_tlsum_level integer :: ioffset ! offset(ifld) - integer :: jlevel ! number of floating point 'pieces' - ! extracted from a given i8 integer + integer :: svlevel ! number of summands in summand_vector integer :: ierr ! MPI error return - integer :: LX(max_jlevel) ! exponent of X_8 (see below) + integer :: LX ! exponent of X_8 (see below) integer :: veclth ! total length of i8_arr_lsum_level - integer :: sum_digits ! lower bound on number of significant - ! in integer expansion of sum + integer :: i8_digit_count ! number of digits in integer + ! expansion of sum + integer :: i8_begin_level ! level starting from in + ! creating next 'exactly representable' + ! floating point value from modified + ! integer expansion of the sum + integer :: i8_trunc_level ! level at which the number of digits in + ! the modified integer expansion of the + ! sum exceeds the number of representable + ! digits in the floating point sum + integer :: i8_trunc_loc ! location of last digit at i8_trunc_level + ! in the modified integer expansion of the + ! sum that is representable in the floating + ! point sum + integer(i8) :: i8_trunc_level_rem + ! truncated digits at i8_trunc_level + ! in the modified integer expansion + ! of the sum integer :: curr_exp ! exponent of partial sum during ! 
reconstruction from integer vector integer :: corr_exp ! exponent of current summand in @@ -1051,35 +1348,38 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & real(r8) :: arr_frac ! fraction of summand real(r8) :: arr_remainder ! part of summand remaining after ! current level of integer expansion - real(r8) :: X_8(max_jlevel) ! r8 vector representation of current + real(r8) :: X_8 ! r8 representation of current ! i8_arr_gsum_level - real(r8) :: RX_8 ! r8 representation of difference - ! between current i8_arr_gsum_level - ! and current jlevels of X_8 - ! (== IX_8). Also used in final - ! scaling step - - logical :: first ! flag used to indicate that just - ! beginning reconstruction of sum - ! from integer vector - - !WHL - debug - integer :: mypid, k - call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) -! -!----------------------------------------------------------------------- + real(r8) :: RX_8 ! r8 representation of (other) + ! integers used in calculation. + real(r8) :: summand_vector((max_level+extra_levels)*max_svlevel_factor) + ! vector of r8 values generated from + ! integer vector representation to be + ! summed to generate global sum + + logical :: first_stepd_iteration + ! flag used to indicate whether first + ! time through process of converting + ! vector of integers into a floating + ! point value, as it requires + ! special logic +! +!------------------------------------------------------------------------ ! Save radix of i8 variables in an i8 variable i8_radix = radix(IX_8) ! If validating upper bounds, reserve space for validation metrics -! In both cases, reserve an extra level for overflow from the top level +! In both cases, reserve extra levels for overflows from the top level if (validate) then - voffset = 3 + voffset = extra_levels + 2 else - voffset = 1 + voffset = extra_levels endif - ! compute offsets for each field +! 
For convenience, define minimum level index for i8_arr_tlsum_level + min_level = -(extra_levels-1) + +! Compute offsets for each field offset(1) = voffset do ifld=2,nflds offset(ifld) = offset(ifld-1) & @@ -1087,29 +1387,23 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & enddo veclth = offset(nflds) + max_levels(nflds) -! split summand index range over OpenMP threads - call split_indices(dsummands, omp_nthreads, isum_beg, isum_end) +! Split summand index range over OpenMP threads + call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) -! convert local summands to vector of integers and sum +! Convert local summands to vector of integers and sum ! (Using scale instead of set_exponent because arr_remainder may not be -! "normal" after level 1 calculation) +! 'normal' after level 1 calculation) i8_arr_lsum_level(:) = 0_i8 - !WHLmod - initialized the gsum also. - ! This avoids having some garbage (lots of nonzero values) in the vector returned from mpi_reduce_sum. - ! (Though I don't think the nonzero values do any harm - i8_arr_gsum_level(:) = 0_i8 - !end WHLmod - !$omp parallel do & !$omp default(shared) & !$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, & !$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8) do ithread=1,omp_nthreads -#ifdef TIMING - call t_startf('repro_sum_loopb') +#ifndef EAMXX_STANDALONE + call t_startf('repro_sum_loopb') #endif - do ifld=1,nflds + do ifld=1,nflds ioffset = offset(ifld) max_error(ifld,ithread) = 0 @@ -1119,25 +1413,24 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & if (skip_field(ifld)) cycle do isum=isum_beg(ithread),isum_end(ithread) - arr_remainder = 0.0_r8 - if (arr(isum,ifld) .ne. 0.0_r8) then + if (arr(isum,ifld) /= 0.0_r8) then arr_exp = exponent(arr(isum,ifld)) arr_frac = fraction(arr(isum,ifld)) -! test that global maximum upper bound is an upper bound +! 
Test that global maximum upper bound is an upper bound if (arr_exp > arr_gmax_exp(ifld)) then max_error(ifld,ithread) = 1 exit endif -! calculate first shift +! Calculate first shift arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) -! determine first (probably) nonzero level (assuming initial fraction is -! 'normal' - algorithm still works if this is not true) -! NOTE: this is critical; scale will set to zero if min exponent is too small. +! Determine first (probably) nonzero level (assuming initial fraction is +! 'normal' - algorithm still works if this is not true) +! NOTE: this is critical; scale will set to zero if min exponent is too small. if (arr_shift < 1) then ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) @@ -1150,67 +1443,118 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & ilevel = 1 endif - if (ilevel .le. max_levels(ifld)) then -! apply first shift/truncate, add it to the relevant running + if (ilevel <= max_levels(ifld)) then +! Apply first shift/truncate, add it to the relevant running ! sum, and calculate the remainder. arr_remainder = scale(arr_frac,arr_shift) i8_arr_level = int(arr_remainder,i8) i8_arr_tlsum_level(ilevel,ifld,ithread) = & i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level arr_remainder = arr_remainder - i8_arr_level - -! while the remainder is non-zero, continue to shift, truncate, + +! While the remainder is non-zero, continue to shift, truncate, ! sum, and calculate new remainder - do while ((arr_remainder .ne. 0.0_r8) & + do while ((arr_remainder /= 0.0_r8) & .and. (ilevel < max_levels(ifld))) ilevel = ilevel + 1 arr_remainder = scale(arr_remainder,arr_max_shift) i8_arr_level = int(arr_remainder,i8) i8_arr_tlsum_level(ilevel,ifld,ithread) = & - i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level - + i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level arr_remainder = arr_remainder - i8_arr_level - - enddo ! 
arr_remainder /= 0.0_r8 + enddo endif - endif ! arr(isum,ifld) /= 0.0_r8 + endif - if (arr_remainder .ne. 0.0_r8) then + if (arr_remainder /= 0.0_r8) then not_exact(ifld,ithread) = 1 endif - enddo ! isum - -! postprocess integer vector to eliminate potential for overlap in the following -! sums over threads and processes: if value larger than or equal to -! (radix(IX_8)**arr_max_shift), add this 'overlap' to next larger integer in -! vector, resulting in nonoverlapping ranges for each component. Note that -! "ilevel-1==0" corresponds to an extra level used to guarantee that the sums -! over threads and processes do not overflow for ilevel==1. - do ilevel=max_levels(ifld),1,-1 - RX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8 = int(scale(RX_8,-arr_max_shift),i8) - if (IX_8 .ne. 0_i8) then + enddo +! Postprocess integer vector to eliminate possibility of overflow +! during subsequent sum over threads and tasks, as per earlier +! comment on logic behind definition of max_nsummands. If value at a +! given level is larger than or equal to +! (radix(1_i8)**arr_max_shift), subtract this 'overlap' from the +! current value and add it (appropriately shifted) to the value at +! the next smaller level in the vector. +! (a) As described earlier, prior to this postprocessing the integer +! components are each strictly less than +! radix(1_i8)**(digits(1_i8) - 1) in absolute value. So, after +! shifting, the absolute value of the amount added to level +! max_levels(ifld)-1 from level max_levels(ifld) is less than +! radix(1_i8)**(digits(1_i8) - 1 - arr_max_shift) with the +! resulting sum, in absolute value, being less than +! (radix(1_i8)**(digits(1_i8) - 1))*(1 + radix(1_i8)**(-arr_max_shift)). +! Any overlap from this component is then added to the level +! max_levels(ifld)-2, etc., with resulting intermediate sums, in +! absolute value, for levels 1 to max_levels(ifld) being bounded +! from above by +! 
(radix(1_i8)**(digits(1_i8) - 1))*sum{i=0,inf}(radix(1_i8)**(-i*arr_max_shift)). +! Since radix(1_i8) >= 2 and arr_max_shift is also required to be +! >= 2 (otherwise the code exits with an error) this is less than +! or equal to +! (radix(1_i8)**(digits(1_i8) - 1))*sum{i=0,inf}(2**(-2i)), +! or +! (radix(1_i8)**(digits(1_i8) - 1))*(4/3). +! In summary, this shows that no absolute value generated during +! this process will exceed the maximum value representable in i8, +! i.e. (radix(1_i8)**(digits(1_i8)) - 1), as long as +! digits(1_i8) >= 2. +! (b) 'ilevel==0,...,-(extra_levels-1)' correspond to extra levels +! used to continue the above process until values at all levels +! are less than radix(1_i8)**arr_max_shift in absolute value +! (except level -(extra_levels-1), as described below). The +! result of shifting the overlap from level 1 to level 0, which +! is initially zero, is bounded in absolute value by +! (radix(1_i8)**(digits(1_i8) - 1 - arr_max_shift))*(4/3). +! After removing any overlap from level 0, the upper bound for +! level -1, which is also initially zero, is +! (radix(1_i8)**(digits(1_i8) - 1 - 2*arr_max_shift))*(4/3). +! Continuing the process, when get to level -(extra_levels-1), +! the upper bound is +! (radix(1_i8)**(digits(1_i8) - 1 - extra_levels*arr_max_shift))*(4/3). +! If we define +! extra_levels = ceiling[(digits(1_i8) - 1)/arr_max_shift - 1] +! then the upper bound is +! (radix(1_i8)**(arr_max_shift))*(4/3). +! Setting +! extra_levels = (digits(1_i8) - 1)/arr_max_shift +! is then a slightly conservative estimate that achieves the same +! upper bound. While the above upper bound at level +! -(extra_levels-1)is a factor of (4/3) larger than the target +! radix(1_i8)**arr_max_shift, it is still small enough so that +! the sum over threads and tasks, bounded from above in absolute +! value by +! (radix(1_i8)**(digits(1_i8) - 1))*(4/3), +! will not cause an overflow at level -(extra_levels-1) as long as +! digits(1_i8) >= 2. 
+ do ilevel=max_levels(ifld),min_level+1,-1 + if (abs(i8_arr_tlsum_level(ilevel,ifld,ithread)) >= & + (i8_radix**arr_max_shift)) then + + IX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread) & + / (i8_radix**arr_max_shift) i8_arr_tlsum_level(ilevel-1,ifld,ithread) = & i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8 + IX_8 = IX_8*(i8_radix**arr_max_shift) i8_arr_tlsum_level(ilevel,ifld,ithread) = & i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8 endif enddo enddo -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf('repro_sum_loopb') #endif - enddo ! ithread + enddo -! sum contributions from different threads +! Sum contributions from different threads do ifld=1,nflds - ioffset = offset(ifld) do ithread = 1,omp_nthreads - do ilevel = 0,max_levels(ifld) + do ilevel = min_level,max_levels(ifld) i8_arr_lsum_level(ioffset+ilevel) = & i8_arr_lsum_level(ioffset+ilevel) & + i8_arr_tlsum_level(ilevel,ifld,ithread) @@ -1218,7 +1562,7 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & enddo enddo -! record if upper bound was inaccurate or if level expansion stopped +! Record if upper bound was inaccurate or if level expansion stopped ! before full accuracy was achieved if (validate) then do ifld=1,nflds @@ -1228,188 +1572,396 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & enddo endif -! sum integer vector element-wise +! Sum integer vector element-wise #if ( defined noI8 ) ! Workaround for when shr_kind_i8 is not supported. 
-#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_startf("repro_sum_allr_i4") #endif call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr) - -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("repro_sum_allr_i4") #endif #else -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_startf("repro_sum_allr_i8") #endif - call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr) - -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("repro_sum_allr_i8") #endif #endif +#ifndef EAMXX_STANDALONE + call t_startf('repro_sum_finalsum') +#endif ! Construct global sum from integer vector representation: ! 1) arr_max_shift is the shift applied to fraction(arr_gmax) . -! When shifting back, need to "add back in" true arr_gmax exponent. This was -! removed implicitly by working only with the fraction . -! 2) want to add levels into sum in reverse order (smallest to largest). However, -! even this can generate floating point rounding errors if signs of integers -! alternate. To avoid this, do some arithmetic with integer vectors so that all -! components have the same sign. This should keep relative difference between -! using different integer sizes (e.g. i8 and i4) to machine epsilon -! 3) assignment to X_8 will usually lose accuracy since maximum integer -! size is greater than the max number of 'digits' in r8 value (if xmax_nsummands -! correction not very large). Calculate remainder and add in first (since -! smaller). One correction is sufficient for r8 (53 digits) and i8 (63 digits). -! For r4 (24 digits) may need to correct twice. Code is written in a general -! fashion, to work no matter how many corrections are necessary (assuming -! max_jlevel parameter calculation is correct). +! When shifting back, need to 'add back in' the true arr_gmax exponent. +! This was removed implicitly by working only with the fraction. +! 
2) To avoid the possibility of catastrophic cancellation, and +! an unacceptable floating point rounding error, can do some arithmetic +! with the integer vector so that all components have the same sign. +! 3) If convert each integer in the integer vector to a floating +! point value and then add these together, smallest to largest, to +! calculate the final sum, there may be roundoff error in the least +! significant digit. This error will be sensitive to the particular +! floating values generated from the integer vector, and so will be +! sensitive to the number of levels in the vector and the implicit +! exponent associated with each level. So this approach is not +! guaranteed to be reproducible with respect to a change in the +! number of MPI tasks and OpenMP threads (as this changes the +! definition of max_nsummands, and thus also arr_max_shift). It is +! also not guaranteed to be reproducible with respect to changing +! the integer size, e.g. from i8 to i4, as this also changes +! arr_max_shift. However, can eliminate this potential loss of +! reproducibility by taking the following steps. +! a) Manipulate the integer vector so that +! i) the component values do not 'overlap', that is, the value +! represented by a component is strictly less than the value +! represented by the least significant digit in the previous +! component, and +! ii) all components are positive (saving the sign to be restored +! to the final result). +! b) Identify the digit in the resulting integer vector that is the +! last representable in the floating point representation, then +! truncate the vector at this point, i.e., all digits of lesser +! significance in the given component and all components +! representing digits of lesser significance (call this the +! remainder). +! c) Convert each integer component in the modified integer vector +! to its corresponding floating point value and sum the +! sequence. (Order is unimportant, as explained below, but here +! 
add largest to smallest.) +! d) Repeat (b) and (c) for the remainder (recursively, as +! necessary). +! e) Sum all floating point numbers generated by step (c), smallest +! to largest. +! f) Restore the sign. +! With the manipulations in (a) and (b), the summation in (c) is +! equivalent to concatenating the digits in the mantissas for the +! component summands, so rounding is irrelevant (so far). Repeating +! this with the remainder(s) generates a sequence of 'exact' +! floating point numbers. Summing these can still generate a +! rounding error in the least significant digit in the largest +! floating point value (which is the last representable digit in the +! final result), but the floating point values being summed and +! order of summation are independent of the number of levels and +! implicit exponents, so reproducibility is ensured. +! +! Note that assignment of an i8 integer value to an r8 floating point +! variable in step (c) can lead to a loss of accuracy because the +! maximum number of digits in the i8 integer can be greater than the +! maximum number of digits representable in the r8 variable (if the +! xmax_nsummands correction is not very large). With the same sign +! and nonoverlapping properties of the integer components, these lost +! digits will also not be representable in the final sum. The process +! described above of truncating at this last representable digit, and +! then separately generating floating point value(s) for the +! remainder, takes care of this automatically. Similar reasoning +! applies to r4 floating point values with either i8 or i4 integer +! components. recompute = .false. do ifld=1,nflds arr_gsum(ifld) = 0.0_r8 ioffset = offset(ifld) + svlevel = 0 -! if validate is .true., test whether the summand upper bound -! was exceeded on any of the processes +! If validate is .true., test whether the summand upper bound +! was exceeded on any of the MPI tasks if (validate) then - if (i8_arr_gsum_level(ioffset-voffset+1) .ne. 
0_i8) then + if (i8_arr_gsum_level(ioffset-voffset+1) /= 0_i8) then recompute = .true. endif endif if (.not. recompute) then +! Copy integer vector for current field from i8_arr_gsum_level, so that +! can be modified without changing i8_arr_gsum_level. (Preserving +! i8_arr_gsum_level unchanged is not necessary, but is convenient for debugging +! and makes indexing clearer and less error prone.) + i8_gsum_level(:) = 0_i8 + do ilevel=min_level,max_levels(ifld) + i8_gsum_level(ilevel) = i8_arr_gsum_level(ioffset+ilevel) + enddo -! preprocess integer vector: -! a) if value larger than or equal to (radix(IX_8)**arr_max_shift), add this 'overlap' -! to next larger integer in vector, resulting in nonoverlapping ranges for each -! component. Note that have "ilevel-1=0" level here as described above. - do ilevel=max_levels(ifld),1,-1 - RX_8 = i8_arr_gsum_level(ioffset+ilevel) - IX_8 = int(scale(RX_8,-arr_max_shift),i8) - if (IX_8 .ne. 0_i8) then - i8_arr_gsum_level(ioffset+ilevel-1) = i8_arr_gsum_level(ioffset+ilevel-1) & - + IX_8 - IX_8 = IX_8*(i8_radix**arr_max_shift) - i8_arr_gsum_level(ioffset+ilevel) = i8_arr_gsum_level(ioffset+ilevel) & - - IX_8 - endif - enddo -! b) subtract +/- 1 from larger and add +/- 1 to smaller when necessary -! so that all vector components have the same sign (eliminating loss -! of accuracy arising from difference of large values when -! reconstructing r8 sum from integer vector) - ilevel = 0 - do while ((i8_arr_gsum_level(ioffset+ilevel) .eq. 0_i8) & - .and. (ilevel < max_levels(ifld))) - ilevel = ilevel + 1 - enddo -! - if (ilevel < max_levels(ifld)) then - if (i8_arr_gsum_level(ioffset+ilevel) > 0_i8) then - i8_sign = 1_i8 - else - i8_sign = -1_i8 - endif - do jlevel=ilevel,max_levels(ifld)-1 - if (sign(1_i8,i8_arr_gsum_level(ioffset+jlevel)) & - .ne. 
sign(1_i8,i8_arr_gsum_level(ioffset+jlevel+1))) then - i8_arr_gsum_level(ioffset+jlevel) = i8_arr_gsum_level(ioffset+jlevel) & - - i8_sign - i8_arr_gsum_level(ioffset+jlevel+1) = i8_arr_gsum_level(ioffset+jlevel+1) & - + i8_sign*(i8_radix**arr_max_shift) - endif - enddo +! Preprocess integer vector (as described in 3(a) above): +! i) If value larger than or equal to (radix(1_i8)**arr_max_shift), +! add this 'overlap' to the value at the next smaller level +! in the vector, resulting in nonoverlapping ranges for each +! component. +! +! As before, no intermediate sums for levels +! max_levels(ifld) to -(extra_levels-2), in absolute value, +! will exceed the the maximum value representable in i8, but the +! upper bound on the final sum, in absolute value, at +! level -(extra_levels-1) is now +! (radix(1_i8)**(digits(1_i8) - 1))*(4/3) + +! + sum{i=1,inf}(radix(1_i8)**(-i*arr_max_shift)) +! = (radix(1_i8)**(digits(1_i8) - 1))* +! ((4/3) + sum{i=1,inf}(radix(1_i8)**(-i*arr_max_shift)). +! which is less than or equal to +! (radix(1_i8)**(digits(1_i8) - 1))*((4/3) + (1/3)) +! or +! (radix(1_i8)**(digits(1_i8) - 1))*(5/3) +! which will not cause an overflow at level -(extra_levels-1) +! as long as digits(1_i8) >= 3. +! +! Since the exponents associated with each successive level +! differ by arr_max_shift, monotonically decreasing with +! increasing level, the absolute value at each level after this +! preprocessing is strictly less than what can be represented at +! the next lower level (larger exponent). If nonzero, it is also +! strictly greater than what is represented at the next higher +! level (smaller exponent). Note that the smallest level, +! -(extra_levels-1), does not have to be less than +! (radix(1_i8)**arr_max_shift) for this 'nonoverlap' property to +! hold. 
+ do ilevel=max_levels(ifld),min_level+1,-1 + if (abs(i8_gsum_level(ilevel)) >= & + (i8_radix**arr_max_shift)) then + + IX_8 = i8_gsum_level(ilevel) & + / (i8_radix**arr_max_shift) + i8_gsum_level(ilevel-1) = & + i8_gsum_level(ilevel-1) + IX_8 + + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_gsum_level(ilevel) = & + i8_gsum_level(ilevel) - IX_8 + endif + enddo + +! ii) Working consecutively from the first level with a nonzero value +! up to level max_levels(ifld), subtract +/- 1 from level with +! larger exponent (e.g., ilevel) and add +/- +! (i8_radix**arr_max_shift) to level with smaller exponent +! (ilevel+1), when necessary, so that the value at ilevel+1 +! has the same sign as the value at ilevel. Treat a zero value at +! ilevel+1 as always a different sign from the value at ilevel so +! that the process always makes this nonzero. (Otherwise, the +! wrong sign could be reintroduced by subtracting from a zero +! value at the next step.) When finished with the process values +! at all levels are either greater than or equal to zero or all +! are less than or equal to zero. Note that this can decrease +! (but not increase) the absolute value at level +! -(extra_levels-1) by 1. All other levels are now less than or +! equal to (radix(1_i8)**arr_max_shift) in absolute value rather +! than strictly less than. + ilevel = min_level + do while ((i8_gsum_level(ilevel) == 0_i8) & + .and. (ilevel < max_levels(ifld))) + ilevel = ilevel + 1 + enddo +! + if (i8_gsum_level(ilevel) < 0_i8) then + i8_sign = -1_i8 + else + i8_sign = 1_i8 + endif +! + if (ilevel < max_levels(ifld)) then + do jlevel=ilevel,max_levels(ifld)-1 + if ((sign(1_i8,i8_gsum_level(jlevel)) & + /= sign(1_i8,i8_gsum_level(jlevel+1)))& + .or. (i8_gsum_level(jlevel+1) == 0_i8)) then + i8_gsum_level(jlevel) = & + i8_gsum_level(jlevel) - i8_sign + i8_gsum_level(jlevel+1) = & + i8_gsum_level(jlevel+1) & + + i8_sign*(i8_radix**arr_max_shift) + endif + enddo endif -! 
start with maximum shift, and work up to larger values - arr_shift = arr_gmax_exp(ifld) & - - max_levels(ifld)*arr_max_shift - curr_exp = 0 - first = .true. - do ilevel=max_levels(ifld),0,-1 - - if (i8_arr_gsum_level(ioffset+ilevel) .ne. 0_i8) then - jlevel = 1 - -! r8 representation of higher order bits in integer - X_8(jlevel) = i8_arr_gsum_level(ioffset+ilevel) - LX(jlevel) = exponent(X_8(jlevel)) - -! calculate remainder - IX_8 = int(X_8(jlevel),i8) - RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8) - -! repeat using remainder - do while (RX_8 .ne. 0.0_r8) - jlevel = jlevel + 1 - X_8(jlevel) = RX_8 - LX(jlevel) = exponent(RX_8) - IX_8 = IX_8 + int(RX_8,i8) - RX_8 = (i8_arr_gsum_level(ioffset+ilevel) - IX_8) - enddo +! iii) If 'same sign' is negative, then change to positive +! temporarily. + if (i8_sign < 0_i8) then + do jlevel=ilevel,max_levels(ifld) + i8_gsum_level(jlevel) = -i8_gsum_level(jlevel) + enddo + endif + +! iv) Nonoverlap property can be lost after imposition of same sign +! over components. Reintroduce this property (retaining same sign +! property). Note that carryover is never more than '1' to the +! next smaller level, so, again, no intermediate or final sums +! will exceed the maximum value representable in i8, including +! level -(extra_levels-1) as long as digits(1_i8) >= 4. + do ilevel=max_levels(ifld),min_level+1,-1 + if (abs(i8_gsum_level(ilevel)) >= & + (i8_radix**arr_max_shift)) then + + IX_8 = i8_gsum_level(ilevel)/ & + (i8_radix**arr_max_shift) + i8_gsum_level(ilevel-1) = & + i8_gsum_level(ilevel-1) + IX_8 + + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_gsum_level(ilevel) = & + i8_gsum_level(ilevel) - IX_8 + endif + enddo -! add in contributions, smaller to larger, rescaling for each -! addition to guarantee that exponent of working summand is always -! larger than minexponent - do while (jlevel > 0) - if (first) then - curr_exp = LX(jlevel) + arr_shift - arr_gsum(ifld) = fraction(X_8(jlevel)) - first = .false. +! 
Step 3(d): iterate over steps 3(b) and 3(c), truncating integer +! vector to 'fit' into a floating point value, then repeating with +! remainder + first_stepd_iteration = .true. + arr_shift = arr_gmax_exp(ifld) - (min_level)*arr_max_shift + i8_digit_count = 0 + i8_begin_level = min_level + do while (i8_begin_level <= max_levels(ifld)) + +! Determine at which level the total number of integer digits equals +! or exceeds the number of digits representable in the floating point +! sum. Then determine which digit at this level is the last +! representable in the floating point sum. Note that this location +! (i8_trunc_loc) is zero-based, i.e. smallest digit is at location +! 0. Note that the exponent is a count of the number of digits for the +! first nonzero level. All subsequent levels contribute arr_max_shift +! digits. + i8_trunc_loc = 0 + i8_trunc_level = max_levels(ifld) + do ilevel=i8_begin_level,max_levels(ifld) + if (first_stepd_iteration) then +! Special logic for first time through. Subsequent iterations treat +! leading zeroes as significant. + if (i8_digit_count == 0) then + if (i8_gsum_level(ilevel) /= 0_i8) then + X_8 = i8_gsum_level(ilevel) + LX = exponent(X_8) +! Note that even if i8_gsum_level(ilevel) is truncated when assigned +! to X_8, the exponent LX will still capture the original number of +! digits. + else + LX = 0 + endif else - corr_exp = curr_exp - (LX(jlevel) + arr_shift) - arr_gsum(ifld) = fraction(X_8(jlevel)) & - + scale(arr_gsum(ifld),corr_exp) - curr_exp = LX(jlevel) + arr_shift + LX = arr_max_shift endif - jlevel = jlevel - 1 - enddo + else +! If i8_digit_count /= 0 during the first iteration +! (ilevel == i8_begin_level), then there is a remainder left at the +! previous i8_trunc_level and LX should be set to zero for this +! iteration. + if ((ilevel == i8_begin_level) .and. 
(i8_digit_count /= 0)) then + LX = 0 + else + LX = arr_max_shift + endif + endif + if (i8_digit_count + LX >= digits(1.0_r8)) then + i8_trunc_level = ilevel + i8_trunc_loc = (i8_digit_count + LX) - digits(1.0_r8) + exit + else + i8_digit_count = i8_digit_count + LX + endif + enddo + first_stepd_iteration = .false. + +! Truncate at i8_trunc_loc as needed and determine what the remainder +! is. + if (i8_trunc_loc == 0) then +! No truncation is necessary, and remainder is just the components +! for the remaining levels + i8_trunc_level_rem = 0 + else +! Shift right to identify the digits to be preserved and truncate +! there + IX_8 = i8_gsum_level(i8_trunc_level)/ & + (i8_radix**i8_trunc_loc) +! Shift left to put digits in the correct location (right fill with +! zeroes) + IX_8 = IX_8*(i8_radix**i8_trunc_loc) +! Calculate local remainder + i8_trunc_level_rem = (i8_gsum_level(i8_trunc_level) - IX_8) +! Update level with the truncated value + i8_gsum_level(i8_trunc_level) = IX_8 + endif + +! Calculate floating point value corresponding to modified integer +! vector. Note that, by construction, i8 integer value will fit into +! r8 floating point value, so do not need to test for this. + svlevel = svlevel + 1 + summand_vector(svlevel) = 0.0_r8 + do ilevel=i8_begin_level,i8_trunc_level + if (i8_gsum_level(ilevel) /= 0_i8) then + +! Convert integer to floating point representation + X_8 = i8_gsum_level(ilevel) + LX = exponent(X_8) + +! Add to vector of floating point summands, scaling first if exponent +! is too small to apply directly + curr_exp = LX + arr_shift + if (curr_exp >= MINEXPONENT(1.0_r8)) then + summand_vector(svlevel) = & + summand_vector(svlevel) + set_exponent(X_8,curr_exp) + else + RX_8 = set_exponent(X_8, & + curr_exp-MINEXPONENT(1.0_r8)) + summand_vector(svlevel) = & + summand_vector(svlevel) + scale(RX_8,MINEXPONENT(1.0_r8)) + endif + + endif +! Note that same arr_shift should be used for next 'step 3(d)' +! iteration if i8_trunc_loc > 0. 
+ if ((ilevel < i8_trunc_level) .or. (i8_trunc_loc == 0)) then + arr_shift = arr_shift - arr_max_shift + endif + + enddo + + if (i8_trunc_loc == 0) then + i8_digit_count = 0 + i8_begin_level = i8_trunc_level + 1 + else + i8_digit_count = i8_trunc_loc + i8_begin_level = i8_trunc_level + i8_gsum_level(i8_trunc_level) = i8_trunc_level_rem endif - arr_shift = arr_shift + arr_max_shift enddo -! apply final exponent correction, scaling first if exponent is too small -! to apply directly - corr_exp = curr_exp + exponent(arr_gsum(ifld)) - if (corr_exp .ge. MINEXPONENT(1._r8)) then - arr_gsum(ifld) = set_exponent(arr_gsum(ifld),corr_exp) - else - RX_8 = set_exponent(arr_gsum(ifld), & - corr_exp-MINEXPONENT(1._r8)) - arr_gsum(ifld) = scale(RX_8,MINEXPONENT(1._r8)) - endif +! Step 3(e): sum vector of floating point values, smallest to largest + arr_gsum(ifld) = 0.0_r8 + do jlevel=svlevel,1,-1 + arr_gsum(ifld) = arr_gsum(ifld) + summand_vector(jlevel) + enddo -! if validate is .true. and some precision lost, test whether 'too much' -! was lost, due to too loose an upper bound, too stringent a limit on number -! of levels of expansion, cancellation, .... Calculated by comparing lower -! bound on number of sigificant digits with number of digits in 1.0_r8 . +! Step 3(f): restore the sign + arr_gsum(ifld) = i8_sign*arr_gsum(ifld) + +! If validate is .true. and some precision lost, test whether 'too +! much' was lost, due to too loose an upper bound, too stringent a +! limit on number of levels of expansion, cancellation, ... +! Calculated by comparing lower bound on number of significant digits +! with number of digits in 1.0_r8 . if (validate) then - if (i8_arr_gsum_level(ioffset-voffset+2) .ne. 0_i8) then - -! find first nonzero level and use exponent for this level, then assume all -! subsequent levels contribute arr_max_shift digits. - sum_digits = 0 - do ilevel=0,max_levels(ifld) - if (sum_digits .eq. 0) then - if (i8_arr_gsum_level(ioffset+ilevel) .ne. 
0_i8) then - X_8(1) = i8_arr_gsum_level(ioffset+ilevel) - LX(1) = exponent(X_8(1)) - sum_digits = LX(1) + if (i8_arr_gsum_level(ioffset-voffset+2) /= 0_i8) then + +! Find first nonzero level and use exponent for this level, then +! assume all subsequent levels contribute arr_max_shift digits. + i8_digit_count = 0 + do ilevel=min_level,max_levels(ifld) + if (i8_digit_count == 0) then + if (i8_arr_gsum_level(ioffset+ilevel) /= 0_i8) then + X_8 = i8_arr_gsum_level(ioffset+ilevel) + LX = exponent(X_8) + i8_digit_count = LX endif else - sum_digits = sum_digits + arr_max_shift + i8_digit_count = i8_digit_count + arr_max_shift endif enddo - if (sum_digits < digits(1.0_r8)) then + if (i8_digit_count < digits(1.0_r8)) then recompute = .true. endif endif @@ -1418,6 +1970,9 @@ subroutine cism_reprosum_int (arr, arr_gsum, nsummands, dsummands, nflds, & endif enddo +#ifndef EAMXX_STANDALONE + call t_stopf('repro_sum_finalsum') +#endif end subroutine cism_reprosum_int @@ -1425,28 +1980,28 @@ end subroutine cism_reprosum_int !======================================================================== ! - logical function cism_reprosum_tolExceeded (name, nflds, master, & + logical function cism_reprosum_tolExceeded(name, nflds, master, & logunit, rel_diff ) -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Purpose: ! Test whether distributed sum exceeds tolerance and print out a ! warning message. ! -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Arguments ! character(len=*), intent(in) :: name ! distributed sum identifier integer, intent(in) :: nflds ! number of fields - logical, intent(in) :: master ! process that will write + logical, intent(in) :: master ! MPI task that will write ! warning messages? integer, optional, intent(in) :: logunit! unit warning messages ! 
written to real(r8), intent(in) :: rel_diff(2,nflds) ! relative and absolute - ! differences between fixed - ! and floating point sums + ! differences between integer + ! vector and floating point sums ! ! Local workspace @@ -1460,7 +2015,7 @@ logical function cism_reprosum_tolExceeded (name, nflds, master, & real(r8) :: max_abs_diff ! maximum absolute difference integer :: max_abs_diff_idx ! field index for max. abs. diff. ! -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! cism_reprosum_tolExceeded = .false. if (cism_reprosum_reldiffmax < 0.0_r8) return @@ -1472,7 +2027,7 @@ logical function cism_reprosum_tolExceeded (name, nflds, master, & llogunit = iulog endif - ! check that "fast" reproducible sum is accurate enough. +! Check that 'fast' reproducible sum is accurate enough. exceeds_limit = 0 max_rel_diff = 0.0_r8 max_abs_diff = 0.0_r8 @@ -1494,7 +2049,7 @@ logical function cism_reprosum_tolExceeded (name, nflds, master, & if (exceeds_limit > 0) then if (master) then write(llogunit,*) trim(name), & - ': difference in fixed and floating point sums ', & + ': difference between integer vector and floating point sums ', & ' exceeds tolerance in ', exceeds_limit, & ' fields.' write(llogunit,*) ' Maximum relative diff: (rel)', & @@ -1507,23 +2062,22 @@ logical function cism_reprosum_tolExceeded (name, nflds, master, & cism_reprosum_tolExceeded = .true. endif - - end function cism_reprosum_tolExceeded + end function cism_reprosum_tolExceeded ! !======================================================================== ! - subroutine cism_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & + subroutine cism_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & nflds, mpi_comm ) -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Purpose: -! 
Compute the global sum of each field in "arr" using the indicated +! Compute the global sum of each field in 'arr' using the indicated ! communicator with a reproducible yet scalable implementation based ! on He and Ding's implementation of the double-double algorithm. ! -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Arguments ! @@ -1555,12 +2109,10 @@ subroutine cism_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & logical, save :: first_time = .true. ! -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! - !WHL - This is a C routine in the CESM shared code: shr_reprosumx86.c - ! Commented out the call. - ! Not sure whether this breaks the algorithm. -!! call cism_reprosumx86_fix_start (old_cw) + !WHL - commented out +! call cism_reprosumx86_fix_start (old_cw) if (first_time) then call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr) @@ -1572,8 +2124,7 @@ subroutine cism_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & do isum=1,nsummands - ! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s - ! trick. +! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s trick. t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld)) e = t1 - arr(isum,ifld) t2 = ((real(arr_lsum_dd(ifld)) - e) & @@ -1585,33 +2136,35 @@ subroutine cism_reprosum_ddpdd (arr, arr_gsum, nsummands, dsummands, & enddo enddo -#ifdef TIMING + +#ifndef EAMXX_STANDALONE call t_startf("repro_sum_allr_c16") #endif call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, & MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr) -#ifdef TIMING +#ifndef EAMXX_STANDALONE call t_stopf("repro_sum_allr_c16") #endif + do ifld=1,nflds arr_gsum(ifld) = real(arr_gsum_dd(ifld)) enddo - !WHL - commented out; see comment above -!! call cism_reprosumx86_fix_end (old_cw) + !WHL - commented out +! 
call cism_reprosumx86_fix_end (old_cw) end subroutine cism_reprosum_ddpdd ! -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! subroutine DDPDD (dda, ddb, len, itype) -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Purpose: ! Modification of original codes written by David H. Bailey ! This subroutine computes ddb(i) = dda(i)+ddb(i) ! -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Arguments ! @@ -1625,31 +2178,31 @@ subroutine DDPDD (dda, ddb, len, itype) real(r8) e, t1, t2 integer i ! -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! do i = 1, len -! Compute dda + ddb using Knuth's trick. + +! Compute dda + ddb using Knuth's trick. t1 = real(dda(i)) + real(ddb(i)) e = t1 - real(dda(i)) t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) & + aimag(dda(i)) + aimag(ddb(i)) -! The result is t1 + t2, after normalization. +! The result is t1 + t2, after normalization. ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) enddo - end subroutine DDPDD ! -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! subroutine split_indices(total,num_pieces,ibeg,iend) -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Purpose: ! Split range into 'num_pieces' ! -!---------------------------------------------------------------------- +!------------------------------------------------------------------------ ! ! Arguments ! 
@@ -1661,7 +2214,7 @@ subroutine split_indices(total,num_pieces,ibeg,iend) ! integer :: itmp1, itmp2, ioffset, i ! -!----------------------------------------------------------------------- +!------------------------------------------------------------------------ ! itmp1 = total/num_pieces itmp2 = mod(total,num_pieces) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 73b45f52..3b3ec31b 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -8376,11 +8376,11 @@ subroutine basal_sliding_bc_2d(nx, ny, & do j = 1, ny-1 do i = 1, nx-1 if (active_vertex(i,j)) then - Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) - Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) -!! increment = (dx*dy/vol0) * beta(i,j) -!! Auu(i,j,m) = Auu(i,j,m) + increment -!! Avv(i,j,m) = Avv(i,j,m) + increment +! Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) +! Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) + increment = (dx*dy/vol0) * beta(i,j) + Auu(i,j,m) = Auu(i,j,m) + increment + Avv(i,j,m) = Avv(i,j,m) + increment endif ! active_vertex enddo ! i enddo ! j From 3be7b25169439f164222d9d00ccc3d2141726474 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Thu, 8 Jan 2026 14:23:26 -0700 Subject: [PATCH 07/21] Reprosum fixes for glaciers This commit modifies the calculation of the glacier boundary mask to support reproducible sums. Previously, this mask (which enforces high values of Cp at vertices that border two different glaciers) could be computed incorrectly near block boundaries. Other changes: * I added an optional argument, zero_global_boundary_no_ice_bc, to subroutine parallel_halo_real8_2d. By default, this subroutine zeroes out the values of scalars in cells adjacent to the global boundary. (The reasons are a bit complicated; they are related to the issue of communicating scalar values to the halo cells of diagonal block neighbors when some blocks are inactive.) Passing in '.false.' 
for this argument preserves the original value in these cells. For topg and relx, it is better to keep the correct physical values than to set them to zero. * The call to glissade_glacier_init now takes place after thck, topg, lsrf, and usrf have been computed in halo cells. This prevents erroneous nonzero thickness targets near block boundaries. * I added a subroutine called parallel_halo_extrapolate_1d. This subroutine is used to set the correct values of (x0,y0) and (x1,y1) in halo cells. Note: CISM requires (x1,y1) in the input file. If (x0,y0) are not present, they are computed from (x1,y1). * If glacier%length_scale_factor /= 1, then (x0,y0) and (x1,y1) are now scaled at initialization, along with dew and dns. * Removed circular dependencies by deleting 'use cism_parallel' and 'use glimmer_log' from the profile module * Extended the write_array_to_file subroutines to allow either floating-point or binary-string output * Eliminated some glacier diagnostic output for the case that the diagnostic cell belongs to no glacier --- libglide/glide_diagnostics.F90 | 73 +++++++------ libglide/glide_nc_custom.F90 | 7 +- libglimmer/cism_reprosum_mod.F90 | 17 +-- libglimmer/parallel_mpi.F90 | 136 +++++++++++++++++------ libglimmer/profile.F90 | 10 +- libglissade/glissade.F90 | 155 ++++++++++++++------------- libglissade/glissade_glacier.F90 | 61 +++++------ libglissade/glissade_inversion.F90 | 2 +- libglissade/glissade_utils.F90 | 66 +++++++++--- libglissade/glissade_velo_higher.F90 | 35 +++--- 10 files changed, 330 insertions(+), 232 deletions(-) diff --git a/libglide/glide_diagnostics.F90 b/libglide/glide_diagnostics.F90 index 8aec8052..a4571ed1 100644 --- a/libglide/glide_diagnostics.F90 +++ b/libglide/glide_diagnostics.F90 @@ -1266,48 +1266,57 @@ subroutine glide_write_diag (model, time) ng = model%glacier%ngdiag - write(message,'(a35,i14)') 'Diagnostic glacier index (RGI) ', & - model%glacier%cism_to_rgi_glacier_id(ng) - call write_log(trim(message), type = 
GM_DIAGNOSTIC) + if (ng > 0) then - write(message,'(a35,i14)') 'Diagnostic glacier index (CISM) ', ng - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,i14)') 'Diagnostic glacier index (RGI) ', & + model%glacier%cism_to_rgi_glacier_id(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier area_init (km^2) ', & - model%glacier%area_init(ng) / 1.0d6 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,i14)') 'Diagnostic glacier index (CISM) ', ng + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier area (km^2) ', & - model%glacier%area(ng) / 1.0d6 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier area_init (km^2) ', & + model%glacier%area_init(ng) / 1.0d6 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier area_init_extent (km^2) ', & - model%glacier%area_init_extent(ng) / 1.0d6 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier area (km^2) ', & + model%glacier%area(ng) / 1.0d6 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier volume (km^3) ', & - model%glacier%volume(ng) / 1.0d9 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier area_init_extent (km^2) ', & + model%glacier%area_init_extent(ng) / 1.0d6 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier volume_init (km^3) ', & - model%glacier%volume_init(ng) / 1.0d9 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier volume (km^3) ', & + model%glacier%volume(ng) / 1.0d9 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier volume_init_extent (km^3) ', & - model%glacier%volume_init_extent(ng) / 1.0d9 - call write_log(trim(message), type = GM_DIAGNOSTIC) 
+ write(message,'(a35,f14.6)') 'Glacier volume_init (km^3) ', & + model%glacier%volume_init(ng) / 1.0d9 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'mu_star (mm/yr w.e./deg C) ', & - model%glacier%mu_star(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier volume_init_extent (km^3) ', & + model%glacier%volume_init_extent(ng) / 1.0d9 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'alpha_snow ', & - model%glacier%alpha_snow(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'mu_star (mm/yr w.e./deg C) ', & + model%glacier%mu_star(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'beta_artm (deg C) ', & - model%glacier%beta_artm(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'alpha_snow ', & + model%glacier%alpha_snow(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) + + write(message,'(a35,f14.6)') 'beta_artm (deg C) ', & + model%glacier%beta_artm(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) + + else ! glacier ID = 0 + + write(message,'(a35,i14)') 'Diagnostic glacier index (CISM) ', ng + call write_log(trim(message), type = GM_DIAGNOSTIC) + + endif call write_log(' ') diff --git a/libglide/glide_nc_custom.F90 b/libglide/glide_nc_custom.F90 index a672203a..bd464a4f 100644 --- a/libglide/glide_nc_custom.F90 +++ b/libglide/glide_nc_custom.F90 @@ -116,7 +116,8 @@ subroutine glide_nc_filldvars(outfile, model) ! This does not work, in general, when computing on active blocks only, because the local versions ! of model%general%x1 may not span the global domain. ! The revised code calls parallel_put_var to write (x0,y0) and (x1,y1) to the output file. - ! This assumes that x1_global and y1_global were read from the input file and saved in a global array. + ! 
This assumes that x1_global and y1_global were read from the input file and saved in a global array + ! (e.g., in subroutine glide_io_read). status = parallel_inq_varid(NCO%id,'x1',varid) status = parallel_put_var(NCO%id,varid,model%general%x1_global) @@ -126,8 +127,8 @@ subroutine glide_nc_filldvars(outfile, model) status = parallel_put_var(NCO%id,varid,model%general%y1_global) call nc_errorhandle(__FILE__,__LINE__,status) - ! create the x0 and y0 grids from x1 and y1 - + ! create the x0 and y0 grids from x1 and y1; + ! this does not require model%general%x0_global and y0_global to have been filled status = parallel_inq_varid(NCO%id,'x0',varid) do i = 1, global_ewn-1 x0_global(i) = (model%general%x1_global(i) + model%general%x1_global(i+1)) / 2.0d0 diff --git a/libglimmer/cism_reprosum_mod.F90 b/libglimmer/cism_reprosum_mod.F90 index 0740f459..ec202273 100644 --- a/libglimmer/cism_reprosum_mod.F90 +++ b/libglimmer/cism_reprosum_mod.F90 @@ -62,7 +62,6 @@ module cism_reprosum_mod use glimmer_global, only: r8 => dp use glimmer_global, only: i8 use glimmer_paramets, only: iulog - use glimmer_utils, only: double_to_binary use cism_infnan_mod,only: cism_infnan_inf_type, assignment(=), & cism_infnan_posinf, cism_infnan_neginf, & cism_infnan_nan, & @@ -85,6 +84,7 @@ module cism_reprosum_mod !------------------------------------------------------------------------ ! Public interfaces ----------------------------------------------------- !------------------------------------------------------------------------ + !WHL - cism_reprosum_setopts is not currently called public :: & cism_reprosum_setopts, &! set runtime options cism_reprosum_calc, &! calculate distributed sum @@ -98,10 +98,7 @@ module cism_reprosum_mod real(r8), public :: cism_reprosum_reldiffmax = -1.0_r8 - !WHL mod -!! logical, parameter, public :: verbose_reprosum = .false. - logical, parameter, public :: verbose_reprosum = .true. - ! end WHL mod + logical, parameter, public :: verbose_reprosum = .false. 
!------------------------------------------------------------------------ ! Private interfaces ---------------------------------------------------- @@ -1192,20 +1189,12 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & repro_sum_stats(6) = repro_sum_stats(6) + gbl_lor_red endif - !WHL mod: optional diagnostics if (verbose_reprosum) then call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) if (mypid == 0) then - write(iulog,*) 'Exit reprosum, nflds =', nflds - write(iulog,*) ' n, arr_gsum, binary_str:' -!! do n = 1, nflds - do n = 1, min(2,nflds) - call double_to_binary(arr_gsum(n), binary_str) - write(iulog,*) n, arr_gsum(n), binary_str - enddo + write(iulog,*) 'Exit reprosum, nflds, arr_gsum =', nflds, arr_gsum endif endif - ! end WHL mod end subroutine cism_reprosum_calc diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index bab339cd..57c2b20e 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -30,15 +30,7 @@ module cism_parallel use glimmer_global, only : dp, sp use glimmer_paramets, only: iulog -!TODO - Not sure setopts is needed -!TODO - Remove coupled ifdefs and always use the CISM version? 
-!TODO - Use cism_reprosum_mod from individual functions -#ifdef CCSMCOUPLED - use shr_reprosum_mod, only: shr_reprosum_setopts, shr_reprosum_calc -#else - use cism_reprosum_mod, only: cism_reprosum_setopts, cism_reprosum_calc -#endif - use cism_reprosum_mod, only: verbose_reprosum + use cism_reprosum_mod, only: cism_reprosum_calc, verbose_reprosum implicit none @@ -332,6 +324,7 @@ module cism_parallel end interface interface parallel_halo_extrapolate + module procedure parallel_halo_extrapolate_real8_1d module procedure parallel_halo_extrapolate_integer_2d module procedure parallel_halo_extrapolate_real8_2d end interface @@ -2206,6 +2199,7 @@ function distributed_get_var_real8_1d(ncid, varid, values, parallel, start) global_values(:) = 0.0d0 distributed_get_var_real8_1d = & nf90_get_var(ncid,varid,global_values(1:myn),start) + allocate(displs(tasks+1)) allocate(sendcounts(tasks)) sendcounts(:) = bounds(2,:)-bounds(1,:)+1 @@ -7598,26 +7592,35 @@ subroutine parallel_halo_real4_2d(a, parallel) end subroutine parallel_halo_real4_2d - subroutine parallel_halo_real8_2d(a, parallel, periodic_offset_ew, periodic_offset_ns) + subroutine parallel_halo_real8_2d(a, parallel, & + periodic_offset_ew, periodic_offset_ns, zero_global_boundary_no_ice_bc) - !WHL - added optional arguments for periodic offsets, to support ismip-hom test cases + ! Added optional arguments for periodic offsets, to support ismip-hom test cases + ! Also added an optional argument related to the no_ice BCs use mpi_mod implicit none real(dp),dimension(:,:) :: a type(parallel_type) :: parallel + real(dp), intent(in), optional :: & - periodic_offset_ew, &! offset halo values by this amount - ! if positive, the offset is positive for W halo, negative for E halo - periodic_offset_ns ! offset halo values by this amount - ! if positive, the offset is positive for S halo, negative for N halo + periodic_offset_ew, &! offset halo values by this amount + ! 
if positive, the offset is positive for W halo, negative for E halo + periodic_offset_ns ! offset halo values by this amount + ! if positive, the offset is positive for S halo, negative for N halo + logical, intent(in), optional :: & + zero_global_boundary_no_ice_bc ! if true, then zero out values in grid cells adjacent + ! to the global boundary when using no_ice BCs + integer :: erequest,ierror,nrequest,srequest,wrequest real(dp),dimension(lhalo, parallel%local_nsn-lhalo-uhalo) :: esend,wrecv real(dp),dimension(uhalo, parallel%local_nsn-lhalo-uhalo) :: erecv,wsend real(dp),dimension(parallel%local_ewn, lhalo) :: nsend,srecv real(dp),dimension(parallel%local_ewn, uhalo) :: nrecv,ssend + logical :: zero_global_boundary_no_ice ! local version of zero_global_boundary_no_ice_bc + ! begin associate( & outflow_bc => parallel%outflow_bc, & @@ -7634,6 +7637,12 @@ subroutine parallel_halo_real8_2d(a, parallel, periodic_offset_ew, periodic_offs northwest_corner => parallel%northwest_corner & ) + if (present(zero_global_boundary_no_ice_bc)) then + zero_global_boundary_no_ice = zero_global_boundary_no_ice_bc + else + zero_global_boundary_no_ice = .true. + endif + ! staggered grid if (size(a,1)==local_ewn-1.and.size(a,2)==local_nsn-1) return @@ -7723,31 +7732,53 @@ subroutine parallel_halo_real8_2d(a, parallel, periodic_offset_ew, periodic_offs elseif (no_ice_bc) then - ! Set values to zero in cells adjacent to the global boundary; - ! includes halo cells and one row of locally owned cells + if (zero_global_boundary_no_ice) then - if (this_rank >= east) then ! at east edge of global domain - a(local_ewn-uhalo:,:) = 0.d0 - endif + ! Set values to zero in cells adjacent to the global boundary; + ! includes halo cells and one row of locally owned cells. - if (this_rank <= west) then ! at west edge of global domain - a(:lhalo+1,:) = 0.d0 - endif + if (this_rank >= east) then ! at east edge of global domain + a(local_ewn-uhalo:,:) = 0.d0 + endif - if (this_rank >= north) then ! 
at north edge of global domain - a(:,local_nsn-uhalo:) = 0.d0 - endif + if (this_rank <= west) then ! at west edge of global domain + a(:lhalo+1,:) = 0.d0 + endif - if (this_rank <= south) then ! at south edge of global domain - a(:,:lhalo+1) = 0.d0 - endif + if (this_rank >= north) then ! at north edge of global domain + a(:,local_nsn-uhalo:) = 0.d0 + endif - ! Some interior blocks have a single cell at a corner of the global boundary. - ! Set values in corner cells to zero, along with adjacent halo cells. - if (southwest_corner) a(:lhalo+1,:lhalo+1) = 0.d0 - if (southeast_corner) a(local_ewn-lhalo:,:lhalo+1) = 0.d0 - if (northeast_corner) a(local_ewn-lhalo:,local_nsn-lhalo:) = 0.d0 - if (northwest_corner) a(:lhalo+1,local_nsn-lhalo:) = 0.d0 + if (this_rank <= south) then ! at south edge of global domain + a(:,:lhalo+1) = 0.d0 + endif + + ! Some interior blocks have a single cell at a corner of the global boundary. + ! Set values in corner cells to zero, along with adjacent halo cells. + if (southwest_corner) a(:lhalo+1,:lhalo+1) = 0.d0 + if (southeast_corner) a(local_ewn-lhalo:,:lhalo+1) = 0.d0 + if (northeast_corner) a(local_ewn-lhalo:,local_nsn-lhalo:) = 0.d0 + if (northwest_corner) a(:lhalo+1,local_nsn-lhalo:) = 0.d0 + + else ! set values to zero in halo cells but not in locally owned cells + + if (this_rank >= east) then ! at east edge of global domain + a(local_ewn-uhalo+1:,:) = 0.d0 + endif + + if (this_rank <= west) then ! at west edge of global domain + a(:lhalo,:) = 0.d0 + endif + + if (this_rank >= north) then ! at north edge of global domain + a(:,local_nsn-uhalo+1:) = 0.d0 + endif + + if (this_rank <= south) then ! at south edge of global domain + a(:,:lhalo) = 0.d0 + endif + + endif ! zero_global_boundary_no_ice endif ! outflow or no_ice bc @@ -8003,6 +8034,41 @@ subroutine parallel_halo_real8_4d(a, parallel) end subroutine parallel_halo_real8_4d +!======================================================================= + + ! 
subroutines for 1D halo updates + + subroutine parallel_halo_extrapolate_real8_1d(a, parallel, interval_in) + + !Note: Extrapolate a 1D real8 variable into halo cells to the east and west. + ! Currently used only to compute halo values for grid cell coordinates. + + use mpi_mod + implicit none + real(dp),dimension(:) :: a + type(parallel_type) :: parallel + real(dp),intent(in), optional :: & + interval_in ! uniform difference between adjacent values, e.g. grid cell size dew or dns + + integer :: i + real(dp) :: interval ! local version of interval_in + + if (present(interval_in)) then + interval = interval_in + else + interval = 0.0d0 + endif + + do i = 1, lhalo + a(i) = a(lhalo+1) - interval*(lhalo+1-i) + enddo + + do i = size(a)-uhalo+1, size(a) + a(i) = a(size(a)-uhalo) + interval*(uhalo+i-size(a)) + enddo + + end subroutine parallel_halo_extrapolate_real8_1d + !======================================================================= ! subroutines belonging to the parallel_halo_extrapolate interface diff --git a/libglimmer/profile.F90 b/libglimmer/profile.F90 index f88ee51c..92840b14 100644 --- a/libglimmer/profile.F90 +++ b/libglimmer/profile.F90 @@ -35,8 +35,6 @@ module profile #if (defined CCSMCOUPLED || defined CESMTIMERS) use perf_mod - !TODO - Add an 'only' for 'use cism_parallel'? 
- use cism_parallel #endif use glimmer_global, only: dp @@ -89,16 +87,18 @@ end subroutine profile_init !> register a new series of meassurements function profile_register(prof,msg) - use glimmer_log + use glimmer_paramets, only: iulog + use mpi implicit none type(profile_type) :: prof !< structure storing profile definitions character(len=*), intent(in) :: msg !< the message to be associated integer profile_register + integer :: ierr prof%nump = prof%nump+1 if (prof%nump > max_prof) then - call write_log('Maximum number of profiles reached',type=GM_FATAL, & - file=__FILE__,line=__LINE__) + write(iulog,*) ('Maximum number of profiles reached') + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) end if profile_register = prof%nump prof%pname(prof%nump) = trim(msg) diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index 6c020822..cfcfec39 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -94,10 +94,10 @@ subroutine glissade_initialise(model, evolve_ice) use cism_parallel, only: parallel_type, parallel_finalise, & distributed_grid, distributed_grid_active_blocks, parallel_global_edge_mask, & - parallel_halo, parallel_halo_extrapolate, parallel_reduce_max, & + parallel_halo, parallel_halo_extrapolate, & staggered_parallel_halo_extrapolate, staggered_no_penetration_mask, & parallel_create_comm_row, parallel_create_comm_col, & - parallel_is_zero, not_parallel + parallel_reduce_max, parallel_is_zero, not_parallel use glide_setup use glimmer_ncio, only: openall_in, openall_out, glimmer_nc_get_var, glimmer_nc_get_dimlength @@ -366,34 +366,47 @@ subroutine glissade_initialise(model, evolve_ice) itest = model%numerics%idiag_local jtest = model%numerics%jdiag_local - ! Check whether x0 and y0 were read in. If not, then compute them from x1 and y1. + ! Make sure the grid coordinates (x1,y1) and (x0,y0) have been read in. + ! If (x1,y1) have not been read in, then abort the run. + ! If (x0,y0) have not been read in, then compute them from (x1,y1). 
+ ! Extrapolate these coordinates to halo cells as needed. + ! Note: The extrapolation works only on a regular grid. + !TODO - Put the following code in a subroutine. + if (parallel_is_zero(model%general%x1)) then - if (main_task) write(iulog,*) 'Warning: model%general%x1 = 0' - else -! if (main_task) write(iulog,*) 'x1_global:', model%general%x1_global(:) -! if (main_task) write(iulog,*) 'y1_global:', model%general%y1_global(:) -! if (main_task) write(iulog,*) 'x1:', model%general%x1(:) -! if (main_task) write(iulog,*) 'y1:', model%general%y1(:) + call write_log('model%general%x1 = 0.0 everywhere', GM_FATAL) + else ! extrapolate x1 to halo cells + call parallel_halo_extrapolate(model%general%x1, parallel, model%numerics%dew) + endif + + if (parallel_is_zero(model%general%y1)) then + call write_log('model%general%y1 = 0.0 everywhere', GM_FATAL) + else ! extrapolate y1 to halo cells + call parallel_halo_extrapolate(model%general%y1, parallel, model%numerics%dns) endif + ! Check whether x0 and y0 were read in. If not, then compute them from x1 and y1. if (parallel_is_zero(model%general%x0)) then - if (main_task) write(iulog,*) 'Initialize x0' + if (main_task) write(iulog,*) 'x0 not read in; initialize from x1' do i = 1, model%general%ewn-1 model%general%x0(i) = 0.5d0 * (model%general%x1(i) + model%general%x1(i+1)) enddo + else + ! extrapolate x0 to halo cells + call parallel_halo_extrapolate(model%general%x0, parallel, model%numerics%dew) endif - if (main_task) write(iulog,*) 'x0:', model%general%x0(:) if (parallel_is_zero(model%general%y0)) then - if (main_task) write(iulog,*) 'Initialize y0' + if (main_task) write(iulog,*) 'y0 not read in; initialize from y1' do j = 1, model%general%nsn-1 model%general%y0(j) = 0.5d0 * (model%general%y1(j) + model%general%y1(j+1)) enddo + else + ! extrapolate y0 to halo cells + call parallel_halo_extrapolate(model%general%y0, parallel, model%numerics%dns) endif - if (main_task) write(iulog,*) 'y0:', model%general%y0(:) - ! 
Check that lat and lon fields were read in, if desired - !TODO - Use the parallel_is_nonzero function instead, here and below + ! Check that lat and lon fields were read in if (model%options%read_lat_lon) then if (parallel_is_zero(model%general%lat)) then call write_log('Failed to read latitude (lat) field from input file', GM_FATAL) @@ -612,50 +625,9 @@ subroutine glissade_initialise(model, evolve_ice) endif ! geothermal heat flux - ! If running with glaciers, then process the input glacier data - ! On start-up, this subroutine counts the glaciers. It should be called before glide_io_createall, - ! which needs to know nglacier to set up glacier output files with the right dimensions. - ! On restart, most of the required glacier arrays are in the restart file, and this subroutine - ! computes a few remaining variable. - - if (model%options%enable_glaciers) then - - ! Glaciers are run with a no-ice BC to allow removal of inactive regions. - ! This can be problematic when running in a sub-region that has glaciers along the global boundary. - ! A halo update here for 'thck' will remove ice from cells along the global boundary. - ! It is best to do this before initializing glaciers, so that ice that initially exists - ! in these cells is removed before computing the area and thickness targets. - !TODO - These calls are repeated a few lines below. Try moving them up, before the call - ! to glissade_glacier_init. I don't think it's possible to move the glissade_glacier_init call - ! down, because we need to compute nglacier before setting up output files. - - call parallel_halo(model%geometry%thck, parallel) - ! calculate the lower and upper ice surface - call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus, model%geometry%lsrf) - model%geometry%usrf = max(0.d0, model%geometry%thck + model%geometry%lsrf) - - ! Initialize glaciers - ! Note: This subroutine can return modified values of model%numerics%dew, model%numerics%dns, - ! 
and model%geometry%cell_area. - ! This is a fix to deal with the fact that actual grid cell dimensions can be different - ! from the nominal dimensions on a projected grid. - ! See comments near the top of glissade_glacier_init. - - call glissade_glacier_init(model, model%glacier) - - endif - - ! open all output files - call openall_out(model) - - ! create glide I/O variables - call glide_io_createall(model, model) - - ! initialize glissade components - ! Set some variables in halo cells ! Note: We need thck and artm in halo cells so that temperature will be initialized correctly - ! (if not read from input file). + ! (if not read from the input file). ! We do an update here for temp in case temp is read from an input file. ! If temp is computed below in glissade_init_therm (based on the value of options%temp_init), ! then the halos will receive the correct values. @@ -664,37 +636,36 @@ subroutine glissade_initialise(model, evolve_ice) call parallel_halo(model%climate%artm, parallel) call parallel_halo(model%temper%temp, parallel) call parallel_halo(model%temper%tempunstag, parallel) - - ! calculate the lower and upper ice surface - call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus, model%geometry%lsrf) - model%geometry%usrf = max(0.d0, model%geometry%thck + model%geometry%lsrf) + if (model%options%whichtemp == TEMP_ENTHALPY) & + call parallel_halo(model%temper%waterfrac, parallel) ! Note: For outflow BCs, most fields (thck, usrf, temp, etc.) are set to zero in the global halo, ! to create ice-free conditions. However, we might not want to set topg = 0 in the global halo, ! because then the global halo will be interpreted as ice-free land, whereas we may prefer to ! treat it as ice-free ocean. For this reason, topg is extrapolated from adjacent cells. - ! Similarly, for no_ice BCs, we want to zero out ice state variables adjacent to the global boundary, + ! 
For no_ice BCs, we want to zero out ice state variables adjacent to the global boundary, ! but we do not want to zero out the topography. - ! Note: For periodic BCs, there is an optional argument periodic_offset_ew for topg. - ! This is for ismip-hom experiments. A positive EW offset means that + ! For periodic BCs, there are optional periodic_offset arguments for topg. + ! These are for ismip-hom experiments or similar geometries. A positive EW offset means that ! the topography in west halo cells will be raised, and the topography ! in east halo cells will be lowered. This ensures that the topography ! and upper surface elevation are continuous between halo cells ! and locally owned cells at the edge of the global domain. - ! In other cases (anything but ismip-hom), periodic_offset_ew = periodic_offset_ns = 0, - ! and this argument will have no effect. + ! After this call, topg does not need another halo update unless isostasy is active. - if (model%general%global_bc == GLOBAL_BC_OUTFLOW .or. & - model%general%global_bc == GLOBAL_BC_NO_ICE) then + if (model%general%global_bc == GLOBAL_BC_OUTFLOW) then call parallel_halo_extrapolate(model%geometry%topg, parallel) + elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then + call parallel_halo(model%geometry%topg, parallel, zero_global_boundary_no_ice_bc = .false.) else ! other global BCs, including periodic call parallel_halo(model%geometry%topg, parallel, & periodic_offset_ew = model%numerics%periodic_offset_ew, & periodic_offset_ns = model%numerics%periodic_offset_ns) endif - if (model%options%whichtemp == TEMP_ENTHALPY) & - call parallel_halo(model%temper%waterfrac, parallel) + ! calculate the lower and upper ice surface (will be correct in halos following the halo updates above) + call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus, model%geometry%lsrf) + model%geometry%usrf = max(0.d0, model%geometry%thck + model%geometry%lsrf) ! 
halo update for kinbcmask (= 1 where uvel and vvel are prescribed, elsewhere = 0) ! Note: Instead of assuming that kinbcmask is periodic, we extrapolate it into the global halo @@ -702,6 +673,34 @@ subroutine glissade_initialise(model, evolve_ice) ! on the global staggered grid). call staggered_parallel_halo_extrapolate (model%velocity%kinbcmask, parallel) ! = 1 for Dirichlet BCs + if (model%options%enable_glaciers) then + + ! If running with glaciers, then process the input glacier data and initialize glacier arrays + + ! Note: On start-up, this subroutine counts the glaciers. It should be called before glide_io_createall, + ! which needs to know nglacier to set up glacier output files with the right dimensions. + ! On restart, most of the required glacier arrays are in the restart file, and this subroutine + ! computes a few remaining variables. + ! Note: Glaciers are usually run with a no-ice BC to allow removal of inactive regions. + ! This means that any grid cells adjacent to the global boundary are not handled correctly. + ! The preceding halo update for 'thck' removes ice from these grid cells. + ! Note: If glacier%length_scale_factor /= 1, This subroutine modifies the values of model%numerics%dew, + ! model%numerics%dns, model%geometry%cell_area, and the grid coordinates (x0,y0) and (x1,y1). + ! This is done if the true grid cell dimensions differ from the nominal dimensions on a projected grid. + ! See comments near the top of glissade_glacier_init. + + call glissade_glacier_init(model, model%glacier) + + endif + + ! open all output files + call openall_out(model) + + ! create glide I/O variables + call glide_io_createall(model, model) + + ! initialize glissade components + !TODO - Remove call to init_velo in glissade_initialise? ! Most of what's done in init_velo is needed for SIA only, but still need velowk for call to wvelintg call init_velo(model) @@ -861,10 +860,11 @@ subroutine glissade_initialise(model, evolve_ice) ! 
adjacent to or beyond the global boundary. This is an appropriate treatment for ! ice state variables, but not for bed topography and related fields (like relx). !TODO - Is this halo update necessary? - if (model%general%global_bc == GLOBAL_BC_OUTFLOW .or. & - model%general%global_bc == GLOBAL_BC_NO_ICE) then + if (model%general%global_bc == GLOBAL_BC_OUTFLOW) then call parallel_halo_extrapolate(model%isostasy%relx, parallel) - else + elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then + call parallel_halo(model%isostasy%relx, parallel, zero_global_boundary_no_ice_bc = .false.) + else ! other global BCs, including periodic call parallel_halo(model%isostasy%relx, parallel) endif @@ -2232,7 +2232,6 @@ subroutine glissade_thickness_tracer_solve(model) ! pre-transport halo updates for thickness and tracers call parallel_halo(model%geometry%thck, parallel) - call parallel_halo(model%geometry%topg, parallel) call parallel_halo_tracers(model%geometry%tracers, parallel) call parallel_halo_tracers(model%geometry%tracers_usrf, parallel) call parallel_halo_tracers(model%geometry%tracers_lsrf, parallel) @@ -3070,12 +3069,14 @@ subroutine glissade_isostasy_solve(model) ! but the argument is included to be on the safe side. ! TODO: Do we need similar logic for halo updates of relx? - if (model%general%global_bc == GLOBAL_BC_OUTFLOW .or. & - model%general%global_bc == GLOBAL_BC_NO_ICE) then + if (model%general%global_bc == GLOBAL_BC_OUTFLOW) then call parallel_halo_extrapolate(model%geometry%topg, parallel) + elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then + call parallel_halo(model%geometry%topg, parallel, zero_global_boundary_no_ice_bc = .false.) else ! other global BCs, including periodic call parallel_halo(model%geometry%topg, parallel, & - periodic_offset_ew = model%numerics%periodic_offset_ew) + periodic_offset_ew = model%numerics%periodic_offset_ew, & + periodic_offset_ns = model%numerics%periodic_offset_ns) endif ! 
update the marine connection mask, which depends on topg diff --git a/libglissade/glissade_glacier.F90 b/libglissade/glissade_glacier.F90 index 67744610..3d6d2641 100644 --- a/libglissade/glissade_glacier.F90 +++ b/libglissade/glissade_glacier.F90 @@ -91,7 +91,7 @@ subroutine glissade_glacier_init(model, glacier) integer :: i, j, nc, ng, count integer :: iglobal, jglobal - integer :: ng_west, ng_east, ng_south, ng_north + integer :: ng_ne, ng_nw, ng_se, ng_sw integer :: min_id, max_id real(dp) :: max_glcval real(dp) :: theta_rad ! latitude in radians @@ -155,9 +155,15 @@ subroutine glissade_glacier_init(model, glacier) if (glacier%scale_area) then - ! Optionally, rescale the grid cell dimensions dew and dns + ! Optionally, rescale the grid cell dimensions and coordinates ! This is answer-changing throughout the code. + ! Note: The global arrays model%general%x1_global, etc., which are written to output files, are not rescaled. + ! These arrays are computed from the input file, which typically ignores the scale factor. if (glacier%length_scale_factor /= 1.0d0) then + model%general%x0 = model%general%x0 * glacier%length_scale_factor + model%general%y0 = model%general%y0 * glacier%length_scale_factor + model%general%x1 = model%general%x1 * glacier%length_scale_factor + model%general%y1 = model%general%y1 * glacier%length_scale_factor model%numerics%dew = model%numerics%dew * glacier%length_scale_factor model%numerics%dns = model%numerics%dns * glacier%length_scale_factor dew = model%numerics%dew @@ -661,35 +667,25 @@ subroutine glissade_glacier_init(model, glacier) endif call broadcast(glacier%ngdiag, rtest) - ! Define a mask whose value is 1 at vertices along the boundary between two glaciers. - ! At runtime, Cp is set to a large value at masked vertices to reduce flow between glaciers. + ! Define a mask whose value is 1 at vertices that border two different glaciers. + ! 
At runtime, Cp is set to a large value at these vertices to reduce mass exchange between glaciers. + !TODO: Consider removing the mask. This would allow CISM to reduce basal friction to thin the ice if needed. glacier%boundary_mask(:,:) = 0 - ! Loop over locally owned cells - do j = nhalo, nsn-nhalo - do i = nhalo, ewn-nhalo - ng = glacier%cism_glacier_id_init(i,j) - if (ng > 0) then - ng_west = glacier%cism_glacier_id_init(i-1,j) - ng_east = glacier%cism_glacier_id_init(i+1,j) - ng_south = glacier%cism_glacier_id_init(i,j-1) - ng_north = glacier%cism_glacier_id_init(i,j+1) - if (ng_west > 0 .and. ng_west /= ng) then - glacier%boundary_mask(i-1,j-1) = 1 - glacier%boundary_mask(i-1,j) = 1 - endif - if (ng_east > 0 .and. ng_east /= ng) then - glacier%boundary_mask(i,j-1) = 1 - glacier%boundary_mask(i,j) = 1 - endif - if (ng_south > 0 .and. ng_south /= ng) then - glacier%boundary_mask(i-1,j-1) = 1 - glacier%boundary_mask(i,j-1) = 1 - endif - if (ng_north > 0 .and. ng_north /= ng) then - glacier%boundary_mask(i-1,j) = 1 - glacier%boundary_mask(i,j) = 1 - endif + ! Loop over locally owned vertices + do j = nhalo+1, nsn-nhalo + do i = nhalo+1, ewn-nhalo + ng_ne = glacier%cism_glacier_id_init(i+1,j+1) + ng_nw = glacier%cism_glacier_id_init(i,j+1) + ng_se = glacier%cism_glacier_id_init(i+1,j) + ng_sw = glacier%cism_glacier_id_init(i,j) + if ( (ng_ne > 0 .and. ng_nw > 0 .and. ng_ne /= ng_nw) .or. & + (ng_ne > 0 .and. ng_se > 0 .and. ng_ne /= ng_se) .or. & + (ng_ne > 0 .and. ng_sw > 0 .and. ng_ne /= ng_sw) .or. & + (ng_nw > 0 .and. ng_se > 0 .and. ng_nw /= ng_se) .or. & + (ng_nw > 0 .and. ng_sw > 0 .and. ng_nw /= ng_sw) .or. & + (ng_se > 0 .and. ng_sw > 0 .and. 
ng_se /= ng_sw) ) then + glacier%boundary_mask(i,j) = 1 endif enddo enddo @@ -697,6 +693,7 @@ subroutine glissade_glacier_init(model, glacier) call staggered_parallel_halo(glacier%boundary_mask, parallel) if (verbose_glacier) then + call point_diag(glacier%cism_glacier_id_init, 'cism_glacier_id_init', itest, jtest, rtest, 7, 7) call point_diag(glacier%boundary_mask, 'Glacier boundary mask', itest, jtest, rtest, 7, 7) endif @@ -1431,8 +1428,7 @@ subroutine glissade_glacier_update(model, glacier) !------------------------------------------------------------------------- if (verbose_glacier) then - call point_diag(model%geometry%topg, 'topg', itest, jtest, rtest, 7, 7) - call point_diag(thck, 'Before advance_retreat, thck', itest, jtest, rtest, 7, 7) + call point_diag(model%geometry%thck, 'Before advance_retreat, thck', itest, jtest, rtest, 7, 7) endif ! Assign nonzero IDs in grid cells where ice has reached the minimum glacier thickness. @@ -2495,6 +2491,7 @@ subroutine glacier_invert_powerlaw_c(& if (verbose_glacier) then call point_diag(stag_thck, 'stag_thck (m)', itest, jtest, rtest, 7, 7) + call point_diag(stag_thck_target, 'stag_thck_target (m)', itest, jtest, rtest, 7, 7) call point_diag(stag_dthck, 'stag_thck - stag_thck_target (m)', itest, jtest, rtest, 7, 7) call point_diag(stag_dthck_dt, 'stag_dthck_dt (m/yr)', itest, jtest, rtest, 7, 7) call point_diag(powerlaw_c, 'new powerlaw_c', itest, jtest, rtest, 7, 7) @@ -2850,7 +2847,7 @@ subroutine glacier_advance_retreat(& call parallel_halo(cism_glacier_id, parallel) ! Check advanced cells (beyond the initial extent) for problematic glacier IDs. - ! This code protects against glacier 'pirating', which ccan occur when an advanced cell + ! This code protects against glacier 'pirating', which can occur when an advanced cell ! is adjacent to two different glaciers, call them A and B. ! Suppose the cell is fed primarily by glacier A but has the same ID as glacier B, ! 
and has a more positive SMB as a result of belonging to B rather than A. diff --git a/libglissade/glissade_inversion.F90 b/libglissade/glissade_inversion.F90 index c8d38bff..d12f25ca 100644 --- a/libglissade/glissade_inversion.F90 +++ b/libglissade/glissade_inversion.F90 @@ -47,7 +47,7 @@ module glissade_inversion ! a target ice thickness field. !----------------------------------------------------------------------------- - logical, parameter :: verbose_inversion = .false. + logical, parameter :: verbose_inversion = .false. !*********************************************************************** diff --git a/libglissade/glissade_utils.F90 b/libglissade/glissade_utils.F90 index 0a959f1e..4f1de07f 100644 --- a/libglissade/glissade_utils.F90 +++ b/libglissade/glissade_utils.F90 @@ -687,7 +687,7 @@ subroutine glissade_usrf_to_thck(usrf, topg, eus, thck) ! That is, if topg - eus < 0 (marine-based ice), and if the upper surface is too close ! to sea level to ground the ice, then the ice thickness is chosen to satisfy ! rhoi*H = -rhoo*(topg-eus). - ! Note: usrf, topg, eus and thck must all have the same units (often but not necessarily meters). + ! Note: usrf, topg, eus and thck must all have the same units (usually but not necessarily meters). use glimmer_physcon, only : rhoo, rhoi @@ -952,7 +952,7 @@ end subroutine glissade_input_fluxes ! subroutines belonging to the write_array_to_file interface - subroutine write_array_to_file_real8_2d(arr, fileunit, filename, parallel) + subroutine write_array_to_file_real8_2d(arr, fileunit, filename, parallel, write_binary) ! Copy the input array into a global array and write all values to an output file. ! 
This can be useful for debugging, if we want to find differences between two fields @@ -970,20 +970,38 @@ subroutine write_array_to_file_real8_2d(arr, fileunit, filename, parallel) integer, intent(in) :: fileunit character(len=*), intent(in) :: filename type(parallel_type), intent(in) :: parallel + logical, intent(in), optional :: write_binary integer :: i, j character(len=64) :: binary_str real(dp), dimension(:,:), allocatable :: arr_global + logical :: binary_output + + if (present(write_binary)) then + binary_output = write_binary + else + binary_output = .false. + endif call gather_var(arr, arr_global, parallel) if (main_task) then open(unit=fileunit, file=trim(filename), status='replace', position='append') - do j = 1, parallel%global_nsn - do i = 1, parallel%global_ewn - call double_to_binary(arr_global(i,j), binary_str) - write (fileunit, '(2i6,a4,a64)') i, j, ' ', binary_str + + if (binary_output) then + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + call double_to_binary(arr_global(i,j), binary_str) + write (fileunit, '(2i6,a4,a64)') i, j, ' ', binary_str + enddo enddo - enddo + else + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + write (fileunit, '(2i6,a4,f24.16)') i, j, ' ', arr_global(i,j) + enddo + enddo + endif + close(unit=fileunit) deallocate(arr_global) endif @@ -991,7 +1009,7 @@ subroutine write_array_to_file_real8_2d(arr, fileunit, filename, parallel) end subroutine write_array_to_file_real8_2d - subroutine write_array_to_file_real8_3d(arr, fileunit, filename, parallel, cycle_indices) + subroutine write_array_to_file_real8_3d(arr, fileunit, filename, parallel, write_binary, cycle_indices) ! Copy the input array into a global array and write all values to an output file. ! 
This can be useful for debugging, if we want to find differences between two fields @@ -1009,14 +1027,22 @@ subroutine write_array_to_file_real8_3d(arr, fileunit, filename, parallel, cycle integer, intent(in) :: fileunit character(len=*), intent(in) :: filename type(parallel_type), intent(in) :: parallel + logical, intent(in), optional :: write_binary logical, intent(in), optional :: cycle_indices ! if true, then index 3->1, 1->2, 2->3 integer :: i, j, k, kmax character(len=64) :: binary_str real(dp), dimension(:,:,:), allocatable :: arr_global real(dp), dimension(:,:,:), allocatable :: arr_cycle + logical :: binary_output logical :: cycle_ind + if (present(write_binary)) then + binary_output = write_binary + else + binary_output = .false. + endif + if (present(cycle_indices)) then cycle_ind = cycle_indices else @@ -1042,14 +1068,26 @@ subroutine write_array_to_file_real8_3d(arr, fileunit, filename, parallel, cycle if (main_task) then open(unit=fileunit, file=trim(filename), status='unknown') - do j = 1, parallel%global_nsn - do i = 1, parallel%global_ewn - do k = 1, kmax - call double_to_binary(arr_global(k,i,j), binary_str) - write (fileunit, '(3i6,a4,a64)') i, j, k, ' ', binary_str + + if (binary_output) then + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + do k = 1, kmax + call double_to_binary(arr_global(k,i,j), binary_str) + write (fileunit, '(3i6,a4,a64)') i, j, k, ' ', binary_str + enddo enddo enddo - enddo + else + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + do k = 1, kmax + write (fileunit, '(3i6,a4,f24.16)') i, j, k, ' ', arr_global(k,i,j) + enddo + enddo + enddo + endif + close(unit=fileunit) deallocate(arr_global) endif diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 3b3ec31b..fb6796f7 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -2408,15 +2408,12 @@ subroutine glissade_velo_higher_solve(model, & efvs_qp_3d) !WHL - 
debug - BFB check - if (0 == 1) then -!! if (verbose_reprosum .and. counter == 1) then + if (verbose_reprosum .and. counter == 1) then if (main_task) write(iulog,*) 'Write out matrices after assemble_stiffness_matrix' - call write_array_to_file(Auu_2d, 21, 'global_Auu1', parallel, cycle_indices = .true.) - call write_array_to_file(Auv_2d, 22, 'global_Auv1', parallel, cycle_indices = .true.) - call write_array_to_file(Avu_2d, 23, 'global_Avu1', parallel, cycle_indices = .true.) - call write_array_to_file(Avv_2d, 24, 'global_Avv1', parallel, cycle_indices = .true.) - call write_array_to_file(bu_2d, 25, 'global_bu1', parallel) - call write_array_to_file(bv_2d, 26, 'global_bv1', parallel) + call write_array_to_file(Auu_2d, 21, 'global_Auu1', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(Auv_2d, 22, 'global_Auv1', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(Avu_2d, 23, 'global_Avu1', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(Avv_2d, 24, 'global_Avv1', parallel, write_binary = .true., cycle_indices = .true.) endif !WHL - debug - BFB check @@ -2700,15 +2697,15 @@ subroutine glissade_velo_higher_solve(model, & call t_stopf('glissade_halo_bxxs') !WHL - debug - Write all the matrix elements and rhs elements (in binary form) to files -!! if (verbose_reprosum .and. counter == 1) then - if (0 == 1) then + if (verbose_reprosum .and. counter == 1) then if (main_task) write(iulog,*) 'Write out matrices after adding BC' - call write_array_to_file(Auu_2d, 21, 'global_Auu3', parallel, cycle_indices = .true.) - call write_array_to_file(Auv_2d, 22, 'global_Auv3', parallel, cycle_indices = .true.) - call write_array_to_file(Avu_2d, 23, 'global_Avu3', parallel, cycle_indices = .true.) - call write_array_to_file(Avv_2d, 24, 'global_Avv3', parallel, cycle_indices = .true.) 
- call write_array_to_file(bu_2d, 25, 'global_bu3', parallel) - call write_array_to_file(bv_2d, 26, 'global_bv3', parallel) +!! call write_array_to_file(Auu_2d(:,:,5), 30, 'global_Auu2', parallel) ! diagonal terms only + call write_array_to_file(Auu_2d, 31, 'global_Auu2', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(Auv_2d, 32, 'global_Auv2', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(Avu_2d, 33, 'global_Avu2', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(Avv_2d, 34, 'global_Avv2', parallel, write_binary = .true., cycle_indices = .true.) + call write_array_to_file(bu_2d, 35, 'global_bu2', parallel, write_binary = .true.) + call write_array_to_file(bv_2d, 36, 'global_bv2', parallel, write_binary = .true.) endif !--------------------------------------------------------------------------- @@ -3192,9 +3189,9 @@ subroutine glissade_velo_higher_solve(model, & ! Optional diagnostics if (verbose_beta .and. counter > 1 .and. mod(counter-1,12)==0) then -!! if (verbose_beta) then - call point_diag(log10(max(beta_internal,1.d-99)), 'log_beta', itest, jtest, rtest, 7, 7, '(f10.5)') + if (this_rank == rtest) write(iulog,*) 'Counter =', counter + call point_diag(log10(max(beta_internal,1.d-99)), 'log_beta', itest, jtest, rtest, 7, 7) if (solve_2d) then call point_diag(uvel_2d, 'Mean uvel (m/yr)', itest, jtest, rtest, 7, 7) call point_diag(vvel_2d, 'Mean vvel (m/yr)', itest, jtest, rtest, 7, 7) @@ -4157,7 +4154,7 @@ subroutine get_vertex_geometry(nx, ny, & staggered_jhi = parallel%staggered_jhi !---------------------------------------------------------------- - ! Compute the x and y coordinates of each vertex. + ! Copy the x and y coordinates of each vertex from x0 and y0. ! By convention, vertex (i,j) lies at the NE corner of cell(i,j). 
!---------------------------------------------------------------- From 1a8f477a6585137f1f5a15685be6f15a0999aa5e Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Mon, 12 Jan 2026 18:21:26 -0700 Subject: [PATCH 08/21] Subglacial hydrology changes to support reprosums This commit modifies the routing algorithm of the subglacial flow-routing scheme to support reproducible sums. The challenge is that as water fluxes are routed downstream, they often pass from one processor to another. This means that fluxes must be stored temporarily in halo cells, then routed further downstream on a subsequent call to the routing subroutine. Depending on the number of tasks, fluxes can enter a given cell in a different order as they are accumulated. This means the final fluxes can differ at roundoff level. The fix here is to convert the fluxes and related floating-point arrays (e.g., the weights that compute how much water is refrozen) from dp to i8 variables before passing them to the routing subroutine. Since the real variables are O(1) or less, they are multiplied by a factor of 1.e16 to conserve water and make the results as close as possible to those without reprosums. I was not able to make this work for (1) flux-routing schemes other than D8 or (2) the option with refreezing enabled. If either of these options is turned on, the fluxes are multiplied by floating-point numbers during each step of the routing, breaking reproducibility. I added some code to check these options at startup and to switch, if needed, to options consistent with reproducible sums. I encapsulated most of the reprosum-related code in a subroutine called route_flux_to_margin_or_halo. There are now two versions of this subroutine: one that works with reprosum-safe i8 arrays and one that uses dp arrays as in the original code. 
To support the new integer8 variables and subroutine, I added
subroutines parallel_halo_integer8_4d, parallel_global_sum_integer8_2d,
and parallel_reduce_sum_integer8 in the cism_parallel_module.

For an AIS test with subglacial hydrology turned on (with D8 routing and
without refreezing), I verified that the results are now BFB with
different numbers of tasks. The results agree at roundoff level with
runs without reprosums.
---
 libglide/glide_setup.F90 | 16 +-
 libglimmer/parallel_mpi.F90 | 185 ++++++-
 libglissade/glissade.F90 | 19 +-
 libglissade/glissade_basal_water.F90 | 766 ++++++++++++++++++++-------
 libglissade/glissade_velo_higher.F90 | 96 +---
 5 files changed, 786 insertions(+), 296 deletions(-)

diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90
index d28088be..682ff56d 100644
--- a/libglide/glide_setup.F90
+++ b/libglide/glide_setup.F90
@@ -1971,9 +1971,6 @@ subroutine print_options(model)
 model%basal_hydro%ho_flux_routing_scheme >= size(ho_flux_routing_scheme)) then
 call write_log('Error, HO flux routing scheme out of range', GM_FATAL)
 end if
- write(message,*) 'ho_flux_routing_scheme : ',model%basal_hydro%ho_flux_routing_scheme, &
- ho_flux_routing_scheme(model%basal_hydro%ho_flux_routing_scheme)
- call write_log(message)
 endif

 write(message,*) 'ho_whicheffecpress : ',model%options%which_ho_effecpress, &
@@ -3388,6 +3385,19 @@ subroutine print_basal_hydro(model)
 model%basal_hydro%ho_flux_routing_scheme >= size(ho_flux_routing_scheme)) then
 call write_log('Error, HO flux routing scheme out of range', GM_FATAL)
 end if
+ if (model%options%reproducible_sums) then
+ if (model%basal_hydro%ho_flux_routing_scheme /= HO_FLUX_ROUTING_D8) then
+ write(message,*) 'With reproducible sums, only D8 flux-routing is supported; switching to D8'
+ model%basal_hydro%ho_flux_routing_scheme = HO_FLUX_ROUTING_D8
+ call write_log(message)
+ endif
+ if (model%basal_hydro%btemp_scale > 0.0d0) then
+ write(message,*) 'With reproducible sums, the flux-routing does not 
support refreezing;' // &
 ' setting btemp_scale = 0'
 call write_log(message)
 model%basal_hydro%btemp_scale = 0.0d0
 endif
 endif ! reproducible sums
 write(message,*) 'ho_flux_routing_scheme : ',model%basal_hydro%ho_flux_routing_scheme, &
 ho_flux_routing_scheme(model%basal_hydro%ho_flux_routing_scheme)
 call write_log(message)
diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90
index 57c2b20e..6a941283 100644
--- a/libglimmer/parallel_mpi.F90
+++ b/libglimmer/parallel_mpi.F90
@@ -27,7 +27,7 @@ module cism_parallel
 use netcdf
- use glimmer_global, only : dp, sp
+ use glimmer_global, only : dp, sp, i8
 use glimmer_paramets, only: iulog
 use cism_reprosum_mod, only: cism_reprosum_calc, verbose_reprosum
@@ -297,6 +297,7 @@ module cism_parallel
 interface parallel_global_sum
 module procedure parallel_global_sum_integer_2d
 module procedure parallel_global_sum_integer_3d
+ module procedure parallel_global_sum_integer8_2d
 module procedure parallel_global_sum_real8_2d
 module procedure parallel_global_sum_real8_3d
 end interface
@@ -321,6 +322,7 @@ module cism_parallel
 module procedure parallel_halo_real8_2d
 module procedure parallel_halo_real8_3d
 module procedure parallel_halo_real8_4d
+ module procedure parallel_halo_integer8_4d
 end interface
 interface parallel_halo_extrapolate
@@ -401,6 +403,7 @@ module cism_parallel
 interface parallel_reduce_sum
 module procedure parallel_reduce_sum_integer
+ module procedure parallel_reduce_sum_integer8
 module procedure parallel_reduce_sum_real4
 module procedure parallel_reduce_sum_real8
 module procedure parallel_reduce_sum_integer_nvar
@@ -6117,6 +6120,45 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d)
 end function parallel_global_sum_integer_3d
+!=======================================================================
+
+ function parallel_global_sum_integer8_2d(a, parallel, mask_2d)
+
+ ! 
Calculates the global sum of a 2D integer(i8) field + + integer(i8), dimension(:,:),intent(in) :: a + type(parallel_type) :: parallel + integer, dimension(:,:), intent(in), optional :: mask_2d + + integer :: i, j + integer, dimension(parallel%local_ewn,parallel%local_nsn) :: mask + integer(i8) :: local_sum + integer(i8) :: parallel_global_sum_integer8_2d + + associate( & + local_ewn => parallel%local_ewn, & + local_nsn => parallel%local_nsn) + + if (present(mask_2d)) then + mask = mask_2d + else + mask = 1 + endif + + local_sum = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + local_sum = local_sum + a(i,j) + endif + enddo + enddo + parallel_global_sum_integer8_2d = parallel_reduce_sum(local_sum) + + end associate + + end function parallel_global_sum_integer8_2d + !======================================================================= function parallel_global_sum_real8_2d(a, parallel, mask_2d) @@ -8034,6 +8076,130 @@ subroutine parallel_halo_real8_4d(a, parallel) end subroutine parallel_halo_real8_4d + + subroutine parallel_halo_integer8_4d(a, parallel) + + use mpi_mod + implicit none + integer(i8),dimension(:,:,:,:) :: a + type(parallel_type) :: parallel + + integer :: erequest,ierror,one,nrequest,srequest,wrequest + integer(i8),dimension(size(a,1), size(a,2), lhalo, parallel%local_nsn-lhalo-uhalo) :: esend,wrecv + integer(i8),dimension(size(a,1), size(a,2), uhalo, parallel%local_nsn-lhalo-uhalo) :: erecv,wsend + integer(i8),dimension(size(a,1), size(a,2), parallel%local_ewn, lhalo) :: nsend,srecv + integer(i8),dimension(size(a,1), size(a,2), parallel%local_ewn, uhalo) :: nrecv,ssend + + ! 
begin + associate( & + outflow_bc => parallel%outflow_bc, & + no_ice_bc => parallel%no_ice_bc, & + local_ewn => parallel%local_ewn, & + local_nsn => parallel%local_nsn, & + east => parallel%east, & + west => parallel%west, & + north => parallel%north, & + south => parallel%south, & + southwest_corner => parallel%southwest_corner, & + southeast_corner => parallel%southeast_corner, & + northeast_corner => parallel%northeast_corner, & + northwest_corner => parallel%northwest_corner & + ) + + ! staggered grid + if (size(a,3)==local_ewn-1.and.size(a,4)==local_nsn-1) return + + ! unknown grid + if (size(a,3)/=local_ewn.or.size(a,4)/=local_nsn) then + write(iulog,*) "Unknown Grid: Size a=(", size(a,1), ",", size(a,2), ",", size(a,3), ",", size(a,4), ") & + &and local_ewn and local_nsn = ", local_ewn, ",", local_nsn + call parallel_stop(__FILE__,__LINE__) + endif + + ! unstaggered grid + call mpi_irecv(wrecv,size(wrecv),mpi_real8,west,west,& + comm,wrequest,ierror) + call mpi_irecv(erecv,size(erecv),mpi_real8,east,east,& + comm,erequest,ierror) + call mpi_irecv(srecv,size(srecv),mpi_real8,south,south,& + comm,srequest,ierror) + call mpi_irecv(nrecv,size(nrecv),mpi_real8,north,north,& + comm,nrequest,ierror) + + esend(:,:,:,:) = & + a(:,:,local_ewn-uhalo-lhalo+1:local_ewn-uhalo,1+lhalo:local_nsn-uhalo) + call mpi_send(esend,size(esend),mpi_real8,east,this_rank,comm,ierror) + wsend(:,:,:,:) = a(:,:,1+lhalo:1+lhalo+uhalo-1,1+lhalo:local_nsn-uhalo) + call mpi_send(wsend,size(wsend),mpi_real8,west,this_rank,comm,ierror) + + call mpi_wait(wrequest,mpi_status_ignore,ierror) + a(:,:,:lhalo,1+lhalo:local_nsn-uhalo) = wrecv(:,:,:,:) + call mpi_wait(erequest,mpi_status_ignore,ierror) + a(:,:,local_ewn-uhalo+1:,1+lhalo:local_nsn-uhalo) = erecv(:,:,:,:) + + nsend(:,:,:,:) = a(:,:,:,local_nsn-uhalo-lhalo+1:local_nsn-uhalo) + call mpi_send(nsend,size(nsend),mpi_real8,north,this_rank,comm,ierror) + ssend(:,:,:,:) = a(:,:,:,1+lhalo:1+lhalo+uhalo-1) + call 
mpi_send(ssend,size(ssend),mpi_real8,south,this_rank,comm,ierror) + + call mpi_wait(srequest,mpi_status_ignore,ierror) + a(:,:,:,:lhalo) = srecv(:,:,:,:) + call mpi_wait(nrequest,mpi_status_ignore,ierror) + a(:,:,:,local_nsn-uhalo+1:) = nrecv(:,:,:,:) + + if (outflow_bc) then ! set values in global halo to zero + ! interior halo cells should not be affected + + if (this_rank >= east) then ! at east edge of global domain + a(:,:,local_ewn-uhalo+1:,:) = 0 + endif + + if (this_rank <= west) then ! at west edge of global domain + a(:,:,:lhalo,:) = 0 + endif + + if (this_rank >= north) then ! at north edge of global domain + a(:,:,:,local_nsn-uhalo+1:) = 0 + endif + + if (this_rank <= south) then ! at south edge of global domain + a(:,:,:,:lhalo) = 0 + endif + + elseif (no_ice_bc) then + + ! Set values to zero in cells adjacent to the global boundary; + ! includes halo cells and one row of locally owned cells + + if (this_rank >= east) then ! at east edge of global domain + a(:,:,local_ewn-uhalo:,:) = 0 + endif + + if (this_rank <= west) then ! at west edge of global domain + a(:,:,:lhalo+1,:) = 0 + endif + + if (this_rank >= north) then ! at north edge of global domain + a(:,:,:,local_nsn-uhalo:) = 0 + endif + + if (this_rank <= south) then ! at south edge of global domain + a(:,:,:,:lhalo+1) = 0 + endif + + ! Some interior blocks have a single cell at a corner of the global boundary. + ! Set values in corner cells to zero, along with adjacent halo cells. + if (southwest_corner) a(:,:,:lhalo+1,:lhalo+1) = 0 + if (southeast_corner) a(:,:,local_ewn-lhalo:,:lhalo+1) = 0 + if (northeast_corner) a(:,:,local_ewn-lhalo:,local_nsn-lhalo:) = 0 + if (northwest_corner) a(:,:,:lhalo+1,local_nsn-lhalo:) = 0 + + endif ! outflow or no_ice bc + + end associate + + end subroutine parallel_halo_integer8_4d + !======================================================================= ! 
subroutines for 1D halo updates @@ -9276,6 +9442,23 @@ function parallel_reduce_sum_integer(x) end function parallel_reduce_sum_integer + function parallel_reduce_sum_integer8(x) + + use mpi_mod + implicit none + integer(i8) :: x + + integer :: ierror + integer(i8) :: recvbuf,sendbuf, parallel_reduce_sum_integer8 + + ! begin + sendbuf = x + call mpi_allreduce(sendbuf,recvbuf,1,mpi_integer8,mpi_sum,comm,ierror) + parallel_reduce_sum_integer8 = recvbuf + + end function parallel_reduce_sum_integer8 + + function parallel_reduce_sum_real4(x) use mpi_mod diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index cfcfec39..ceae1895 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -1773,6 +1773,7 @@ subroutine glissade_thermal_solve(model, dt) use glissade_masks, only: glissade_get_masks !WHL - debug use cism_parallel, only: parallel_reduce_max + use glissade_utils, only: write_array_to_file implicit none @@ -1911,12 +1912,6 @@ subroutine glissade_thermal_solve(model, dt) endwhere endif - !WHL - debug - Set mask = 0 where thck = 0 for dome test - ! An alternative would be to identify cells that have a path through land to the domain edge - where (model%geometry%thck == 0) - bwat_mask = 0 - endwhere - call parallel_halo(bwat_mask, parallel) ! Set the meltwater source for the basal hydrology scheme. @@ -1931,15 +1926,6 @@ subroutine glissade_thermal_solve(model, dt) model%basal_hydro%bmlt_hydro = 0.0d0 endwhere - call glissade_calcbwat(& - model%options%which_ho_bwat, & - model%basal_hydro, & - dt, & ! s - model%geometry%thck, & ! m - model%numerics%thklim_temp, & ! m - model%basal_melt%bmlt_ground, & ! m/s - model%basal_hydro%bwat) ! m - ! Compute the steady-state basal water flux based on a flux-routing scheme call glissade_bwat_flux_routing(& @@ -1960,7 +1946,8 @@ subroutine glissade_thermal_solve(model, dt) model%basal_hydro%bwat_diag, & ! m model%temper%bhydroflx, & ! W/m2 model%basal_hydro%head, & ! m - model%basal_hydro%grad_head) ! 
m/m + model%basal_hydro%grad_head, & ! m/m + reprosum_in = model%options%reproducible_sums) ! halo updates (not sure if all are needed) call parallel_halo(model%basal_hydro%bwatflx, parallel) diff --git a/libglissade/glissade_basal_water.F90 b/libglissade/glissade_basal_water.F90 index 593b33b2..8f9e03ef 100644 --- a/libglissade/glissade_basal_water.F90 +++ b/libglissade/glissade_basal_water.F90 @@ -26,13 +26,14 @@ module glissade_basal_water - use glimmer_global, only: dp + use glimmer_global, only: dp, i8 use glimmer_paramets, only: iulog, eps11, eps08 use glimmer_physcon, only: rhoi, rhow, lhci, grav, scyr use glimmer_log use glimmer_utils, only: point_diag use glide_types - use cism_parallel, only: main_task, this_rank, nhalo, parallel_type, parallel_halo + use cism_parallel, only: main_task, this_rank, nhalo, parallel_type, & + parallel_halo, parallel_global_sum !WHL - debug use glimmer_utils, only: double_to_binary @@ -42,8 +43,16 @@ module glissade_basal_water private public :: glissade_basal_water_init, glissade_calcbwat, glissade_bwat_flux_routing - logical, parameter :: verbose_bwat = .false. -!! logical, parameter :: verbose_bwat = .true. +!! logical, parameter :: verbose_bwat = .false. + logical, parameter :: verbose_bwat = .true. + + character(len=64) :: binary_str + + ! two versions of this subroutine; the second supports reproducible sums + interface route_flux_to_margin_or_halo + module procedure route_flux_to_margin_or_halo_real8 + module procedure route_flux_to_margin_or_halo_integer8 + end interface contains @@ -179,7 +188,8 @@ subroutine glissade_bwat_flux_routing(& delta_Tb, btemp_scale, & bwatflx, bwat_diag, & bhydroflx, & - head, grad_head) + head, grad_head, & + reprosum_in) ! Compute the subglacial water flux and water depth using a steady-state flux routing scheme. ! Water is routed down the hydropotential. For routing purposes, assume p_w = p_i (i.e., N = 0). 
@@ -218,7 +228,7 @@ subroutine glissade_bwat_flux_routing(& integer, dimension(nx,ny), intent(in) :: & bwat_mask, & ! mask to identify cells through which basal water is routed; - ! = 0 for floating and ocean cells; cells at global domain edge; + ! = 0 for floating and ocean cells, cells at global domain edge, ! and cells with thck = 0 and forced negative SMB floating_mask ! = 1 if ice is present (thck > thklim) and floating, else = 0 @@ -237,6 +247,11 @@ subroutine glissade_bwat_flux_routing(& head, & ! hydraulic head (m) grad_head ! gradient of hydraulic head (m/m), averaged to cell centers + ! Note: The reprosum option requires (1) D8 routing (each cell routes its flux to one downstream neighbor only) + ! and (2) no temperature-weighted refreezing. + logical, intent(in), optional :: & + reprosum_in ! if true, then do a computation independent of the number of tasks + ! Local variables integer :: i, j, p @@ -277,6 +292,8 @@ subroutine glissade_bwat_flux_routing(& real(dp) :: c_flux_to_depth ! proportionality coefficient in Sommers et al., Eq. 6 real(dp) :: Reynolds ! Reynolds number (unitless), = 0 for pure laminar flow + logical :: reprosum ! local version of reprosum_in + integer :: nx_test, ny_test real(dp), dimension(:,:), allocatable :: phi_test integer, dimension(:,:), allocatable :: mask_test @@ -285,12 +302,16 @@ subroutine glissade_bwat_flux_routing(& write(iulog,*) 'In glissade_bwat_flux_routing: rtest, itest, jtest =', rtest, itest, jtest endif + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + ! Uncomment if the following fields are not already up to date in halo cells ! call parallel_halo(thk, parallel) ! call parallel_halo(topg, parallel) call parallel_halo(bmlt_hydro, parallel) - !TODO - Add bfrz? - ! Compute the hydraulic head ! For purposes of flux routing, assume N = 0. @@ -313,7 +334,6 @@ subroutine glissade_bwat_flux_routing(& endif ! 
Route basal water down the gradient of hydraulic head, giving a water flux - ! TODO - Pass in bfrz_pot, return bfrz? call route_basal_water(& nx, ny, & @@ -328,7 +348,8 @@ subroutine glissade_bwat_flux_routing(& bwat_mask, & bwatflx, & bwatflx_refreeze, & - lakes) + lakes, & + reprosum) call parallel_halo(bwatflx, parallel) @@ -366,7 +387,11 @@ subroutine glissade_bwat_flux_routing(& bwatflx(:,:) = bwatflx(:,:) / (dx*dy) ! Given bwatflx_refreeze in m^3/s, compute bhydroflx in W/m2. - ! This is the heat flux needed to refreeze the meltwater held in each cell + ! This is the heat flux needed to refreeze the meltwater held in each cell. + ! This heat flux is supplied at the bed during the next thermal solve. + ! If there is more than enough heat to thaw the bed, some meltwater will be returned later + ! instead of refrozen. + bhydroflx(:,:) = bwatflx_refreeze(:,:) * rhoi * lhci / (dx*dy) if (verbose_bwat) then @@ -447,7 +472,8 @@ subroutine route_basal_water(& bwat_mask, & bwatflx, & bwatflx_refreeze, & - lakes) + lakes, & + reprosum_in) ! Route water from the basal melt field to its destination, recording the water flux along the way. ! Water flow direction is determined according to the gradient of the hydraulic head. @@ -456,11 +482,7 @@ subroutine route_basal_water(& ! This results in the lakes field, which is the difference between the filled head and the original head. ! The method used is by Quinn et. al. (1991). ! - ! Based on code by Jesse Johnson (2005), adapted from the glimmer_routing file by Ian Rutt. - - ! TODO - Pass in bfrz_pot, return bfrz. - - use cism_parallel, only: parallel_global_sum + ! Originally based on code by Jesse Johnson and Ian Rutt in the Glimmer model !WHL - debug use cism_parallel, only: parallel_globalindex, parallel_reduce_max @@ -494,19 +516,22 @@ subroutine route_basal_water(& integer, dimension(nx,ny), intent(in) :: & bwat_mask ! mask to identify cells through which basal water is routed; - ! 
= 1 where ice is present and not floating + ! excludes floating and ocean cells real(dp), dimension(nx,ny), intent(out) :: & bwatflx, & ! water flux through a grid cell (m^3/s) bwatflx_refreeze, & ! water flux held for refreezing (m^3/s) lakes ! lakes field, difference between filled and original head + logical, intent(in), optional :: & + reprosum_in ! if true, then do a computation independent of the number of tasks + ! Local variables integer :: nlocal ! number of locally owned cells integer :: count, count_max ! iteration counters - integer :: i, j, k, ii, jj, ip, jp, p - integer :: i1, j1, i2, j2, itmp, jtmp, iglobal, jglobal + integer :: i, j, k, iglobal, jglobal + integer :: ii, jj, imax, jmax logical :: finished ! true when an iterative loop has finished @@ -514,15 +539,13 @@ subroutine route_basal_water(& sorted_ij ! i and j indices of all cells, sorted from low to high values of head real(dp), dimension(-1:1,-1:1,nx,ny) :: & - flux_fraction, & ! fraction of flux from each cell that flows downhill to each of 8 neighbors - bwatflx_halo ! water flux (m^3/s) routed to a neighboring halo cell; routed further in next iteration + flux_fraction ! fraction of flux from each cell that flows downhill to each of 8 neighbors real(dp), dimension(nx,ny) :: & head_filled, & ! head after depressions are filled (m) + btemp_weight, & ! temperature-dependent weighting factor, favoring flow where the bed is thawed bwatflx_accum, & ! water flux through the cell (m^3/s) accumulated over multiple iterations - bwatflx_refreeze_accum,& ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations - sum_bwatflx_halo, & ! bwatflx summed over the first 2 dimensions in each grid cell - btemp_weight ! temperature-dependent weighting factor, favoring flow where the bed is thawed + bwatflx_refreeze_accum ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations integer, dimension(nx,ny) :: & local_mask, & ! 
= 1 for cells owned by the local processor, else = 0 @@ -537,11 +560,33 @@ subroutine route_basal_water(& err, & ! water conservation error global_flux_sum ! flux sum over all cells in global domain + ! The following i8 variables are for computing reproducible sums + integer(i8), dimension(nx,ny) :: & + bwatflx_int, & ! water flux through a grid cell (m^3/s) + btemp_weight_int, & ! temperature-dependent weighting factor, favoring flow where the bed is thawed + bwatflx_accum_int, & ! water flux through the cell (m^3/s) accumulated over multiple iterations + bwatflx_refreeze_accum_int ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations + + integer(i8), dimension(-1:1,-1:1,nx,ny) :: & + flux_fraction_int ! fraction of flux from each cell that flows downhill to each of 8 neighbors + + real(dp), parameter :: & + factor_bwatflx = 1.d16 ! factor for converting between bwatflx and bwatflx_int; + ! large value desired for water mass conservation + + logical :: reprosum ! local version of reprosum_in + character(len=100) :: message !WHL - debug character(len=64) :: binary_str + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + ! Allocate the sorted_ij array nlocal = parallel%own_ewn * parallel%own_nsn @@ -622,21 +667,25 @@ subroutine route_basal_water(& ! (2) When water enters a frozen cell (delta_Tb > 0), btemp_weight is used to determine ! how much of the flux is refrozen in place rather than passing through. ! A low value of btemp_weight means that less water passes through. + ! Note: For reproducible sums, refreezing is not supported; must have btemp_weight = 1 everywhere. btemp_weight = 1.0d0 if (btemp_scale > 0.0d0) then - where (bwat_mask == 1) - where (delta_Tb > 0.0d0) - btemp_weight = exp(-delta_Tb/btemp_scale) + if (.not. 
reprosum) then + where (bwat_mask == 1) + where (delta_Tb > 0.0d0) + btemp_weight = exp(-delta_Tb/btemp_scale) + endwhere endwhere - endwhere - if (verbose_bwat) then - call point_diag(delta_Tb, 'Tpmp - Tb', itest, jtest, rtest, 7, 7) - call point_diag(btemp_weight, 'btemp_weight', itest, jtest, rtest, 7, 7) endif endif + if (verbose_bwat) then + call point_diag(delta_Tb, 'Tpmp - Tb', itest, jtest, rtest, 7, 7) + call point_diag(btemp_weight, 'btemp_weight', itest, jtest, rtest, 7, 7) + endif + ! Compute the fraction of the incoming flux sent to each downstream neighbor. call get_flux_fraction(& @@ -650,14 +699,8 @@ subroutine route_basal_water(& bwat_mask, & flux_fraction) - ! Initialize bwatflx in locally owned cells with the basal melt, which will be routed downslope. - ! Multiply by area, so units are m^3/s. - ! The halo water flux, bwatflx_halo, holds water routed to halo cells; - ! it will be routed downhill during the next iteration. - ! The accumulated flux, bwatflx_accum, holds the total flux over multiple iterations. - ! Some or all of the water entering a frozen cell can be refrozen in place. - ! The heat flux associated with refreezing is passed to the next thermal solve. - ! If this heat flux is enough to thaw the cell, some of the meltwater is returned later. + ! Initialize bwatflx in locally owned cells. + ! Set to the local melt rate, multiplied by area (so the units are m^3/s). bwatflx = 0.0d0 do j = nhalo+1, ny-nhalo @@ -666,174 +709,132 @@ subroutine route_basal_water(& enddo enddo - ! Initialize other fluxes - bwatflx_halo = 0.0d0 - bwatflx_refreeze = 0.0d0 - bwatflx_accum = 0.0d0 - bwatflx_refreeze_accum = 0.0d0 - ! Compute total input of meltwater (m^3/s) total_flux_in = parallel_global_sum(bwatflx, parallel) - if (verbose_bwat .and. this_rank == rtest) then - write(iulog,*) ' ' write(iulog,*) 'Total input basal melt flux (m^3/s):', total_flux_in +!! call double_to_binary(total_flux_in, binary_str) +!! 
write(iulog,*) ' Binary string', binary_str endif - ! Loop over locally owned cells, from highest to lowest. - ! During each iteration, there are two possible outcomes for routing: - ! (1) Routed to the ice sheet margin, to a cell with bwat_mask = 0. - ! In this case, the routing of that flux is done. - ! (2) Routed to a halo cell, i.e. a downslope cell on a neighboring processor. - ! In this case, the flux will be routed further downhill on the next iteration. - ! When all the water has been routed to the margin, we are done. + ! Route the water downstream, keeping track of the steady-state flux through each cell. + ! The loop goes from highest to lowest values of head on the local processor. + ! At the end of the loop, all the incoming flux has either been + ! (1) routed to the ice sheet margin, + ! (2) set aside for later refreezing, or + ! (3) routed to a halo cell, from which it will continue downstream on the next iteration. + ! When all the water has been routed to the margin or set aside for refreezing, we are done. - count = 0 ! Note: It is hard to predict how many iterations will be sufficient. - ! With Dinf or FD8, we can have flow back and forth across processor boundaries, + ! With Dinf or FD8 we can have flow back and forth across processor boundaries, ! requiring many iterations to reach the margin. ! For Greenland 4 km, Dinf requires ~20 iterations on 4 cores, and FD8 can require > 40. ! For Antarctica 8 km, FD8 can require > 50. - count_max = 100 - finished = .false. - do while (.not.finished) - - count = count + 1 + ! Initialize the cumulative fluxes + bwatflx_accum = 0.0d0 + bwatflx_refreeze_accum = 0.0d0 - if (verbose_bwat .and. this_rank == rtest) then - write(iulog,*) 'flux routing, count =', count - endif + if (reprosum) then - do k = nlocal, 1, -1 + ! Convert bwatflx to a scaled i8 array + bwatflx_int(:,:) = nint(bwatflx(:,:)*factor_bwatflx, i8) - ! Get i and j indices of current cell - i = sorted_ij(k,1) - j = sorted_ij(k,2) + ! 
Convert flux_fraction to i8 + ! Note: This will work only for the D8 scheme, where all the flux goes downstream + ! to a single cell. + flux_fraction_int(:,:,:,:) = nint(flux_fraction(:,:,:,:), i8) - ! Route the flux - if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then + ! Convert btemp_weight to i8 + btemp_weight_int(:,:) = 1 + ! Note: Can round up to 1 and down to 0 by uncommenting the following line. + ! However, a mix of 1's and 0's leads to oscillations in basal temperature, + ! so it is safer to turn off refreezing by setting btemp_weight = 1 everywhere. +! btemp_weight_int(:,:) = nint(btemp_weight(:,:), i8) - ! Distribute the flux to downslope neighbors. - ! Where the bed is frozen, all or part of the flux is refrozen in place instead of being routed downstream. - do jj = -1,1 - do ii = -1,1 - ip = i + ii - jp = j + jj - if (flux_fraction(ii,jj,i,j) > 0.0d0) then - if (halo_mask(ip,jp) == 1) then - bwatflx_halo(ii,jj,i,j) = bwatflx(i,j)*flux_fraction(ii,jj,i,j)*btemp_weight(i,j) - bwatflx_refreeze(i,j) = bwatflx_refreeze(i,j) & - + bwatflx(i,j)*flux_fraction(ii,jj,i,j)*(1.0d0 - btemp_weight(i,j)) - if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then - write(iulog,*) 'Flux to halo, i, j, ii, jj, flux:', & - i, j, ii, jj, bwatflx(i,j)*flux_fraction(ii,jj,i,j) - endif - elseif (local_mask(ip,jp) == 1) then - bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx(i,j)*flux_fraction(ii,jj,i,j)*btemp_weight(i,j) - bwatflx_refreeze(i,j) = bwatflx_refreeze(i,j) & - + bwatflx(i,j)*flux_fraction(ii,jj,i,j)*(1.0d0 - btemp_weight(i,j)) - endif - endif ! flux_fraction > 0 - enddo - enddo - endif ! bwat_mask = 1, bwatflx > 0 - enddo ! loop from high to low + ! Initialize other arrays + bwatflx_accum_int = 0 + bwatflx_refreeze_accum_int = 0 - ! Accumulate bwatflx from the latest iteration, then reset to zero for the next iteration. 
- bwatflx_accum = bwatflx_accum + bwatflx - bwatflx_refreeze_accum = bwatflx_refreeze_accum + bwatflx_refreeze - bwatflx = 0.0d0 - bwatflx_refreeze = 0.0d0 + endif ! reprosum - if (verbose_bwat .and. this_rank == rtest .and. count <= 2) then - i = itest - j = jtest - write(iulog,*) 'i, j, bwatflx_accum:', i, j, bwatflx_accum(i,j) - endif + count = 0 + count_max = 100 + finished = .false. - ! If bwatflx_halo = 0 everywhere, then we are done. - ! (If the remaining flux is very small (< eps11), discard it to avoid - ! unnecessary extra iterations.) - ! If bwatflx_halo remains, then communicate it to neighboring tasks and - ! continue routing on the next iteration. - - do j = 1, ny - do i = 1, nx - sum_bwatflx_halo(i,j) = sum(bwatflx_halo(:,:,i,j)) -! if (verbose_bwat .and. sum_bwatflx_halo(i,j) > eps11 .and. count > 50) then -! write(iulog,*) 'Nonzero bwatflx_halo, count, rank, i, j, sum_bwatflx_halo:', & -! count, this_rank, i, j, sum_bwatflx_halo(i,j) -! call parallel_globalindex(i, j, iglobal, jglobal, parallel) -! write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -! endif - enddo - enddo - global_flux_sum = parallel_global_sum(sum_bwatflx_halo, parallel) + do while (.not.finished) - if (verbose_bwat .and. count <= 2) then - if (this_rank == rtest) then - write(iulog,*) 'Before halo update, sum of bwatflx_halo:', global_flux_sum - endif - call point_diag(sum_bwatflx_halo, 'sum_bwatflx_halo', itest, jtest, rtest, 7, 7) + count = count + 1 + if (verbose_bwat .and. this_rank == rtest) write(iulog,*) 'flux routing, count =', count + if (count > count_max) then + call write_log('Hydrology error: too many iterations in route_basal_water', GM_FATAL) endif - if (global_flux_sum > eps11) then - - finished = .false. + if (reprosum) then + + ! route downstream + ! 
Note: The fluxes are scaled by factor_bwatflx + + call route_flux_to_margin_or_halo(& + nx, ny, nlocal, & + itest, jtest, rtest, count, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction_int, & + btemp_weight_int, & + bwatflx_int, & + bwatflx_accum_int, & + bwatflx_refreeze_accum_int, & + finished) - ! Communicate bmltflx_halo to the halo cells of neighboring processors - call parallel_halo(bwatflx_halo(:,:,:,:), parallel) - - ! bmltflx_halo is now available in the halo cells of the local processor. - ! Route downslope to the adjacent locally owned cells. - ! These fluxes will be routed further downslope during the next iteration. - - do j = 2, ny-1 - do i = 2, nx-1 - if (halo_mask(i,j) == 1 .and. sum(bwatflx_halo(:,:,i,j)) > 0.0d0) then - do jj = -1,1 - do ii = -1,1 - if (bwatflx_halo(ii,jj,i,j) > 0.0d0) then - ip = i + ii - jp = j + jj - if (local_mask(ip,jp) == 1) then - bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx_halo(ii,jj,i,j) - if (verbose_bwat .and. ip==itest .and. jp==jtest .and. this_rank==rtest & - .and. count <= 2) then - write(iulog,*) 'Nonzero bwatflx from halo, rank, i, j:', & - this_rank, ip, jp, bwatflx_halo(ii,jj,i,j) - endif - endif - endif ! bwatflx_halo > 0 to a local cell - enddo ! ii - enddo ! jj - endif ! bwatflx_halo > 0 from this halo cell - enddo ! i - enddo ! j + if (verbose_bwat .and. this_rank == rtest .and. count <= 2) then + i = itest; j = jtest + write(iulog,*) 'count, rank i, j, bwatflx_accum (m/yr), bwatflx_refreeze_accum:', & + count, rtest, i, j, real(bwatflx_accum_int(i,j),dp)/factor_bwatflx, & + real(bwatflx_refreeze_accum_int(i,j),dp)/factor_bwatflx + endif - ! Reset bwatflx_halo for the next iteration - bwatflx_halo = 0.0d0 + else ! 
non-reproducible sums + + call route_flux_to_margin_or_halo(& + nx, ny, nlocal, & + itest, jtest, rtest, count, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction, & + btemp_weight, & + bwatflx, & + bwatflx_accum, & + bwatflx_refreeze_accum, & + finished) - global_flux_sum = parallel_global_sum(bwatflx, parallel) if (verbose_bwat .and. this_rank == rtest .and. count <= 2) then - ! Should be equal to the global sum of bwatflx_halo computed above - write(iulog,*) 'After halo update, sum(bwatflx from halo) =', global_flux_sum - write(iulog,*) ' ' + i = itest; j = jtest + write(iulog,*) 'count, rank i, j, bwatflx_accum(m/yr), bwatflx_refreeze_accum:', & + count, rtest, i, j, bwatflx_accum(i,j) * scyr/(dx*dy), & + bwatflx_refreeze_accum(i,j) * scyr/(dx*dy) endif - else ! bwatflx_halo = 0 everywhere; no fluxes to route to adjacent processors - if (verbose_bwat .and. this_rank == rtest) write(iulog,*) 'Done routing fluxes' - finished = .true. - bwatflx = bwatflx_accum - bwatflx_refreeze = bwatflx_refreeze_accum - endif + endif ! reprosum - if (count > count_max) then - call write_log('Hydrology error: too many iterations in route_basal_water', GM_FATAL) - endif + enddo ! finished - enddo ! finished routing + if (reprosum) then + ! Convert fluxes back to real(dp) + bwatflx_accum = real(bwatflx_accum_int, dp) / factor_bwatflx + bwatflx_refreeze_accum = real(bwatflx_refreeze_accum_int, dp) / factor_bwatflx + endif + + ! Copy the accumulated values to the output arrays bwatflx and bwatflx_refreeze + bwatflx = bwatflx_accum + bwatflx_refreeze = bwatflx_refreeze_accum + if (verbose_bwat .and. this_rank == rtest) write(iulog,*) 'Done routing fluxes' ! Identify cells just beyond the ice sheet margin, which can receive from upstream but not send downstream where (bwat_mask == 0 .and. bwatflx > 0.0d0) @@ -849,18 +850,11 @@ subroutine route_basal_water(& if (verbose_bwat .and. 
this_rank == rtest) then write(iulog,*) 'Total bwatflx at margin (m^3/s):', total_flux_margin - call double_to_binary(total_flux_margin, binary_str) - write(iulog,*) ' ', binary_str write(iulog,*) 'Total bwatflx_refreeze (m^3/s)=', total_flux_refreeze - call double_to_binary(total_flux_refreeze, binary_str) - write(iulog,*) ' ', binary_str write(iulog,*) 'Total bwatflx (m^3/s)=', total_flux_out - call double_to_binary(total_flux_out, binary_str) - write(iulog,*) ' ', binary_str write(iulog,*) 'Difference between output and input =', total_flux_out - total_flux_in endif - ! Not sure if a threshold of eps11 is large enough. Increase if needed. if (total_flux_in > 0.0d0) then err = abs(total_flux_in - total_flux_out) @@ -1066,7 +1060,7 @@ subroutine fill_depressions(& real(dp), dimension(nx,ny), intent(out) :: & phi ! output field with depressions filled - ! Local variables -------------------------------------- + ! Local variables logical, dimension(nx,ny) :: & known ! = true for cells where the final phi(i,j) is known @@ -1276,7 +1270,7 @@ subroutine sort_heights(& integer, intent(in) :: & nx, ny, & ! number of grid cells in each direction nlocal, & ! number of locally owned cells - itest, jtest, rtest ! coordinates of diagnostic point + itest, jtest, rtest ! coordinates of diagnostic point !! not currently used real(dp), dimension(nx,ny), intent(in) :: & phi ! input field, to be sorted from low to high @@ -1319,17 +1313,6 @@ subroutine sort_heights(& call indexx(vect, ind) - if (verbose_bwat .and. this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'Sort from low to high, nlocal =', nlocal - write(iulog,*) 'k, local i and j, ind(k), phi:' - do k = nlocal, nlocal-10, -1 - i = floor(real(ind(k)-1)/real(ny_local)) + 1 + nhalo - j = mod(ind(k)-1,ny_local) + 1 + nhalo - write(iulog,*) k, i, j, ind(k), phi(i,j) - enddo - endif - ! Fill the sorted_ij array with the i and j values of each cell. ! 
Note: These are the i and j values we would have if there were no halo cells.
 do k = 1, nlocal
@@ -1600,6 +1583,391 @@ subroutine get_flux_fraction(&
 
 end subroutine get_flux_fraction
 
+!==============================================================
+
+ subroutine route_flux_to_margin_or_halo_real8(&
+ nx, ny, nlocal, &
+ itest, jtest, rtest, count, &
+ parallel, &
+ sorted_ij, &
+ local_mask, &
+ halo_mask, &
+ bwat_mask, &
+ flux_fraction, &
+ btemp_weight, &
+ bwatflx, &
+ bwatflx_accum, &
+ bwatflx_refreeze_accum, &
+ finished)
+
+ ! Given the input bwatflx, route the water downstream, keeping track of fluxes along the way.
+ ! The loop goes from highest to lowest values of 'head' on the local processor.
+ ! At the end of the loop, all the incoming flux has either been
+ ! (1) routed to the ice sheet margin;
+ ! (2) set aside for later refreezing; or
+ ! (3) routed to a halo cell, from which it continues downstream the next time the subroutine is called.
+ ! The subroutine is called iteratively until no water remains in halo cells.
+
+ implicit none
+
+ ! Input/output variables
+
+ integer, intent(in) :: &
+ nx, ny, & ! number of cells in each direction
+ nlocal, & ! number of locally owned grid cells on the processor
+ itest, jtest, rtest, & ! coordinates of diagnostic point
+ count ! iteration count (diagnostic only)
+
+ type(parallel_type), intent(in) :: &
+ parallel ! info for parallel communication
+
+ integer, dimension(nlocal,2), intent(in) :: &
+ sorted_ij ! i and j indices of each local cell, sorted low to high
+
+ integer, dimension(nx,ny), intent(in) :: &
+ local_mask, & ! = 1 for cells owned by the local processor, else = 0
+ halo_mask, & ! = 1 for the layer of halo cells adjacent to locally owned cells, else = 0
+ bwat_mask ! = 1 for cells through which basal water is routed; excludes floating and ocean cells
+
+ real(dp), dimension(-1:1,-1:1,nx,ny), intent(in) :: &
+ flux_fraction ! 
fraction of flux from a cell that flows downhill to each of 8 neighbors
+                            ! last two indices identify the source cell;
+                            ! 1st two indices give relative location of receiving cell
+
+    real(dp), dimension(nx,ny), intent(in) :: &
+         btemp_weight            ! temperature-dependent weighting factor, favoring flow where the bed is thawed
+
+    real(dp), dimension(nx,ny), intent(inout) :: &
+         bwatflx,                & ! on input: water flux (m^3/s) to be routed to the margin or halo
+                                   ! on output: flux routed to halo, to be routed further next time
+         bwatflx_accum,          & ! cumulative bwatflx (m^3/s) over multiple iterations
+         bwatflx_refreeze_accum    ! cumulative bwatflx_refreeze (m^3/s) over multiple iterations
+
+    logical, intent(inout) :: &
+         finished                ! initially F; set to T when all water has been routed as far as it can go
+
+    ! Local variables
+
+    integer :: i, j, k
+    integer :: ii, jj, ip, jp
+
+    real(dp), dimension(-1:1,-1:1,nx,ny):: &
+         bwatflx_halo            ! flux routed to halo cells
+                                 ! last two indices identify the source cell;
+                                 ! 1st two indices give relative location of receiving cell
+
+    real(dp), dimension(nx,ny) :: &
+         bwatflx_refreeze,     & ! flux (m^3/s) saved for later refreezing; not routed further downstream
+         sum_bwatflx_halo        ! bwatflx_halo summed over the first 2 indices
+
+    real(dp) :: &
+         flx_thru,             & ! flux (m^3/s) that continues downstream
+         global_halo_sum         ! global sum of water flux in halo cells
+
+    ! Initialize fluxes
+    bwatflx_refreeze = 0.0d0
+    bwatflx_halo = 0.0d0
+
+    ! loop from high to low values on the local processor
+    do k = nlocal, 1, -1
+
+       ! Get i and j indices of current cell
+       i = sorted_ij(k,1)
+       j = sorted_ij(k,2)
+
+       if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then
+
+          ! Distribute the flux to downstream neighbors.
+          ! Based on the temperature-dependent weighting factor btemp_weight, all or part of the flux
+          ! is refrozen in place instead of being routed downstream.
+ flx_thru = bwatflx(i,j) * btemp_weight(i,j) + bwatflx_refreeze(i,j) = bwatflx(i,j) * (1.0d0 - btemp_weight(i,j)) + do jj = -1,1 + do ii = -1,1 + ip = i + ii + jp = j + jj + if (flux_fraction(ii,jj,i,j) > 0.0d0) then + if (halo_mask(ip,jp) == 1) then + bwatflx_halo(ii,jj,i,j) = flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to halo, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + elseif (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to neighbor, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + endif + endif ! flux_fraction > 0 + enddo ! ii + enddo ! jj + endif ! bwat_mask = 1, bwatflx > 0 + enddo ! loop from high to low + + ! Accumulate the fluxes in the output arrays + bwatflx_accum = bwatflx_accum + bwatflx + bwatflx_refreeze_accum = bwatflx_refreeze_accum + bwatflx_refreeze + + ! Compute the total bwatflx in halo cells + do j = 1, ny + do i = 1, nx + sum_bwatflx_halo(i,j) = sum(bwatflx_halo(:,:,i,j)) + enddo + enddo + global_halo_sum = parallel_global_sum(sum_bwatflx_halo, parallel) + + ! If bwatflx_halo = 0 everywhere, then we are done. + ! Where bwatflx_halo is nonzero, communicate it to the neighboring task. + ! It will be routed further downstream the next time this subroutine is called. + + if (global_halo_sum > 0.0d0) then + + if (verbose_bwat .and. count <= 2) then + if (this_rank == rtest) write(iulog,*) 'Before halo update, global_halo_sum:', global_halo_sum + call point_diag(sum_bwatflx_halo, 'sum_bwatflx_halo', itest, jtest, rtest, 7, 7) + endif + + ! Reset bwatflx to zero for the halo transfer + bwatflx = 0.0d0 + + ! 
Communicate bwatflx_halo to the halo cells of neighboring processors
+       call parallel_halo(bwatflx_halo(:,:,:,:), parallel)
+
+       ! bwatflx_halo is now available in the halo cells of the local processor.
+       ! Route downslope to the adjacent locally owned cells.
+       ! These fluxes will be routed further downstream during the next iteration.
+
+       do j = 2, ny-1
+          do i = 2, nx-1
+             if (halo_mask(i,j) == 1 .and. sum(bwatflx_halo(:,:,i,j)) > 0.0d0) then
+                do jj = -1,1
+                   do ii = -1,1
+                      if (bwatflx_halo(ii,jj,i,j) > 0.0d0) then
+                         ip = i + ii
+                         jp = j + jj
+                         if (local_mask(ip,jp) == 1) then
+                            bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx_halo(ii,jj,i,j)
+                            if (verbose_bwat .and. ip==itest .and. jp==jtest .and. this_rank==rtest .and. count <= 2) then
+                               write(iulog,*) 'Nonzero bwatflx from halo, rank, i, j:', &
+                                    this_rank, ip, jp, bwatflx_halo(ii,jj,i,j)
+                            endif
+                         endif
+                      endif   ! bwatflx_halo > 0 to a local cell
+                   enddo   ! ii
+                enddo   ! jj
+             endif   ! bwatflx_halo > 0 from this halo cell
+          enddo   ! i
+       enddo   ! j
+
+    else
+
+       finished = .true.   ! no water in halo cells to route further
+
+    endif   ! global_halo_sum > 0
+
+  end subroutine route_flux_to_margin_or_halo_real8
+
+!==============================================================
+
+  subroutine route_flux_to_margin_or_halo_integer8(&
+       nx, ny, nlocal, &
+       itest, jtest, rtest, count, &
+       parallel, &
+       sorted_ij, &
+       local_mask, &
+       halo_mask, &
+       bwat_mask, &
+       flux_fraction, &
+       btemp_weight, &
+       bwatflx, &
+       bwatflx_accum, &
+       bwatflx_refreeze_accum, &
+       finished)
+
+    ! Given the input bwatflx, route the water downstream, keeping track of fluxes along the way.
+    ! The loop goes from highest to lowest values of 'head' on the local processor.
+    ! At the end of the loop, all the incoming flux has either been
+    ! (1) routed to the ice sheet margin;
+    ! (2) set aside for later refreezing; or
+    ! (3) routed to a halo cell, from which it continues downstream the next time the subroutine is called.
+    ! 
The subroutine is called iteratively until no water remains in halo cells.
+
+    implicit none
+
+    ! Input/output variables
+
+    integer, intent(in) :: &
+         nx, ny,              & ! number of cells in each direction
+         nlocal,              & ! number of locally owned grid cells on the processor
+         itest, jtest, rtest, & ! coordinates of diagnostic point
+         count                  ! iteration count (diagnostic only)
+
+    type(parallel_type), intent(in) :: &
+         parallel               ! info for parallel communication
+
+    integer, dimension(nlocal,2), intent(in) :: &
+         sorted_ij              ! i and j indices of each local cell, sorted low to high
+
+    integer, dimension(nx,ny), intent(in) :: &
+         local_mask,          & ! = 1 for cells owned by the local processor, else = 0
+         halo_mask,           & ! = 1 for the layer of halo cells adjacent to locally owned cells, else = 0
+         bwat_mask              ! = 1 for cells through which basal water is routed; excludes floating and ocean cells
+
+    ! Note: Both flux_fraction and btemp_weight are constrained to be 0 or 1.
+    !       This means that the routing is limited to D8 (all the flux goes to one downstream cell),
+    !       and partial refreezing is not allowed (i.e., btemp_weight = 1 everywhere).
+    !       Thus, btemp_weight is not needed, but I kept it to keep the code similar to the subroutine above.
+    !       We could make refreezing all-or-nothing (i.e., weights of either 0 or 1), but this leads to
+    !       oscillations in bed temperature.
+    !       I thought of rescaling flux_fraction and btemp_weight to largish i8 integers (e.g., 1000)
+    !       to keep everything BFB, and then scaling back at the end. The problem is that this subroutine
+    !       may need to be called repeatedly, and each scaling would lead to larger and larger integers
+    !       that eventually exceed the i8 limit on integer size, ~10^(19).
+
+    integer(i8), dimension(-1:1,-1:1,nx,ny), intent(in) :: &
+         flux_fraction          ! fraction of flux from a cell that flows downhill to each of 8 neighbors
+                                ! last two indices identify the source cell;
+                                ! 
1st two indices give relative location of receiving cell + + integer(i8), dimension(nx,ny), intent(in) :: & + btemp_weight ! temperature-dependent weighting factor, favoring flow where the bed is thawed + + integer(i8), dimension(nx,ny), intent(inout) :: & + bwatflx, & ! on input: water flux (m^3/s * factor_bwatflx) to be routed to the margin or halo + ! on output: flux routed to halo, to be routed further next time + bwatflx_accum, & ! cumulative bwatflx (m^3/s * factor_bwatflx) over multiple iterations + bwatflx_refreeze_accum ! cumulative bwatflx_refreeze (m^3/s * factor_bwatflx) over multiple iterations + + logical, intent(inout) :: & + finished ! initially F; set to T when all water has been routed as far as it can go + + ! Local variables + + integer :: i, j, k + integer :: ii, jj, ip, jp + + ! Note: Some of the local variables are scaled by products of all three scale factors above. + integer(i8), dimension(-1:1,-1:1,nx,ny):: & + bwatflx_halo ! flux routed to halo cells + ! last two indices identify the source cell; + ! 1st two indices give relative location of receiving cell + + integer(i8), dimension(nx,ny) :: & + bwatflx_refreeze, & ! flux saved for later refreezing; not routed further downstream + sum_bwatflx_halo ! bwatflx_halo summed over the first 2 indices + + integer(i8) :: & + flx_thru, & ! flux (m^3/s) that continues downstream + global_halo_sum ! global sum of water flux in halo cells + + real(dp), dimension(nx,ny):: & + bwatflx_dp, bwatflx_halo_dp, bwatflx_refreeze_dp ! temporary dp versions of i8 arrays + + ! Initialize fluxes + bwatflx_halo = 0 + bwatflx_refreeze = 0 + + ! loop from high to low values on the local processor + do k = nlocal, 1, -1 + + ! Get i and j indices of current cell + i = sorted_ij(k,1) + j = sorted_ij(k,2) + + if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then + + ! Distribute the flux to downstream neighbors. + ! Note: If btemp_weight = 1 everwhere, there is no refreezing. 
+ flx_thru = bwatflx(i,j) * btemp_weight(i,j) + bwatflx_refreeze(i,j) = bwatflx(i,j) * (1 - btemp_weight(i,j)) + do jj = -1,1 + do ii = -1,1 + ip = i + ii + jp = j + jj + if (flux_fraction(ii,jj,i,j) > 0) then + if (halo_mask(ip,jp) == 1) then + bwatflx_halo(ii,jj,i,j) = flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to halo, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + elseif (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to neighbor, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + endif + endif ! flux_fraction > 0 + enddo ! ii + enddo ! jj + endif ! bwat_mask = 1, bwatflx > 0 + enddo ! loop from high to low + + ! Accumulate the fluxes in the output arrays + bwatflx_accum = bwatflx_accum + bwatflx + bwatflx_refreeze_accum = bwatflx_refreeze_accum + bwatflx_refreeze + + ! Compute the total bwatflx in halo cells + do j = 1, ny + do i = 1, nx + sum_bwatflx_halo(i,j) = sum(bwatflx_halo(:,:,i,j)) + enddo + enddo + global_halo_sum = parallel_global_sum(sum_bwatflx_halo, parallel) + + ! If bwatflx_halo = 0 everywhere, then we are done. + ! Where bwatflx_halo is nonzero, communicate it to the neighboring task. + ! It will be routed further downstream the next time this subroutine is called. + + if (global_halo_sum > 0) then + + if (verbose_bwat .and. count <= 2) then + if (this_rank == rtest) write(iulog,*) 'Before halo update, global_halo_sum (m^3/s):', global_halo_sum + endif + + ! Reset bwatflx to zero for the halo transfer + bwatflx = 0 + + ! Communicate bmltflx_halo to the halo cells of neighboring processors + call parallel_halo(bwatflx_halo(:,:,:,:), parallel) + + ! 
bmltflx_halo is now available in the halo cells of the local processor. + ! Route downslope to the adjacent locally owned cells. + ! These fluxes will be routed further downstream during the next iteration. + ! Note: This calculation does not use flux_fraction or btemp_weight, so no rescaling is needed at the end. + do j = 2, ny-1 + do i = 2, nx-1 + if (halo_mask(i,j) == 1 .and. sum(bwatflx_halo(:,:,i,j)) > 0) then + do jj = -1,1 + do ii = -1,1 + if (bwatflx_halo(ii,jj,i,j) > 0) then + ip = i + ii + jp = j + jj + if (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx_halo(ii,jj,i,j) + if (verbose_bwat .and. ip==itest .and. jp==jtest .and. this_rank==rtest .and. count <= 2) then + write(iulog,*) 'Nonzero bwatflx from halo, rank, i, j:', & + this_rank, ip, jp, bwatflx_halo(ii,jj,i,j) + endif + endif + endif ! bwatflx_halo > 0 to a local cell + enddo ! ii + enddo ! jj + endif ! bwatflx_halo > 0 from this halo cell + enddo ! i + enddo ! j + + else + + finished = .true. ! no water in halo cells to route further + + endif ! global_halo_sum > 0 + + end subroutine route_flux_to_margin_or_halo_integer8 + !============================================================== !+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1608,8 +1976,8 @@ end subroutine get_flux_fraction ! They are a GPL-licenced replacement for the Numerical Recipes routine indexx. ! They are not derived from any NR code, but are based on a quicksort routine by ! Michael Lamont (http://linux.wku.edu/~lamonml/kb.html), originally written - ! in C, and issued under the GNU General Public License. The conversion to - ! Fortran 90, and modification to do an index sort was done by Ian Rutt. + ! in C, and issued under the GNU General Public License. Ian Rutt did the conversion + ! to Fortran 90 and modified the algorithm to do an index sort. ! 
!+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index fb6796f7..099f4fd7 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -2285,7 +2285,6 @@ subroutine glissade_velo_higher_solve(model, & vbas(:,:) = vvel(nz,:,:) endif -!! if (verbose_beta) then if (verbose_beta .and. counter==1) then if (this_rank == rtest) write(iulog,*) 'Before calcbeta, counter =', counter call point_diag(usrf, 'usrf (m)', itest, jtest, rtest, 7, 7) @@ -2322,10 +2321,6 @@ subroutine glissade_velo_higher_solve(model, & whichbeta_limit, & itest = itest, jtest = jtest, rtest = rtest) - if (verbose_basal) then - call point_diag(beta_internal, 'After calcbeta, beta', itest, jtest, rtest, 7, 7, '(f12.0)') - endif - ! if (verbose_beta) then ! maxbeta = maxval(beta_internal(:,:)) ! maxbeta = parallel_reduce_max(maxbeta) @@ -2409,16 +2404,11 @@ subroutine glissade_velo_higher_solve(model, & !WHL - debug - BFB check if (verbose_reprosum .and. counter == 1) then - if (main_task) write(iulog,*) 'Write out matrices after assemble_stiffness_matrix' - call write_array_to_file(Auu_2d, 21, 'global_Auu1', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(Auv_2d, 22, 'global_Auv1', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(Avu_2d, 23, 'global_Avu1', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(Avv_2d, 24, 'global_Avv1', parallel, write_binary = .true., cycle_indices = .true.) - endif - - !WHL - debug - BFB check -!! if (0 == 1) then - if (verbose_reprosum) then +!! if (main_task) write(iulog,*) 'Write out matrices after assemble_stiffness_matrix' +!! call write_array_to_file(Auu_2d, 21, 'global_Auu1', parallel, write_binary = .true., cycle_indices = .true.) +!! 
call write_array_to_file(Auv_2d, 22, 'global_Auv1', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avu_2d, 23, 'global_Avu1', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avv_2d, 24, 'global_Avv1', parallel, write_binary = .true., cycle_indices = .true.) sum_Auu(:) = parallel_global_sum_stagger(Auu_2d, nNodeNeighbors_2d, parallel) sum_Auv(:) = parallel_global_sum_stagger(Auv_2d, nNodeNeighbors_2d, parallel) sum_Avu(:) = parallel_global_sum_stagger(Avu_2d, nNodeNeighbors_2d, parallel) @@ -2552,36 +2542,6 @@ subroutine glissade_velo_higher_solve(model, & call point_diag(beta_eff_y, 'beta_eff_y', itest, jtest, rtest, 7, 7, '(e10.3)') endif - if (verbose_reprosum) then - sum_omega = parallel_global_sum_stagger(omega, parallel) - sum_stag_omega = parallel_global_sum_stagger(stag_omega, parallel) - sum_betax = parallel_global_sum_stagger(beta_eff_x, parallel) - sum_betay = parallel_global_sum_stagger(beta_eff_y, parallel) - if (main_task) then - call double_to_binary(sum_omega, binary_str) - write(iulog,*) 'Before bc_2d: sum_omega, binary_str:', sum_omega, binary_str - call double_to_binary(sum_stag_omega, binary_str) - write(iulog,*) ' sum_stag_omega, binary_str:', sum_stag_omega, binary_str - call double_to_binary(sum_betax, binary_str) - write(iulog,*) ' sum_betax, binary_str:', sum_betax, binary_str - call double_to_binary(sum_betay, binary_str) - write(iulog,*) ' sum_betay, binary_str:', sum_betay, binary_str - endif - if (this_rank == rtest) then - do j = jtest-2, jtest+2 - do i = itest-2, itest+2 - if (i >= staggered_ilo .and. i <= staggered_ihi .and. & - j >= staggered_jlo .and. j <= staggered_jhi) then - call double_to_binary(stag_omega(i,j), binary_str) - write(iulog,*) 'i, j, stag_omega:', i, j, binary_str - call double_to_binary(beta_eff_x(i,j), binary_str) - write(iulog,*) ' beta_eff_x:', i, j, binary_str - endif - enddo - enddo - endif - endif ! 
verbose_reprosum - if (diva_slope_correction) then ! Incorporate basal sliding boundary conditions with basal curvature, @@ -2698,14 +2658,14 @@ subroutine glissade_velo_higher_solve(model, & !WHL - debug - Write all the matrix elements and rhs elements (in binary form) to files if (verbose_reprosum .and. counter == 1) then - if (main_task) write(iulog,*) 'Write out matrices after adding BC' +!! if (main_task) write(iulog,*) 'Write out matrices after adding BC' !! call write_array_to_file(Auu_2d(:,:,5), 30, 'global_Auu2', parallel) ! diagonal terms only - call write_array_to_file(Auu_2d, 31, 'global_Auu2', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(Auv_2d, 32, 'global_Auv2', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(Avu_2d, 33, 'global_Avu2', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(Avv_2d, 34, 'global_Avv2', parallel, write_binary = .true., cycle_indices = .true.) - call write_array_to_file(bu_2d, 35, 'global_bu2', parallel, write_binary = .true.) - call write_array_to_file(bv_2d, 36, 'global_bv2', parallel, write_binary = .true.) +!! call write_array_to_file(Auu_2d, 31, 'global_Auu2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Auv_2d, 32, 'global_Auv2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avu_2d, 33, 'global_Avu2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avv_2d, 34, 'global_Avv2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(bu_2d, 35, 'global_bu2', parallel, write_binary = .true.) +!! call write_array_to_file(bv_2d, 36, 'global_bv2', parallel, write_binary = .true.) 
endif !--------------------------------------------------------------------------- @@ -8630,11 +8590,7 @@ subroutine basal_sliding_bc_2d_diva(& if (whichassemble_beta == HO_ASSEMBLE_BETA_LOCAL) then - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(0,0,0) - else ! 2D problem - m = indxA_2d(0,0) - endif + m = indxA_2d(0,0) ! Average the lower ice surface elevation to vertices call glissade_stagger(& @@ -8656,16 +8612,16 @@ subroutine basal_sliding_bc_2d_diva(& call parallel_halo(theta_basal_slope_y, parallel) if (verbose_basal) then - call point_diag(theta_basal_slope_x*180.d0/pi, 'theta_basal_slope_x (deg)', itest, jtest, rtest, 7, 7, '(f10.0)') - call point_diag(theta_basal_slope_y*180.d0/pi, 'theta_basal_slope_y (deg)', itest, jtest, rtest, 7, 7, '(f10.0)') + call point_diag(theta_basal_slope_x*180.d0/pi, 'theta_basal_slope_x (deg)', itest, jtest, rtest, 7, 7) + call point_diag(theta_basal_slope_y*180.d0/pi, 'theta_basal_slope_y (deg)', itest, jtest, rtest, 7, 7) endif ! Sum over active vertices do j = 1, ny-1 do i = 1, nx-1 if (active_vertex(i,j)) then - Auu(i,j,m) = Auu(i,j,m) + dx*dy/vol0 * beta_eff_x(i,j) / cos(theta_basal_slope_x(i,j)) - Avv(i,j,m) = Avv(i,j,m) + dx*dy/vol0 * beta_eff_y(i,j) / cos(theta_basal_slope_y(i,j)) + Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta_eff_x(i,j) / cos(theta_basal_slope_x(i,j)) + Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta_eff_y(i,j) / cos(theta_basal_slope_y(i,j)) endif ! active_vertex enddo ! i enddo ! j @@ -8802,20 +8758,6 @@ subroutine basal_sliding_bc_2d_diva(& endif ! whichassemble_beta - if (verbose_basal .and. this_rank==rtest) then - i = itest - j = jtest - if (nNeighbors == nNodeNeighbors_3d) then ! 
3D problem - m = indxA_3d(0,0,0) - else - m = indxA_2d(0,0) - endif - write(iulog,*) ' ' - write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m - write(iulog,*) 'New Auu diagonal:', Auu(i,j,m) - write(iulog,*) 'New Avv diagonal:', Avv(i,j,m) - endif - end subroutine basal_sliding_bc_2d_diva !**************************************************************************** @@ -10871,7 +10813,7 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (verbose_matrix) then global_maxdiff = parallel_reduce_max(maxdiff) - if (maxdiff == global_maxdiff) then + if (global_maxdiff > 0.0d0 .and. maxdiff == global_maxdiff) then ! maxdiff is on this processor; compute and broadcast the global index call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) write(iulog,*) 'Max asymmetry =', global_maxdiff @@ -11077,7 +11019,7 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & if (verbose_matrix) then global_maxdiff = parallel_reduce_max(maxdiff) - if (maxdiff == global_maxdiff) then + if (global_maxdiff > 0.0d0 .and. maxdiff == global_maxdiff) then ! maxdiff is on this processor; compute and broadcast the global index call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) write(iulog,*) 'Max asymmetry =', global_maxdiff From 329fc87d051ccb301bddf6a73c3e3cf5f8f93ccc Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Wed, 14 Jan 2026 16:48:41 -0700 Subject: [PATCH 09/21] Define two distinct temperature scales for basal hydrology This commit replaces btemp_scale, a temperature scale in the flux-routing hydrology, with two distinct scales that have different functions. The two new scales are (1) btemp_flow_scale and (2) btemp_freeze_scale. btemp_flow_scale is used to weigh the likelihood of alternative downstream paths, as computed in subroutine get_flux_fraction. The scale determines a weighting factor: btemp_weight_flow = exp(-delta_Tb/btemp_flow_scale), where delta_Tb is the difference between PMP temperature and bed temperature. 
A low value of btemp_weight_flow means that water is more likely to be routed to a neighboring grid cell instead of passing through a frozen one. btemp_freeze_scale is used to compute how much of the incoming water flux refreezes locally rather than passing through. The weight is computed as: btemp_weight_freeze = exp(-delta_Tb/btemp_freeze_scale). A low value of btemp_weight_freeze means that more water refreezes in place. For both scales, a larger value makes the hydrology less sensitive to bed temperature. A zero value (the default) means that the scaling is ignored, and the flow is independent of bed temperature. Since the refreezing option is not supported for reproducible sums, I will try running the hydrology with the flow scale alone. If this works well, the refreezing option might be removed later. --- libglide/glide_setup.F90 | 19 ++-- libglide/glide_types.F90 | 6 +- libglissade/glissade.F90 | 3 +- libglissade/glissade_basal_water.F90 | 156 +++++++++++++++------------ 4 files changed, 110 insertions(+), 74 deletions(-) diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90 index 682ff56d..af481071 100644 --- a/libglide/glide_setup.F90 +++ b/libglide/glide_setup.F90 @@ -3310,7 +3310,8 @@ subroutine handle_basal_hydro(section, model) ! flux routing call GetValue(section, 'ho_flux_routing_scheme', model%basal_hydro%ho_flux_routing_scheme) call GetValue(section, 'const_source', model%basal_hydro%const_source) - call GetValue(section, 'btemp_scale', model%basal_hydro%btemp_scale) + call GetValue(section, 'btemp_flow_scale', model%basal_hydro%btemp_flow_scale) + call GetValue(section, 'btemp_freeze_scale', model%basal_hydro%btemp_freeze_scale) ! 
effective pressure options and parameters call GetValue(section, 'effecpress_delta', model%basal_hydro%effecpress_delta) @@ -3391,11 +3392,11 @@ subroutine print_basal_hydro(model) model%basal_hydro%ho_flux_routing_scheme = HO_FLUX_ROUTING_D8 call write_log(message) endif - if (model%basal_hydro%btemp_scale > 0.0d0) then + if (model%basal_hydro%btemp_freeze_scale > 0.0d0) then write(message,*) 'With reproducible sums, the flux-routing does not support refreezing;' // & - 'setting btemp_scale = 0' + ' setting btemp_freeze_scale = 0' call write_log(message) - model%basal_hydro%btemp_scale = 0.0d0 + model%basal_hydro%btemp_freeze_scale = 0.0d0 endif endif ! reproducible sums write(message,*) 'ho_flux_routing_scheme : ',model%basal_hydro%ho_flux_routing_scheme, & @@ -3405,8 +3406,14 @@ subroutine print_basal_hydro(model) write(message,*) 'constant melt source at the bed (m/yr): ', model%basal_hydro%const_source call write_log(message) endif - if (model%basal_hydro%btemp_scale > 0.0d0) then - write(message,*) 'temp scale (deg C) for frz/thaw transition: ', model%basal_hydro%btemp_scale + if (model%basal_hydro%btemp_flow_scale > 0.0d0) then + write(message,*) 'temp scale (deg C) for flow around frozen bed: ', & + model%basal_hydro%btemp_flow_scale + call write_log(message) + endif + if (model%basal_hydro%btemp_freeze_scale > 0.0d0) then + write(message,*) 'temp scale (deg C) for refreezing at the bed: ', & + model%basal_hydro%btemp_freeze_scale call write_log(message) endif if (model%options%which_ho_effecpress == HO_EFFECPRESS_BWAT) then diff --git a/libglide/glide_types.F90 b/libglide/glide_types.F90 index 2d001f06..0731616b 100644 --- a/libglide/glide_types.F90 +++ b/libglide/glide_types.F90 @@ -2158,7 +2158,11 @@ module glide_types !TODO - Add visc_water and omega_hydro? 
Currently set in glissade_basal_water module real(dp) :: const_source = 0.0d0 !> constant melt source at the bed (m/yr) !> could be used to represent an englacial or surface source - real(dp) :: btemp_scale = 0.0d0 !> temperature scale (degC) for transition between thawed and frozen bed + real(dp) :: btemp_flow_scale = 0.0d0 !> temperature scale (degC) for transition between thawed and frozen bed; + !> used to route flow away from cells with a frozen bed; + !> btemp_scale = 0 => temperature-independent flow + real(dp) :: btemp_freeze_scale = 0.0d0 !> temperature scale (degC) for transition between thawed and frozen bed; + !> used to refreeze water beneath cells with a frozen bed; !> btemp_scale = 0 => temperature-independent flow ! parameters for macroporous sheet real(dp) :: bwat_threshold = 1.0d-3 !> scale over which N ramps down from overburden to a small value (m) diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index ceae1895..be44b235 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -1941,7 +1941,8 @@ subroutine glissade_thermal_solve(model, dt) floating_mask, & model%basal_hydro%bmlt_hydro, & ! m/s model%temper%bpmp - model%temper%btemp_ground, & ! degC - model%basal_hydro%btemp_scale, & ! degC + model%basal_hydro%btemp_flow_scale, & ! degC + model%basal_hydro%btemp_freeze_scale, & ! degC model%basal_hydro%bwatflx, & ! m/s model%basal_hydro%bwat_diag, & ! m model%temper%bhydroflx, & ! W/m2 diff --git a/libglissade/glissade_basal_water.F90 b/libglissade/glissade_basal_water.F90 index 8f9e03ef..68ef2d3f 100644 --- a/libglissade/glissade_basal_water.F90 +++ b/libglissade/glissade_basal_water.F90 @@ -185,7 +185,9 @@ subroutine glissade_bwat_flux_routing(& thklim, & bwat_mask, floating_mask, & bmlt_hydro, & - delta_Tb, btemp_scale, & + delta_Tb, & + btemp_flow_scale, & + btemp_freeze_scale, & bwatflx, bwat_diag, & bhydroflx, & head, grad_head, & @@ -222,9 +224,16 @@ subroutine glissade_bwat_flux_routing(& delta_Tb ! 
difference T_pmp - T_bed (degC) real(dp), intent(in) :: & - thklim, & ! minimum ice thickness for basal melt and hydropotential calculations (m) + thklim ! minimum ice thickness for basal melt and hydropotential calculations (m) ! Note: This is typically model%geometry%thklim_temp - btemp_scale ! temperature scale for transition from frozen to thawed bed (degC) + + ! Note: These scales ensure a smooth transition in behavior between frozen and thawed beds. + ! Both scales are computed in a similar way, but they apply to different parts of the algorithm. + ! TODO: Decide whether to keep both scales. Only the flow scale works for reprosums, so we might want + ! to remove the freeze scale. + real(dp), intent(in) :: & + btemp_flow_scale, & ! temperature scale for routing water flow around cells with a frozen bed (deg C) + btemp_freeze_scale ! temperature scale for refreezing water beneath cells with a frozen bed (degC) integer, dimension(nx,ny), intent(in) :: & bwat_mask, & ! mask to identify cells through which basal water is routed; @@ -344,7 +353,8 @@ subroutine glissade_bwat_flux_routing(& head, & bmlt_hydro, & delta_Tb, & - btemp_scale, & + btemp_flow_scale, & + btemp_freeze_scale, & bwat_mask, & bwatflx, & bwatflx_refreeze, & @@ -468,7 +478,8 @@ subroutine route_basal_water(& head, & bmlt_hydro, & delta_Tb, & - btemp_scale, & + btemp_flow_scale, & + btemp_freeze_scale, & bwat_mask, & bwatflx, & bwatflx_refreeze, & @@ -507,8 +518,9 @@ subroutine route_basal_water(& delta_Tb ! difference T_pmp - T_bed (degC) real(dp), intent(in) :: & - btemp_scale ! temperature scale for transition from frozen to thawed bed (degC) - ! If btemp_scale = 0, assume no temperature dependence + btemp_flow_scale, & ! temperature scale for routing water flow around cells with a frozen bed (deg C) + btemp_freeze_scale ! temperature scale for refreezing water beneath cells with a frozen bed (degC) + ! 
If scale = 0, assume no temperature dependence real(dp), dimension(nx,ny), intent(inout) :: & head ! hydraulic head (m) @@ -543,7 +555,8 @@ subroutine route_basal_water(& real(dp), dimension(nx,ny) :: & head_filled, & ! head after depressions are filled (m) - btemp_weight, & ! temperature-dependent weighting factor, favoring flow where the bed is thawed + btemp_weight_flow, & ! temp-dependent weighting factor, forcing flow around cells with frozen beds + btemp_weight_freeze, & ! temp-dependent weighting factor, favoring refreezing in cells with frozen beds bwatflx_accum, & ! water flux through the cell (m^3/s) accumulated over multiple iterations bwatflx_refreeze_accum ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations @@ -563,7 +576,7 @@ subroutine route_basal_water(& ! The following i8 variables are for computing reproducible sums integer(i8), dimension(nx,ny) :: & bwatflx_int, & ! water flux through a grid cell (m^3/s) - btemp_weight_int, & ! temperature-dependent weighting factor, favoring flow where the bed is thawed + btemp_weight_freeze_int, & ! temp-dependent weighting factor, favoring refreezing in cells with frozen beds bwatflx_accum_int, & ! water flux through the cell (m^3/s) accumulated over multiple iterations bwatflx_refreeze_accum_int ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations @@ -660,22 +673,33 @@ subroutine route_basal_water(& enddo enddo - ! Compute a temperature-dependent weighting factor for flux routing. - ! This is used in two parts of the code: - ! (1) In subroutine get_flux_fraction, btemp_weight is used to weight potential downstream paths. - ! A small value of btemp_weight means that a cell is less likely to receive water from upstream. - ! (2) When water enters a frozen cell (delta_Tb > 0), btemp_weight is used to determine - ! how much of the flux is refrozen in place rather than passing through. - ! A low value of btemp_weight means that less water passes through. - ! 
Note: For reproducible sums, refreezing is not supported; must have btemp_weight = 1 everywhere. - - btemp_weight = 1.0d0 + ! Compute temperature-dependent weighting factors for flux routing. + ! There are two scales with related but distinct functions: + ! (1) In subroutine get_flux_fraction, btemp_flow_scale is used to weigh potential downstream paths. + ! A low value of btemp_weight_flow means that water is less likely to pass through. + ! (2) When water enters a frozen cell (delta_Tb > 0), btemp_freeze_scale determines + ! how much of the flux refreezes in place rather than passing through. + ! A small value of btemp_weight_freeze means that more water refreezes, and less passes through. + ! Note: For reproducible sums, refreezing is not supported; must have btemp_weight_freeze = 1 everywhere. + ! TODO: Possibly remove btemp_freeze_scale and just keep btemp_flow_scale. + + btemp_weight_flow = 1.0d0 + if (btemp_flow_scale > 0.0d0) then + if (.not. reprosum) then + where (bwat_mask == 1) + where (delta_Tb > 0.0d0) + btemp_weight_flow = exp(-delta_Tb/btemp_flow_scale) + endwhere + endwhere + endif + endif - if (btemp_scale > 0.0d0) then + btemp_weight_freeze = 1.0d0 + if (btemp_freeze_scale > 0.0d0) then if (.not. reprosum) then where (bwat_mask == 1) where (delta_Tb > 0.0d0) - btemp_weight = exp(-delta_Tb/btemp_scale) + btemp_weight_freeze = exp(-delta_Tb/btemp_freeze_scale) endwhere endwhere endif @@ -683,7 +707,8 @@ subroutine route_basal_water(& if (verbose_bwat) then call point_diag(delta_Tb, 'Tpmp - Tb', itest, jtest, rtest, 7, 7) - call point_diag(btemp_weight, 'btemp_weight', itest, jtest, rtest, 7, 7) + call point_diag(btemp_weight_flow, 'btemp_weight_flow', itest, jtest, rtest, 7, 7) + call point_diag(btemp_weight_freeze, 'btemp_weight_freeze', itest, jtest, rtest, 7, 7) endif ! Compute the fraction of the incoming flux sent to each downstream neighbor. 
@@ -695,7 +720,7 @@ subroutine route_basal_water(& flux_routing_scheme, & sorted_ij, & head, & - btemp_weight, & + btemp_weight_flow, & bwat_mask, & flux_fraction) @@ -745,12 +770,12 @@ subroutine route_basal_water(& ! to a single cell. flux_fraction_int(:,:,:,:) = nint(flux_fraction(:,:,:,:), i8) - ! Convert btemp_weight to i8 - btemp_weight_int(:,:) = 1 + ! Convert btemp_weight_freeze to i8 + btemp_weight_freeze_int(:,:) = 1 ! Note: Can round up to 1 and down to 0 by uncommenting the following line. ! However, a mix of 1's and 0's leads to oscillations in basal temperature, - ! so it is safer to turn off refreezing by setting btemp_weight = 1 everywhere. -! btemp_weight_int(:,:) = nint(btemp_weight(:,:), i8) + ! so it is safer to turn off refreezing by setting btemp_weight_freeze = 1 everywhere. +! btemp_weight_freeze_int(:,:) = nint(btemp_weight_freeze(:,:), i8) ! Initialize other arrays bwatflx_accum_int = 0 @@ -776,17 +801,17 @@ subroutine route_basal_water(& ! Note: The fluxes are scaled by factor_bwatflx call route_flux_to_margin_or_halo(& - nx, ny, nlocal, & + nx, ny, nlocal, & itest, jtest, rtest, count, & - parallel, & - sorted_ij, & - local_mask, & - halo_mask, & - bwat_mask, & - flux_fraction_int, & - btemp_weight_int, & - bwatflx_int, & - bwatflx_accum_int, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction_int, & + btemp_weight_freeze_int, & + bwatflx_int, & + bwatflx_accum_int, & bwatflx_refreeze_accum_int, & finished) @@ -808,7 +833,7 @@ subroutine route_basal_water(& halo_mask, & bwat_mask, & flux_fraction, & - btemp_weight, & + btemp_weight_freeze, & bwatflx, & bwatflx_accum, & bwatflx_refreeze_accum, & @@ -1334,7 +1359,7 @@ subroutine get_flux_fraction(& flux_routing_scheme, & sorted_ij, & head, & - btemp_weight, & + btemp_weight_flow, & bwat_mask, & flux_fraction) @@ -1364,7 +1389,7 @@ subroutine get_flux_fraction(& real(dp), dimension(nx,ny), intent(in) :: & head, & ! hydraulic head (m) - btemp_weight ! 
temperature-dependent weighting factor, favoring flow where the bed is thawed + btemp_weight_flow ! temperature-dependent weighting factor, forcing flow around cells with frozen beds integer, dimension(nx,ny), intent(in) :: & bwat_mask ! = 1 for cells in the region where basal water fluxes can be nonzero @@ -1423,7 +1448,7 @@ subroutine get_flux_fraction(& jp = j + jj if (ip >= 1 .and. ip <= nx .and. jp > 1 .and. jp <= ny) then if (head(ip,jp) < head(i,j)) then - slope(ii,jj) = btemp_weight(ip,jp) * (head(i,j) - head(ip,jp)) / dists(ii,jj) + slope(ii,jj) = btemp_weight_flow(ip,jp) * (head(i,j) - head(ip,jp)) / dists(ii,jj) endif endif endif @@ -1594,7 +1619,7 @@ subroutine route_flux_to_margin_or_halo_real8(& halo_mask, & bwat_mask, & flux_fraction, & - btemp_weight, & + btemp_weight_freeze, & bwatflx, & bwatflx_accum, & bwatflx_refreeze_accum, & @@ -1635,7 +1660,7 @@ subroutine route_flux_to_margin_or_halo_real8(& ! 1st two indices give relative location of receiving cell real(dp), dimension(nx,ny), intent(in) :: & - btemp_weight ! temperature-dependent weighting factor, favoring flow where the bed is thawed + btemp_weight_freeze ! temperature-dependent weighting factor, favoring refreezing at frozen beds real(dp), dimension(nx,ny), intent(inout) :: & bwatflx, & ! on input: water flux (m^3/s) to be routed to the margin or halo @@ -1678,10 +1703,10 @@ subroutine route_flux_to_margin_or_halo_real8(& if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then ! Distribute the flux to downstream neighbors. - ! Based on the temperature-dependent weighting factor btemp_weight, all or part of the flux + ! Based on the temperature-dependent weighting factor btemp_weight_freeze, all or part of the flux ! is refrozen in place instead of being routed downstream. 
- flx_thru = bwatflx(i,j) * btemp_weight(i,j) - bwatflx_refreeze(i,j) = bwatflx(i,j) * (1.0d0 - btemp_weight(i,j)) + flx_thru = bwatflx(i,j) * btemp_weight_freeze(i,j) + bwatflx_refreeze(i,j) = bwatflx(i,j) * (1.0d0 - btemp_weight_freeze(i,j)) do jj = -1,1 do ii = -1,1 ip = i + ii @@ -1772,18 +1797,18 @@ end subroutine route_flux_to_margin_or_halo_real8 !============================================================== subroutine route_flux_to_margin_or_halo_integer8(& - nx, ny, nlocal, & + nx, ny, nlocal, & itest, jtest, rtest, count, & - parallel, & - sorted_ij, & - local_mask, & - halo_mask, & - bwat_mask, & - flux_fraction, & - btemp_weight, & - bwatflx, & - bwatflx_accum, & - bwatflx_refreeze_accum, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction, & + btemp_weight_freeze, & + bwatflx, & + bwatflx_accum, & + bwatflx_refreeze_accum, & finished) ! Given the input bwatflx, route the water downstream, keeping track of fluxes along the way. @@ -1815,13 +1840,13 @@ subroutine route_flux_to_margin_or_halo_integer8(& halo_mask, & ! = 1 for the layer of halo cells adjacent to locally owned cells, else = 0 bwat_mask ! = 1 for cells through which basal water is routed; excludes floating and ocean cells - ! Note: Both flux_fraction and btemp_weight are constrained to be 0 or 1. + ! Note: Both flux_fraction and btemp_weight_freeze are constrained to be 0 or 1. ! This means that the routing is limited to D8 (all the flux goes to one downstream cell), - ! and partial refreezing is not allowed (i.e., btemp_weight = 1 everywhere). - ! Thus, btemp_weight is not needed, but I kept it to keep the code similar to the subroutine above. + ! and partial refreezing is not allowed (i.e., btemp_weight_freeze = 1 everywhere). + ! Thus, btemp_weight_freeze is not needed, but I kept it to keep the code similar to the subroutine above. ! We could make refreezing all-or-nothing (i.e., weights of either 0 or 1), but this leads to ! 
oscillations in bed temperature. - ! I thought of rescaling flux_fraction and btemp_weight to largish i8 integers (e.g., 1000) + ! I thought of rescaling flux_fraction and btemp_weight_freeze to largish i8 integers (e.g., 1000) ! to keep everything BFB, and then scaling back at the end. The problem is that this subroutine ! may need to be called repeatedly, and each scaling would lead to larger and larger integers ! that eventually exceed the i8 limit on integer size, ~10^(19). @@ -1832,7 +1857,7 @@ subroutine route_flux_to_margin_or_halo_integer8(& ! 1st two indices give relative location of receiving cell integer(i8), dimension(nx,ny), intent(in) :: & - btemp_weight ! temperature-dependent weighting factor, favoring flow where the bed is thawed + btemp_weight_freeze ! temperature-dependent weighting factor, favoring refreezing at frozen beds integer(i8), dimension(nx,ny), intent(inout) :: & bwatflx, & ! on input: water flux (m^3/s * factor_bwatflx) to be routed to the margin or halo @@ -1879,9 +1904,9 @@ subroutine route_flux_to_margin_or_halo_integer8(& if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then ! Distribute the flux to downstream neighbors. - ! Note: If btemp_weight = 1 everwhere, there is no refreezing. - flx_thru = bwatflx(i,j) * btemp_weight(i,j) - bwatflx_refreeze(i,j) = bwatflx(i,j) * (1 - btemp_weight(i,j)) + ! Note: If btemp_weight_freeze = 1 everwhere, there is no refreezing. + flx_thru = bwatflx(i,j) * btemp_weight_freeze(i,j) + bwatflx_refreeze(i,j) = bwatflx(i,j) * (1 - btemp_weight_freeze(i,j)) do jj = -1,1 do ii = -1,1 ip = i + ii @@ -1937,7 +1962,6 @@ subroutine route_flux_to_margin_or_halo_integer8(& ! bmltflx_halo is now available in the halo cells of the local processor. ! Route downslope to the adjacent locally owned cells. ! These fluxes will be routed further downstream during the next iteration. - ! Note: This calculation does not use flux_fraction or btemp_weight, so no rescaling is needed at the end. 
do j = 2, ny-1 do i = 2, nx-1 if (halo_mask(i,j) == 1 .and. sum(bwatflx_halo(:,:,i,j)) > 0) then From b1bf020e2158cc8a01f40dd5c9bf17ee59db6adc Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Fri, 16 Jan 2026 18:51:54 -0700 Subject: [PATCH 10/21] Changed the initialization logic for C_c and C_p This commit makes a small change in the way coulomb_c and powerlaw_c are initialized. The old logic, called from glissade_initialise after input files are read in, was to set C_c to a constant on initialization based on the following logic: if (model%options%which_ho_coulomb_c == HO_COULOMB_C_CONSTANT .or. & model%options%is_restart == NO_RESTART) then if (model%options%elevation_based_coulomb_c) then [stuff] else model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const endif endif And similarly for powerlaw_c. This is problematic for CESM coupled runs, which may read C_c from an input file created by a previous CISM standalone spinup. When the CESM run is not a restart run, the logic above sets C_c to a constant instead of using the values from the input file. I changed the logic so that (1) For option HO_COULOMB_C_CONSTANT, coulomb_c is set to coulomb_c_const. (2) For elevation-based coulomb_c, check whether coulomb_c_hi or coulomb_c_lo = 0 everywhere. If so, these fields are set to prescribed constants, and then elevation-based coulomb_c is computed. (3) For coulomb_c not based on elevation, check whether coulomb_c = 0 everywhere. If so, coulomb_c is set to coulomb_c_const. In this way, we avoid overwriting coulomb_c (or coulomb_c_hi and coulomb_c_lo) if these fields have already been read in. 
--- libglissade/glissade.F90 | 53 +++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index be44b235..2f3acb4f 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -943,7 +943,7 @@ subroutine glissade_initialise(model, evolve_ice) !TODO: Have a single option that is applied with or without glaciers enabled? if (model%options%which_ho_powerlaw_c == HO_POWERLAW_C_CONSTANT .or. & - model%options%is_restart == NO_RESTART) then + parallel_is_zero(model%basal_physics%powerlaw_c)) then if (model%options%enable_glaciers .and. & model%glacier%set_powerlaw_c /= GLACIER_POWERLAW_C_CONSTANT) then ! do nothing; see note above @@ -953,15 +953,29 @@ subroutine glissade_initialise(model, evolve_ice) endif ! Initialize coulomb_c - ! If inverting for coulomb_c, we read in the saved coulomb_c field on restart. - if (model%options%which_ho_coulomb_c == HO_COULOMB_C_CONSTANT .or. & - model%options%is_restart == NO_RESTART) then + ! Note: If inverting for coulomb_c, then coulomb_c is initialized here. + ! On restart, however, the saved coulomb_c (or alternatively, + ! coulomb_c_hi and coulomb_c_lo, for the elevation-based option) + ! should have been read from the restart file and is not reset here. - !TODO - Make sure the initialization is correct when reading from an external file in a restart. - if (model%options%elevation_based_coulomb_c) then + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_CONSTANT) then - model%basal_physics%coulomb_c_hi = model%basal_physics%coulomb_c_const_hi - model%basal_physics%coulomb_c_lo = model%basal_physics%coulomb_c_const_lo + model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const + + else ! either inverting for coulomb_c or reading values from an input file + + if (model%options%elevation_based_coulomb_c) then ! need coulomb_c_hi and coulomb_c_lo + + if (parallel_is_zero(model%basal_physics%coulomb_c_hi) .or. 
& + parallel_is_zero(model%basal_physics%coulomb_c_lo)) then + + ! initialize to constants + model%basal_physics%coulomb_c_hi = model%basal_physics%coulomb_c_const_hi + model%basal_physics%coulomb_c_lo = model%basal_physics%coulomb_c_const_lo + + endif + + ! Given coulomb_c_hi and coulomb_c_lo, compute coulomb_c based on elevation call glissade_elevation_based_coulomb_c(& model%general%ewn, model%general%nsn, & @@ -973,17 +987,24 @@ subroutine glissade_initialise(model, evolve_ice) model%basal_physics%coulomb_c_bed_hi, & model%basal_physics%coulomb_c) - call parallel_halo(model%basal_physics%coulomb_c, parallel) + else ! coulomb_c not elevation-based + + if (parallel_is_zero(model%basal_physics%coulomb_c)) then + + ! initialize to constant + model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const - if (verbose_inversion) then - call point_diag(model%basal_physics%coulomb_c, 'Initial coulomb_c', itest, jtest, rtest, 7, 7) endif - else - model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const + endif ! elevation-based + + call parallel_halo(model%basal_physics%coulomb_c, parallel) + + if (verbose_inversion) then + call point_diag(model%basal_physics%coulomb_c, 'Initial coulomb_c', itest, jtest, rtest, 7, 7) endif - endif + endif ! coulomb_c options ! Optionally, do initial calculations for inversion ! 
At the start of the run (but not on restart), this might lead to further thickness adjustments, @@ -1771,9 +1792,6 @@ subroutine glissade_thermal_solve(model, dt) use glissade_therm, only: glissade_therm_driver use glissade_basal_water, only: glissade_calcbwat, glissade_bwat_flux_routing use glissade_masks, only: glissade_get_masks - !WHL - debug - use cism_parallel, only: parallel_reduce_max - use glissade_utils, only: write_array_to_file implicit none @@ -2446,6 +2464,7 @@ subroutine glissade_calving_solve(model, init_calving) use glissade_masks, only: glissade_get_masks, glissade_ocean_connection_mask, & glissade_calving_front_mask use glissade_grounding_line, only: glissade_grounded_fraction + implicit none type(glide_global_type), intent(inout) :: model ! model instance From 656981788a06c86e8ffcb9d19b5584d54f890575 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Sat, 17 Jan 2026 10:35:15 -0700 Subject: [PATCH 11/21] Reprosum abort logic for tridiagonal preconditioners This commit adds logic to abort the run if the user requests local tridiagonal or global tridiagonal preconditioning along with reproducible sums. I looked at both preconditioners and concluded that it's not feasible to make either of them independent of the number of processors. For the global tridiag preconditioner, the issue is that the algorithm computes coefficients that connect adjacent processors on a given row or column. If the block layout is different, then these coefficients will be different and answers will differ at roundoff level, even though we are solving the same global equations. For local tridiag preconditioner, a different block layout means that we are solving an entirely different set of tridiagonal systems, ensuring that answers will differ. With reprosums enabled, the user will therefore need to choose the simpler diagonal preconditioner (or the SIA preconditioner for the 3D solvers). 
This could result in slower throughput, but not enough slower to be a big problem, especially in CESM coupled runs where the ice sheet model is much cheaper than other components. I also fixed a minor bug in the subroutine that computes glacier AARs. I introduced the bug when I inserted calls to the parallel_global_sum_patch subroutine. The AARs are now correct again. This fix doesn't change prognostic results, since the AAR is just a diagnostic. --- libglide/glide_setup.F90 | 9 ++++++++- libglissade/glissade_glacier.F90 | 8 ++++---- libglissade/glissade_velo_higher_pcg.F90 | 3 ++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90 index af481071..28f73706 100644 --- a/libglide/glide_setup.F90 +++ b/libglide/glide_setup.F90 @@ -2197,8 +2197,15 @@ subroutine print_options(model) if (model%options%whichdycore == DYCORE_GLISSADE .and. & (model%options%which_ho_sparse == HO_SPARSE_PCG_STANDARD .or. & model%options%which_ho_sparse == HO_SPARSE_PCG_CHRONGEAR) ) then + if (model%options%reproducible_sums) then + if (model%options%which_ho_precond == HO_PRECOND_TRIDIAG_LOCAL .or. & + model%options%which_ho_precond == HO_PRECOND_TRIDIAG_GLOBAL) then + call write_log ('Tridiagonal preconditioners are not supported with reproducible sums.') + call write_log ('Please choose a different preconditioner (e.g., diagonal)', GM_FATAL) + endif + endif write(message,*) 'ho_whichprecond : ',model%options%which_ho_precond, & - ho_whichprecond(model%options%which_ho_precond) + ho_whichprecond(model%options%which_ho_precond) call write_log(message) if (model%options%which_ho_precond < 0 .or. 
model%options%which_ho_precond >= size(ho_whichprecond)) then call write_log('Error, glissade preconditioner out of range', GM_FATAL) diff --git a/libglissade/glissade_glacier.F90 b/libglissade/glissade_glacier.F90 index 3d6d2641..ee9e1657 100644 --- a/libglissade/glissade_glacier.F90 +++ b/libglissade/glissade_glacier.F90 @@ -3362,20 +3362,20 @@ subroutine glacier_accumulation_area_ratio(& ! Note: Grid cells with SMB = 0 are not counted in either zone. where (cism_glacier_id > 0 .and. smb > 0.0d0) - glacier_id = 1 + glacier_id = cism_glacier_id elsewhere glacier_id = 0 endwhere - accum_area = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id, parallel) + accum_area = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) where (cism_glacier_id > 0 .and. smb < 0.0d0) - glacier_id = 1 + glacier_id = cism_glacier_id elsewhere glacier_id = 0 endwhere - ablat_area = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id, parallel) + ablat_area = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) ! Compute the AAR for each glacier diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 6d162af7..2c2cc482 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -2341,7 +2341,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & zu(i,j) = 0.d0 endif if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) + zv(i,j) = rv(i,j) / Adiagv(i,j) else zv(i,j) = 0.d0 endif @@ -3821,6 +3821,7 @@ subroutine setup_preconditioner_tridiag_global_2d(ilocal, jlocal, & !WHL - debug if (verbose_tridiag .and. 
this_rank == rtest) then + write(iulog,*) write(iulog,*) 'In setup_preconditioner_tridiag_global_2d: itest, jtest, rtest =', itest, jtest, rtest endif From d4758c71f8a0322e8453c82d153983fbca2de441 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Sat, 17 Jan 2026 11:04:40 -0700 Subject: [PATCH 12/21] Removed infnan logic from the reprosum module Pat Worley's original code uses the shared module shr_infnan_mod. This module supports logic in the reprosum algorithm to ignore infinities and NaNs in the input arrays. When implementing reprosums in CISM, I carried along a CISM version of this module, cism_infnan_mod. However, CISM arrays passed to the reprosum module should not contain infinities or NaNs, and I would prefer not to carry along a module that is never used. This commit removes cism_infnan_mod.F90, along with the infnan-related logic and use statements in cism_reprosum_mod. Tony Craig did the same thing for the CICE reprosum module: https://github.com/ESCOMP/CICE/blob/main/cicecore/cicedyn/infrastructure/ comm/mpi/ice_reprosum.F90 The CISM version of the reprosum module still contains some subroutines that aren't now called from CISM. I am leaving these subroutines in place in case they turn out to be useful later. --- libglimmer/cism_infnan_mod.F90 | 423 ------------------------------- libglimmer/cism_reprosum_mod.F90 | 188 ++------------ libglimmer/parallel_mpi.F90 | 4 - 3 files changed, 21 insertions(+), 594 deletions(-) delete mode 100644 libglimmer/cism_infnan_mod.F90 diff --git a/libglimmer/cism_infnan_mod.F90 b/libglimmer/cism_infnan_mod.F90 deleted file mode 100644 index 124b051f..00000000 --- a/libglimmer/cism_infnan_mod.F90 +++ /dev/null @@ -1,423 +0,0 @@ -! Flag representing compiler support of Fortran 2003's -! ieee_arithmetic intrinsic module. -#if defined CPRIBM || defined CPRPGI || defined CPRINTEL || defined CPRCRAY || defined CPRNAG -#define HAVE_IEEE_ARITHMETIC -#endif - -!WHL, Nov. 
2025: Adapted from shr_infnan_mod.F90, part of CESM shared code -! Changed 'shr' to 'cism' to avoid name conflicts with shared code -! I kept only the r8 interfaces (not r4, i8 or i4). -! Also, I assumed that all input arrays are 1d. - -module cism_infnan_mod -!--------------------------------------------------------------------- -! Module to test for IEEE Inf and NaN values, which also provides a -! method of setting +/-Inf and signaling or quiet NaN. -! -! All functions are elemental, and thus work on arrays. -!--------------------------------------------------------------------- -! To test for these values, just call the corresponding function, e.g: -! -! var_is_nan = cism_infnan_isnan(x) -! -! You can also use it on arrays: -! -! array_contains_nan = any(cism_infnan_isnan(my_array)) -! -!--------------------------------------------------------------------- -! To generate these values, assign one of the provided derived-type -! variables to a real: -! -! use cism_infnan_mod, only: nan => cism_infnan_nan, & -! inf => cism_infnan_inf, & -! assignment(=) -! real(r4) :: my_nan -! real(r8) :: my_inf_array(2,2) -! my_nan = nan -! my_inf_array = inf -! -! Keep in mind that "cism_infnan_nan" and "cism_infnan_inf" cannot be -! passed to functions that expect real arguments. To pass a real -! NaN, you will have to use cism_infnan_nan to set a local real of -! the correct kind. -!--------------------------------------------------------------------- - - use glimmer_global, only: r4 => sp, r8 => dp - use glimmer_global, only: i4, i8 -!!use shr_kind_mod, only: & -!! r4 => SHR_KIND_R4, & -!! r8 => SHR_KIND_R8 - -#ifdef HAVE_IEEE_ARITHMETIC - -! If we have IEEE_ARITHMETIC, the NaN test is provided for us. -use, intrinsic :: ieee_arithmetic, only: & - cism_infnan_isnan => ieee_is_nan - -#else - -! Integers of correct size for bit patterns below. -!!use shr_kind_mod, only: i4 => shr_kind_i4, i8 => shr_kind_i8 - -#endif - -implicit none -private -save - -! 
Test functions for NaN/Inf values. -public :: cism_infnan_isnan -public :: cism_infnan_isinf -public :: cism_infnan_isposinf -public :: cism_infnan_isneginf - -! Locally defined isnan. -#ifndef HAVE_IEEE_ARITHMETIC -interface cism_infnan_isnan - ! TYPE double,real - module procedure cism_infnan_isnan_r8 -end interface -#endif - -interface cism_infnan_isinf - ! TYPE double,real - module procedure cism_infnan_isinf_r8 -end interface - -interface cism_infnan_isposinf - ! TYPE double,real - module procedure cism_infnan_isposinf_r8 -end interface - -interface cism_infnan_isneginf - ! TYPE double,real - module procedure cism_infnan_isneginf_r8 -end interface - -! Derived types for generation of NaN/Inf -! Even though there's no reason to "use" the types directly, some compilers -! might have trouble with an object being used without its type. -public :: cism_infnan_nan_type -public :: cism_infnan_inf_type -public :: assignment(=) -public :: cism_infnan_to_r4 -public :: cism_infnan_to_r8 - -! Type representing Not A Number. -type :: cism_infnan_nan_type - logical :: quiet = .false. -end type cism_infnan_nan_type - -! Type representing +/-Infinity. -type :: cism_infnan_inf_type - logical :: positive = .true. -end type cism_infnan_inf_type - -! Allow assigning reals to NaN or Inf. -interface assignment(=) - ! TYPE double,real - ! DIMS 0,1,2,3,4,5,6,7 - module procedure set_nan_1d_r8 - ! TYPE double,real - ! DIMS 0,1,2,3,4,5,6,7 - module procedure set_inf_1d_r8 -end interface - -! Conversion functions. -interface cism_infnan_to_r8 - module procedure nan_r8 - module procedure inf_r8 -end interface - -interface cism_infnan_to_r4 - module procedure nan_r4 - module procedure inf_r4 -end interface - -! Initialize objects of NaN/Inf type for other modules to use. - -! Default NaN is signaling, but also provide snan and qnan to choose -! explicitly. -type(cism_infnan_nan_type), public, parameter :: cism_infnan_nan = & - cism_infnan_nan_type(.false.) 
-type(cism_infnan_nan_type), public, parameter :: cism_infnan_snan = & - cism_infnan_nan_type(.false.) -type(cism_infnan_nan_type), public, parameter :: cism_infnan_qnan = & - cism_infnan_nan_type(.true.) - -! Default Inf is positive, but provide posinf to go with neginf. -type(cism_infnan_inf_type), public, parameter :: cism_infnan_inf = & - cism_infnan_inf_type(.true.) -type(cism_infnan_inf_type), public, parameter :: cism_infnan_posinf = & - cism_infnan_inf_type(.true.) -type(cism_infnan_inf_type), public, parameter :: cism_infnan_neginf = & - cism_infnan_inf_type(.false.) - -! Bit patterns for implementation without ieee_arithmetic. -! Note that in order to satisfy gfortran's range check, we have to use -! ibset to set the sign bit from a BOZ pattern. -#ifndef HAVE_IEEE_ARITHMETIC -! Single precision. -integer(i4), parameter :: ssnan_pat = int(Z'7FA00000',i4) -integer(i4), parameter :: sqnan_pat = int(Z'7FC00000',i4) -integer(i4), parameter :: sposinf_pat = int(Z'7F800000',i4) -integer(i4), parameter :: sneginf_pat = ibset(sposinf_pat,bit_size(1_i4)-1) -! Double precision. -integer(i8), parameter :: dsnan_pat = int(Z'7FF4000000000000',i8) -integer(i8), parameter :: dqnan_pat = int(Z'7FF8000000000000',i8) -integer(i8), parameter :: dposinf_pat = int(Z'7FF0000000000000',i8) -integer(i8), parameter :: dneginf_pat = ibset(dposinf_pat,bit_size(1_i8)-1) -#endif - -contains - -!--------------------------------------------------------------------- -! TEST FUNCTIONS -!--------------------------------------------------------------------- -! The "isinf" function simply calls "isposinf" and "isneginf". -!--------------------------------------------------------------------- - -! TYPE double,real -elemental function cism_infnan_isinf_r8(x) result(isinf) - real(r8), intent(in) :: x - logical :: isinf - - isinf = cism_infnan_isposinf(x) .or. 
cism_infnan_isneginf(x) - -end function cism_infnan_isinf_r8 - -#ifdef HAVE_IEEE_ARITHMETIC - -!--------------------------------------------------------------------- -! The "isposinf" and "isneginf" functions get the IEEE class of a -! real, and test to see if the class is equal to ieee_positive_inf -! or ieee_negative_inf. -!--------------------------------------------------------------------- - -! TYPE double,real -elemental function cism_infnan_isposinf_r8(x) result(isposinf) - use, intrinsic :: ieee_arithmetic, only: & - ieee_class, & - ieee_positive_inf, & - operator(==) - real(r8), intent(in) :: x - logical :: isposinf - - isposinf = (ieee_positive_inf == ieee_class(x)) - -end function cism_infnan_isposinf_r8 - -! TYPE double,real -elemental function cism_infnan_isneginf_r8(x) result(isneginf) - use, intrinsic :: ieee_arithmetic, only: & - ieee_class, & - ieee_negative_inf, & - operator(==) - real(r8), intent(in) :: x - logical :: isneginf - - isneginf = (ieee_negative_inf == ieee_class(x)) - -end function cism_infnan_isneginf_r8 - -#else -! Don't have ieee_arithmetic. - -!!#ifdef CPRGNU !WHL - Assume this is true -! NaN testing on gfortran. -! TYPE double,real -elemental function cism_infnan_isnan_r8(x) result(is_nan) - real(r8), intent(in) :: x - logical :: is_nan - - is_nan = isnan(x) - -end function cism_infnan_isnan_r8 -! End GNU section. -!!#endif - -!--------------------------------------------------------------------- -! The "isposinf" and "isneginf" functions just test against a known -! bit pattern if we don't have ieee_arithmetic. -!--------------------------------------------------------------------- - -! TYPE double,real -elemental function cism_infnan_isposinf_r8(x) result(isposinf) - real(r8), intent(in) :: x - logical :: isposinf -!!#if ({ITYPE} == TYPEREAL) -!! 
integer(i4), parameter :: posinf_pat = sposinf_pat -!!#else - integer(i8), parameter :: posinf_pat = dposinf_pat -!!#endif - - isposinf = (x == transfer(posinf_pat,x)) - -end function cism_infnan_isposinf_r8 - -! TYPE double,real -elemental function cism_infnan_isneginf_r8(x) result(isneginf) - real(r8), intent(in) :: x - logical :: isneginf -!!#if ({ITYPE} == TYPEREAL) -!! integer(i4), parameter :: neginf_pat = sneginf_pat -!!#else - integer(i8), parameter :: neginf_pat = dneginf_pat -!!#endif - - isneginf = (x == transfer(neginf_pat,x)) - -end function cism_infnan_isneginf_r8 - -! End ieee_arithmetic conditional. -#endif - -!--------------------------------------------------------------------- -! GENERATION FUNCTIONS -!--------------------------------------------------------------------- -! Two approaches for generation of NaN and Inf values: -! 1. With Fortran 2003, use the ieee_value intrinsic to get a value -! from the corresponding class. These are: -! - ieee_signaling_nan -! - ieee_quiet_nan -! - ieee_positive_inf -! - ieee_negative_inf -! 2. Without Fortran 2003, set the IEEE bit patterns directly. -! Use BOZ literals to get an integer with the correct bit -! pattern, then use "transfer" to transfer those bits into a -! real. -!--------------------------------------------------------------------- - -! TYPE double,real -! DIMS 0,1,2,3,4,5,6,7 -pure subroutine set_nan_1d_r8(output, nan) -#ifdef HAVE_IEEE_ARITHMETIC - use, intrinsic :: ieee_arithmetic, only: & - ieee_signaling_nan, & - ieee_quiet_nan, & - ieee_value -#else -!!#if ({ITYPE} == TYPEREAL) -!! integer(i4), parameter :: snan_pat = ssnan_pat -!! integer(i4), parameter :: qnan_pat = sqnan_pat -!!#else - integer(i8), parameter :: snan_pat = dsnan_pat - integer(i8), parameter :: qnan_pat = dqnan_pat -!!#endif -#endif -!! real(r8), intent(out) :: output{DIMSTR} - real(r8), intent(out) :: output - type(cism_infnan_nan_type), intent(in) :: nan - - ! 
Use scalar temporary for performance reasons, to reduce the cost of - ! the ieee_value call. - real(r8) :: tmp - -#ifdef HAVE_IEEE_ARITHMETIC - if (nan%quiet) then - tmp = ieee_value(tmp, ieee_quiet_nan) - else - tmp = ieee_value(tmp, ieee_signaling_nan) - end if -#else - if (nan%quiet) then - tmp = transfer(qnan_pat, tmp) - else - tmp = transfer(snan_pat, tmp) - end if -#endif - - output = tmp - -end subroutine set_nan_1d_r8 - -! TYPE double,real -! DIMS 0,1,2,3,4,5,6,7 -pure subroutine set_inf_1d_r8(output, inf) -#ifdef HAVE_IEEE_ARITHMETIC - use, intrinsic :: ieee_arithmetic, only: & - ieee_positive_inf, & - ieee_negative_inf, & - ieee_value -#else -!!#if ({ITYPE} == TYPEREAL) -!! integer(i4), parameter :: posinf_pat = sposinf_pat -!! integer(i4), parameter :: neginf_pat = sneginf_pat -!!#else - integer(i8), parameter :: posinf_pat = dposinf_pat - integer(i8), parameter :: neginf_pat = dneginf_pat -!!#endif -#endif -!! real(r8), intent(out) :: output{DIMSTR} - real(r8), intent(out) :: output - type(cism_infnan_inf_type), intent(in) :: inf - - ! Use scalar temporary for performance reasons, to reduce the cost of - ! the ieee_value call. - real(r8) :: tmp - -#ifdef HAVE_IEEE_ARITHMETIC - if (inf%positive) then - tmp = ieee_value(tmp,ieee_positive_inf) - else - tmp = ieee_value(tmp,ieee_negative_inf) - end if -#else - if (inf%positive) then - tmp = transfer(posinf_pat, tmp) - else - tmp = transfer(neginf_pat, tmp) - end if -#endif - - output = tmp - -end subroutine set_inf_1d_r8 - -!--------------------------------------------------------------------- -! CONVERSION INTERFACES. -!--------------------------------------------------------------------- -! Function methods to get reals from nan/inf types. -!--------------------------------------------------------------------- - -pure function nan_r8(nan) result(output) - class(cism_infnan_nan_type), intent(in) :: nan - real(r8) :: output - -!! 
output = nan - !WHL kluge - output = 0._r8 - -end function nan_r8 - -pure function nan_r4(nan) result(output) - class(cism_infnan_nan_type), intent(in) :: nan - real(r4) :: output - -!! output = nan - !WHL kluge - output = 0._r8 - -end function nan_r4 - -pure function inf_r8(inf) result(output) - class(cism_infnan_inf_type), intent(in) :: inf - real(r8) :: output - -!! output = inf - !WHL kluge - output = 0._r8 - -end function inf_r8 - -pure function inf_r4(inf) result(output) - class(cism_infnan_inf_type), intent(in) :: inf - real(r4) :: output - -!! output = inf - !WHL kluge - output = 0._r8 - -end function inf_r4 - -end module cism_infnan_mod diff --git a/libglimmer/cism_reprosum_mod.F90 b/libglimmer/cism_reprosum_mod.F90 index ec202273..1b4161a8 100644 --- a/libglimmer/cism_reprosum_mod.F90 +++ b/libglimmer/cism_reprosum_mod.F90 @@ -36,6 +36,10 @@ module cism_reprosum_mod ! I started from a version that includes some logic fixes and code cleanup ! done by Pat Worley in 2023. Pat's revised version differs from the code ! in the CESM repo as of Jan. 2026. +! See here for information on Pat's mods: +! * https://github.com/E3SM-Project/E3SM/pull/5534 +! * https://github.com/E3SM-Project/E3SM/pull/5549 +! * https://github.com/E3SM-Project/E3SM/pull/5560 !------------------------------------------------------------------------ !------------------------------------------------------------------------ @@ -62,11 +66,6 @@ module cism_reprosum_mod use glimmer_global, only: r8 => dp use glimmer_global, only: i8 use glimmer_paramets, only: iulog - use cism_infnan_mod,only: cism_infnan_inf_type, assignment(=), & - cism_infnan_posinf, cism_infnan_neginf, & - cism_infnan_nan, & - cism_infnan_isnan, cism_infnan_isinf, & - cism_infnan_isposinf, cism_infnan_isneginf use profile, only: t_startf, t_stopf !WHL - replace with perf_mod? ! 
end WHL mods @@ -116,8 +115,6 @@ module cism_reprosum_mod !--------------------------------------------------------------------- logical :: repro_sum_use_ddpdd = .false. - logical :: repro_sum_allow_infnan = .false. - !WHL - Should this code be declared? ! Not sure what EAMXX_STANDALONE means #ifdef EAMXX_STANDALONE @@ -143,7 +140,6 @@ end subroutine cism_reprosumx86_fix_end !======================================================================== ! subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & - repro_sum_allow_infnan_in, & repro_sum_rel_diff_max_in, & repro_sum_recompute_in, & repro_sum_master, & @@ -156,10 +152,6 @@ subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & !------------------------------Arguments--------------------------------- ! Use DDPDD algorithm instead of integer vector algorithm logical, intent(in), optional :: repro_sum_use_ddpdd_in - ! Allow INF or NaN in summands - logical, intent(in), optional :: repro_sum_allow_infnan_in - ! maximum permissible difference between reproducible and - ! nonreproducible sums real(r8), intent(in), optional :: repro_sum_rel_diff_max_in ! recompute using different algorithm when difference between ! 
reproducible and nonreproducible sums is too great @@ -202,9 +194,6 @@ subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & if ( present(repro_sum_use_ddpdd_in) ) then repro_sum_use_ddpdd = repro_sum_use_ddpdd_in endif - if ( present(repro_sum_allow_infnan_in) ) then - repro_sum_allow_infnan = repro_sum_allow_infnan_in - endif if ( present(repro_sum_rel_diff_max_in) ) then cism_reprosum_reldiffmax = repro_sum_rel_diff_max_in endif @@ -222,14 +211,6 @@ subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & 'distributed sum algorithm' endif - if ( repro_sum_allow_infnan ) then - write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& - 'Will calculate sum when INF or NaN are included in summands' - else - write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& - 'Will abort if INF or NaN are included in summands' - endif - if (cism_reprosum_reldiffmax >= 0.0_r8) then write(logunit,*) ' ',& 'with a maximum relative error tolerance of ', & @@ -256,7 +237,7 @@ end subroutine cism_reprosum_setopts ! subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & - nflds, allow_infnan, ddpdd_sum, & + nflds, ddpdd_sum, & arr_gbl_max, arr_gbl_max_out, & arr_max_levels, arr_max_levels_out, & gbl_max_nsummands, gbl_max_nsummands_out,& @@ -505,10 +486,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & ! use ddpdd algorithm instead ! of integer vector algorithm - logical, intent(in), optional :: allow_infnan - ! if .true., allow INF or NaN input values. - ! if .false. (the default), then abort. - real(r8), intent(in), optional :: arr_gbl_max(nflds) ! upper bound on max(abs(arr)) @@ -561,8 +538,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & ! ! Local workspace ! - logical :: abort_inf_nan ! flag indicating whether to - ! abort if INF or NaN found in input logical :: use_ddpdd_sum ! flag indicating whether to ! use cism_reprosum_ddpdd or not logical :: recompute ! 
flag indicating need to @@ -571,17 +546,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & logical :: validate ! flag indicating need to ! verify gmax and max_levels ! are accurate/sufficient - logical :: nan_check, inf_check ! flag on whether there are - ! NaNs and INFs in input array - logical :: inf_nan_lchecks(3,nflds)! flags on whether there are - ! NaNs, positive INFs, or negative INFs - ! for each input field locally - logical :: inf_nan_gchecks(3,nflds)! flags on whether there are - ! NaNs, positive INFs, or negative INFs - ! for each input field - logical :: arr_gsum_infnan(nflds) ! flag on whether field sum is a - ! NaN or INF - integer :: gbl_lor_red ! global lor reduction? (0/1) integer :: gbl_max_red ! global max reduction? (0/1) integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1) @@ -589,8 +553,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & integer :: repro_sum_both ! both fast and slow? (0/1) integer :: nonrepro_sum ! nonrepro_sum? (0/1) - integer :: nan_count, inf_count ! local count of NaNs and INFs in - ! input array integer :: omp_nthreads ! number of OpenMP threads integer :: mpi_comm ! MPI subcommunicator integer :: mypid ! MPI task ID (COMM_WORLD) @@ -664,85 +626,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & ! call t_barrierf('sync_repro_sum',mpi_comm) #endif -! Check whether should abort if input contains NaNs or INFs - abort_inf_nan = .not. repro_sum_allow_infnan - if ( present(allow_infnan) ) then - abort_inf_nan = .not. allow_infnan - endif - -! With Fujitsu always abort on NaNs or INFs in input -#ifdef CPRFJ - abort_inf_nan = .true. -#endif - -#ifndef EAMXX_STANDALONE - call t_startf('cism_reprosum_INF_NaN_Chk') -#endif - -! Initialize flags to indicate that no NaNs or INFs are present in the input data - inf_nan_gchecks = .false. - arr_gsum_infnan = .false. 
- - !TODO - Remove the inf_nan option; assume abort_inf_nan = T, as in CICE - if (abort_inf_nan) then - -! Check whether input contains NaNs or INFs, and abort if so - nan_check = any(cism_infnan_isnan(arr)) - inf_check = any(cism_infnan_isinf(arr)) - - if (nan_check .or. inf_check) then - - nan_count = count(cism_infnan_isnan(arr)) - inf_count = count(cism_infnan_isinf(arr)) - - if ((nan_count > 0) .or. (inf_count > 0)) then - call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) -! write(s_logunit,37) real(nan_count,r8), real(inf_count,r8), mypid - write(iulog,37) real(nan_count,r8), real(inf_count,r8), mypid -37 format("cism_REPROSUM_CALC: Input contains ",e12.5, & - " NaNs and ", e12.5, " INFs on MPI task ", i7) - !WHL mod -! call shr_sys_abort("shr_reprosum_calc ERROR: NaNs or INFs in input") - write(iulog,*) 'cism_reprosum_calc ERROR: NaNs or INFs in input' - call mpi_abort(MPI_COMM_WORLD, 1001, ierr) - !end WHL mod - endif - - endif - -#ifndef CPRFJ - else - -! Determine whether any fields contain NaNs or INFs, and avoid processing them -! via integer expansions - inf_nan_lchecks = .false. - - do ifld=1,nflds - inf_nan_lchecks(1,ifld) = any(cism_infnan_isnan(arr(:,ifld))) - inf_nan_lchecks(2,ifld) = any(cism_infnan_isposinf(arr(:,ifld))) - inf_nan_lchecks(3,ifld) = any(cism_infnan_isneginf(arr(:,ifld))) - end do -#ifndef EAMXX_STANDALONE - call t_startf("repro_sum_allr_lor") -#endif - call mpi_allreduce (inf_nan_lchecks, inf_nan_gchecks, 3*nflds, & - MPI_LOGICAL, MPI_LOR, mpi_comm, ierr) - gbl_lor_red = 1 -#ifndef EAMXX_STANDALONE - call t_stopf("repro_sum_allr_lor") -#endif - - do ifld=1,nflds - arr_gsum_infnan(ifld) = any(inf_nan_gchecks(:,ifld)) - enddo -#endif - - endif - -#ifndef EAMXX_STANDALONE - call t_stopf('cism_reprosum_INF_NaN_Chk') -#endif - ! 
Check whether should use cism_reprosum_ddpdd algorithm use_ddpdd_sum = repro_sum_use_ddpdd if ( present(ddpdd_sum) ) then @@ -891,7 +774,7 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & nflds, arr_max_shift, arr_gmax_exp, & arr_max_levels, max_level, extra_levels, & - arr_gsum_infnan, validate, recompute, & + validate, recompute, & omp_nthreads, mpi_comm) ! Record statistics, etc. @@ -943,15 +826,13 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & do ifld=1,nflds arr_exp_tlmin = MAXEXPONENT(1.0_r8) arr_exp_tlmax = MINEXPONENT(1.0_r8) - if (.not. arr_gsum_infnan(ifld)) then - do isum=isum_beg(ithread),isum_end(ithread) - if (arr(isum,ifld) /= 0.0_r8) then - arr_exp = exponent(arr(isum,ifld)) - arr_exp_tlmin = min(arr_exp,arr_exp_tlmin) - arr_exp_tlmax = max(arr_exp,arr_exp_tlmax) - endif - end do - endif + do isum=isum_beg(ithread),isum_end(ithread) + if (arr(isum,ifld) /= 0.0_r8) then + arr_exp = exponent(arr(isum,ifld)) + arr_exp_tlmin = min(arr_exp,arr_exp_tlmin) + arr_exp_tlmax = max(arr_exp,arr_exp_tlmax) + endif + end do arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax end do @@ -981,8 +862,8 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & arr_gmax_exp(:) = -arr_gextremes(1:nflds,1) arr_gmin_exp(:) = arr_gextremes(1:nflds,2) -! If a field is identically zero or contains INFs or NaNs, arr_gmin_exp -! still equals MAXEXPONENT and arr_gmax_exp still equals MINEXPONENT. +! If a field is identically zero, arr_gmin_exp still equals MAXEXPONENT +! and arr_gmax_exp still equals MINEXPONENT. ! 
In this case, set arr_gmin_exp = arr_gmax_exp = MINEXPONENT do ifld=1,nflds arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld)) @@ -1093,7 +974,7 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & nflds, arr_max_shift, arr_gmax_exp, & max_levels, max_level, extra_levels, & - arr_gsum_infnan, validate, recompute, & + validate, recompute, & omp_nthreads, mpi_comm) endif @@ -1122,11 +1003,9 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & !$omp default(shared) & !$omp private(ifld, isum) do ifld=1,nflds - if (.not. arr_gsum_infnan(ifld)) then - do isum=1,nsummands - arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld) - end do - endif + do isum=1,nsummands + arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld) + end do end do #ifndef EAMXX_STANDALONE @@ -1160,25 +1039,6 @@ subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & endif endif -! Set field sums to NaN and INF, as needed - do ifld=1,nflds - if (arr_gsum_infnan(ifld)) then - if (inf_nan_gchecks(1,ifld)) then - ! NaN => NaN - arr_gsum(ifld) = cism_infnan_nan - else if (inf_nan_gchecks(2,ifld) .and. inf_nan_gchecks(3,ifld)) then - ! posINF and negINF => NaN - arr_gsum(ifld) = cism_infnan_nan - else if (inf_nan_gchecks(2,ifld)) then - ! posINF only => posINF - arr_gsum(ifld) = cism_infnan_posinf - else if (inf_nan_gchecks(3,ifld)) then - ! negINF only => negINF - arr_gsum(ifld) = cism_infnan_neginf - endif - endif - end do - ! Return statistics if ( present(repro_sum_stats) ) then repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast @@ -1204,7 +1064,8 @@ end subroutine cism_reprosum_calc subroutine cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, & arr_max_shift, arr_gmax_exp, max_levels, & - max_level, extra_levels, skip_field, & +! 
max_level, extra_levels, skip_field, & + max_level, extra_levels, & validate, recompute, omp_nthreads, mpi_comm ) !------------------------------------------------------------------------ ! @@ -1245,11 +1106,6 @@ subroutine cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, & real(r8), intent(in) :: arr(dsummands,nflds) ! input array - logical, intent(in) :: skip_field(nflds) - ! flag indicating whether the sum for this field should be - ! computed or not (used to skip over fields containing - ! NaN or INF summands) - logical, intent(in) :: validate ! flag indicating that accuracy of solution generated from ! arr_gmax_exp and max_levels should be tested @@ -1399,8 +1255,6 @@ subroutine cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, & not_exact(ifld,ithread) = 0 i8_arr_tlsum_level(:,ifld,ithread) = 0_i8 - if (skip_field(ifld)) cycle - do isum=isum_beg(ithread),isum_end(ithread) arr_remainder = 0.0_r8 diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index 6a941283..0d248e0e 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -9601,8 +9601,6 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) logical :: & ddpdd_sum, & ! use ddpdd algorithm instead of fixed-precision algorithm - allow_infnan, & ! if .true., allow INF or NaN input values; - ! if .false. (the default), then abort if INF or NaNs are present repro_sum_validate ! flag enabling/disabling testing that gmax and max_levels ! are accurate/sufficient. Default is enabled. @@ -9617,7 +9615,6 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) allocate (arr_max_levels_out(nflds)) allocate (rel_diff(2,nflds)) - allow_infnan = .false. ddpdd_sum = .false. repro_sum_validate = .true. @@ -9626,7 +9623,6 @@ subroutine parallel_reduce_reprosum(arr, arr_gsum) call cism_reprosum_calc(& arr, arr_gsum, & nsummands, dsummands, nflds, & - allow_infnan = allow_infnan, & ddpdd_sum = ddpdd_sum, & ! 
arr_gbl_max = arr_gbl_max, & arr_gbl_max_out = arr_gbl_max_out, & From a8b597f4e6c3192d11b6b9b39f28b4d543026e43 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Wed, 21 Jan 2026 20:52:32 -0700 Subject: [PATCH 13/21] Fixed a logic bug in a parallel_sum subroutine This commit fixes a bug in function parallel_global_sum_stagger_real8_3d_nflds. The function now works correctly. I found the bug while running Greenland tests with the BP solver, using various combinations of preconditioners and sparse PCG solvers (e.g., both diagonal and SIA preconditioners, with the standard and Chronopoulos-Gear PCG algorithms). All the BP configurations I tested are now reproducible. I also cleaned up some diagnostics, making the output consistent across related subroutines. --- libglimmer/parallel_mpi.F90 | 31 +--- libglissade/glissade_velo_higher_pcg.F90 | 216 ++++++++++++++++++----- 2 files changed, 177 insertions(+), 70 deletions(-) diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index 0d248e0e..d043cde2 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -6186,12 +6186,6 @@ function parallel_global_sum_real8_2d(a, parallel, mask_2d) own_ewn => parallel%own_ewn, & own_nsn => parallel%own_nsn) - !WHL - debug - if (verbose_reprosum .and. main_task) then -! write(iulog,*) 'In parallel_global_sum_real8_2d, reprosum =', parallel%reprosum -! write(iulog,*) 'nhalo, local ewn, local_nsn =', nhalo, local_ewn, local_nsn - endif - if (present(mask_2d)) then mask = mask_2d else @@ -6229,9 +6223,8 @@ function parallel_global_sum_real8_2d(a, parallel, mask_2d) parallel_global_sum_real8_2d = arr_gsum(1) - !WHL - debug if (verbose_reprosum .and. main_task) then -!! write(iulog,*) 'arr_gsum =', arr_gsum +! write(iulog,*) 'arr_gsum =', arr_gsum endif deallocate(arr) @@ -6331,7 +6324,6 @@ function parallel_global_sum_real8_3d(a, parallel, mask_2d) parallel_global_sum_real8_3d = arr_gsum(1) - !WHL - debug if (verbose_reprosum .and. 
main_task) then ! write(iulog,*) 'arr_gsum =', arr_gsum endif @@ -6465,7 +6457,7 @@ function parallel_global_sum_patch_real8_2d(a, npatch, patch_id, parallel) parallel_global_sum_patch_real8_2d(:) = arr_gsum(:) if (verbose_reprosum .and. main_task) then -!! write(iulog,*) 'arr_gsum =', arr_gsum +! write(iulog,*) 'arr_gsum =', arr_gsum endif deallocate(arr) @@ -6567,7 +6559,6 @@ function parallel_global_sum_stagger_real8_2d(arr1, parallel, arr2) parallel_global_sum_stagger_real8_2d = arr_gsum(1) - !WHL - debug if (verbose_reprosum .and. main_task) then ! write(iulog,*) 'arr_gsum =', arr_gsum endif @@ -6683,7 +6674,6 @@ function parallel_global_sum_stagger_real8_3d(arr1, parallel, arr2) parallel_global_sum_stagger_real8_3d = arr_gsum(1) - !WHL - debug if (verbose_reprosum .and. main_task) then ! write(iulog,*) 'arr_gsum =', arr_gsum endif @@ -6751,7 +6741,6 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) staggered_jlo, staggered_jhi ! variables for computing reproductible sums -!! integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum integer :: nsummands ! dimensions of array passed to parallel_reduce_reprosum integer :: count real(dp), dimension(:,:), allocatable :: arr @@ -6766,7 +6755,6 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) ! Allocate and fill arrays to pass to parallel_reduce_reprosum nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) -! nflds = size(a,3) allocate(arr(nsummands,nflds)) allocate(arr_gsum(nflds)) @@ -6805,7 +6793,6 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) parallel_global_sum_stagger_real8_2d_nflds = arr_gsum(:) - !WHL - debug if (verbose_reprosum .and. main_task) then ! write(iulog,*) 'arr_gsum =', arr_gsum endif @@ -6837,7 +6824,6 @@ function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) enddo ! nflds - ! 
take the global sum parallel_global_sum_stagger_real8_2d_nflds = parallel_reduce_sum(local_sum(:)) endif ! reprosum @@ -6872,7 +6858,6 @@ function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) staggered_jlo, staggered_jhi ! variables for computing reproductible sums -!! integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum integer :: nsummands ! dimensions of array passed to parallel_reduce_reprosum integer :: count real(dp), dimension(:,:), allocatable :: arr @@ -6888,14 +6873,8 @@ function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) if (parallel%reprosum) then ! compute using cism_reprosum_calc - !WHL - debug - if (verbose_reprosum .and. main_task) then - write(iulog,*) ' In global_sum_stagger_real8_3d_nflds' - endif - ! Allocate and fill arrays to pass to parallel_reduce_reprosum nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) * nz -! nflds = size(a,3) allocate(arr(nsummands,nflds)) allocate(arr_gsum(nflds)) @@ -6938,7 +6917,6 @@ function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) parallel_global_sum_stagger_real8_3d_nflds = arr_gsum(:) - !WHL - debug if (verbose_reprosum .and. main_task) then ! write(iulog,*) 'arr_gsum =', arr_gsum endif @@ -6974,10 +6952,9 @@ function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) enddo ! nflds - endif ! reprosum + parallel_global_sum_stagger_real8_3d_nflds = parallel_reduce_sum(local_sum(:)) - ! take the global sum - parallel_global_sum_stagger_real8_3d_nflds = parallel_reduce_sum(local_sum(:)) + endif ! 
reprosum end function parallel_global_sum_stagger_real8_3d_nflds diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 2c2cc482..9f3a8261 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -215,6 +215,19 @@ subroutine pcg_solver_standard_3d(nx, ny, & real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & Muu, Mvv ! simplified SIA matrices for preconditioning + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + !WHL - debug + integer :: iu_max, ju_max, iv_max, jv_max + real(dp) :: ru_max, rv_max + + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (standard)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond @@ -238,7 +251,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & j = jtest write(iulog,*) 'i, j, r =', i, j, this_rank write(iulog,*) 'Auu diag =', Adiagu(:,i,j) - write(iulog,*) 'Avu diag =', Adiagv(:,i,j) + write(iulog,*) 'Avv diag =', Adiagv(:,i,j) endif !TODO - Create a separate setup for tridiag_local @@ -254,6 +267,18 @@ subroutine pcg_solver_standard_3d(nx, ny, & Auu, Avv, & Muu, Mvv) + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' + do i = staggered_ihi, staggered_ilo, -1 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) + enddo + enddo ! i + endif + else ! no preconditioner if (verbose_pcg .and. main_task) then @@ -461,6 +486,19 @@ subroutine pcg_solver_standard_3d(nx, ny, & rv(:,:,:) = rv(:,:,:) - alpha * qv(:,:,:) call t_stopf("pcg_vecupdate") + if (verbose_pcg .and. 
this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' + do i = itest-3, itest+3 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i + endif + ! Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -468,7 +506,11 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! For convergence check, use r = b - Ax if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then -!! if (mod(iter, linear_solve_ncheck) == 0 .or. iter == linear_solve_ncheck/2) then + + if (verbose_pcg .and. main_task) then + write(iulog,*) ' ' + write(iulog,*) 'Check convergence, iter =', iter + endif ! Halo update for x @@ -514,28 +556,56 @@ subroutine pcg_solver_standard_3d(nx, ny, & err = L2_resid/L2_rhs if (verbose_pcg .and. main_task) then -! write(iulog,*) ' ' -! write(iulog,*) 'iter, L2_resid, error =', iter, L2_resid, err + write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err + endif + + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + ru_max = 0.d0 + rv_max = 0.d0 + iu_max = 0 + ju_max = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (sum(abs(ru(:,i,j))) > ru_max) then + ru_max = sum(abs(ru(:,i,j))) + iu_max = i + ju_max = j + endif + if (sum(abs(rv(:,i,j))) > rv_max) then + rv_max = sum(abs(rv(:,i,j))) + iv_max = i + jv_max = j + endif + enddo + enddo + write(iulog,*) 'r, i, j, ru_max:', this_rank, iu_max, ju_max, ru_max + write(iulog,*) 'r, i, j, rv_max:', this_rank, iv_max, jv_max, rv_max endif + ! If converged, then exit the loop. + ! Note: Without good preconditioning, convergence can be slow, + ! but the solution after maxiters_chrongear might be good enough. 
+ if (err < tolerance) then niters = iter + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver has converged, iter =', niters + write(iulog,*) ' ' + endif exit iter_loop - endif + elseif (iter == maxiters) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance + write(iulog,*) ' ' + endif + endif endif ! linear_solve_ncheck enddo iter_loop -!WHL - Without good preconditioning, convergence can be slow, but the solution after maxiters might be good enough. - - if (niters == maxiters) then - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Glissade PCG solver not yet converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance - endif - endif - end subroutine pcg_solver_standard_3d !**************************************************************************** @@ -659,6 +729,19 @@ subroutine pcg_solver_standard_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector b ! solver converges when L2_resid/L2_rhs < tolerance + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + !WHL - debug + integer :: iu_max, ju_max, iv_max, jv_max + real(dp) :: ru_max, rv_max + + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (standard)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond @@ -875,7 +958,6 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! For convergence check, use r = b - Ax if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then -!! if (mod(iter, linear_solve_ncheck) == 0 .or. iter == linear_solve_ncheck/2) then ! Halo update for x @@ -920,24 +1002,53 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! 
compute normalized error err = L2_resid/L2_rhs + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + ru_max = 0.d0 + rv_max = 0.d0 + iu_max = 0 + ju_max = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (abs(ru(i,j)) > ru_max) then + ru_max = ru(i,j) + iu_max = i + ju_max = j + endif + if (abs(rv(i,j)) > rv_max) then + rv_max = rv(i,j) + iv_max = i + jv_max = j + endif + enddo + enddo + write(iulog,*) 'r, i, j, ru_max:', this_rank, iu_max, ju_max, ru_max + write(iulog,*) 'r, i, j, rv_max:', this_rank, iv_max, jv_max, rv_max + endif + + ! If converged, then exit the loop. + ! Note: Without good preconditioning, convergence can be slow, + ! but the solution after maxiters_chrongear might be good enough. + if (err < tolerance) then niters = iter + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver has converged, iter =', niters + write(iulog,*) ' ' + endif exit iter_loop - endif + elseif (iter == maxiters) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance + write(iulog,*) ' ' + endif + endif endif ! linear_solve_ncheck enddo iter_loop -!WHL - Without good preconditioning, convergence can be slow, but the solution after maxiters might be good enough. - - if (niters == maxiters) then - if (verbose_pcg .and. 
main_task) then - write(iulog,*) 'Glissade PCG solver not yet converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance - endif - endif - end subroutine pcg_solver_standard_2d !**************************************************************************** @@ -1256,7 +1367,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & j = jtest write(iulog,*) 'i, j, r =', i, j, this_rank write(iulog,*) 'Auu diag =', Adiagu(:,i,j) - write(iulog,*) 'Avu diag =', Adiagv(:,i,j) + write(iulog,*) 'Avv diag =', Adiagv(:,i,j) endif elseif (precond == HO_PRECOND_SIA) then @@ -1398,6 +1509,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! First pass of algorithm !--------------------------------------------------------------- + iter = 1 + ! Note: The matrix A must be complete for all rows corresponding to locally ! owned nodes, and x must have the correct values in ! halo nodes bordering the locally owned nodes. @@ -1590,16 +1703,25 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) call t_stopf("pcg_vecupdate") + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' + do i = itest-3, itest+3 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i + endif + !--------------------------------------------------------------- ! Iterate to solution !--------------------------------------------------------------- iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - if (verbose_pcg .and. main_task) then -! write(iulog,*) 'iter =', iter - endif - !---- Compute PC(r) = solution z of Mz = r !---- z is correct in halo @@ -1768,11 +1890,12 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (verbose_pcg .and. 
this_rank == rtest) then j = jtest write(iulog,*) ' ' + write(iulog,*) 'iter =', iter write(iulog,*) 'i, k, xu, xv, ru, rv:' do i = itest-3, itest+3 write(iulog,*) ' ' do k = 1, nz - write(iulog,'(i4, 4f16.10)') i, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) enddo enddo ! i endif @@ -1820,13 +1943,17 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") - L2_resid = sqrt(rr) ! L2 norm of residual - err = L2_resid/L2_rhs ! normalized error + ! take square root + L2_resid = sqrt(rr) + + ! compute normalized error + err = L2_resid/L2_rhs if (verbose_pcg .and. main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err endif + !WHL - debug if (verbose_pcg .and. this_rank == rtest) then ru_max = 0.d0 rv_max = 0.d0 @@ -1834,13 +1961,13 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs(sum(ru(:,i,j))) > ru_max) then - ru_max = sum(ru(:,i,j)) + if (sum(abs(ru(:,i,j))) > ru_max) then + ru_max = sum(abs(ru(:,i,j))) iu_max = i ju_max = j endif - if (abs(sum(rv(:,i,j))) > rv_max) then - rv_max = sum(rv(:,i,j)) + if (sum(abs(rv(:,i,j))) > rv_max) then + rv_max = sum(abs(rv(:,i,j))) iv_max = i jv_max = j endif @@ -1861,10 +1988,10 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & write(iulog,*) ' ' endif exit iter_loop - elseif (niters == maxiters_chrongear) then + elseif (iter == maxiters_chrongear) then if (verbose_pcg .and. 
main_task) then - write(iulog,*) 'Glissade PCG solver not yet converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance write(iulog,*) ' ' endif endif @@ -2916,6 +3043,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err endif + !WHL - debug if (verbose_pcg .and. this_rank == rtest) then ru_max = 0.d0 rv_max = 0.d0 @@ -2947,12 +3075,14 @@ niters = iter if (verbose_pcg .and. main_task) then write(iulog,*) 'Glissade PCG solver has converged, iter =', niters + write(iulog,*) ' ' endif exit iter_loop - elseif (niters == maxiters_chrongear) then + elseif (iter == maxiters_chrongear) then if (verbose_pcg .and. main_task) then - write(iulog,*) 'Glissade PCG solver not yet converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance + write(iulog,*) ' ' endif endif From cf3a07494da1655aa00e9a04e26fa980c39edfd8 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Thu, 22 Jan 2026 17:23:01 -0700 Subject: [PATCH 14/21] Enable tridiagonal preconditioners with all PCG solvers Several years ago, I implemented local and global tridiagonal preconditioners in module glissade_velo_higher_pcg. For some reason, I coded and tested them in the Chronopoulos-Gear PCG subroutines (which_ho_sparse = 3) but not the standard PCG subroutines (which_ho_sparse = 2). This commit adds the two tridiagonal preconditioners to both standard PCG subroutines (2d and 3d). I tested the 2d version for DIVA and the 3d version for BP, verifying that answers agree with other preconditioners, within roundoff. 
--- libglissade/glissade_velo_higher.F90 | 3 +- libglissade/glissade_velo_higher_pcg.F90 | 604 ++++++++++++++++++++--- 2 files changed, 544 insertions(+), 63 deletions(-) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 099f4fd7..d68d3253 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -3274,7 +3274,8 @@ subroutine glissade_velo_higher_solve(model, & call pcg_solver_standard_3d(nx, ny, & nz, parallel, & - indxA_3d, active_vertex, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 9f3a8261..4a303b71 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -66,7 +66,8 @@ module glissade_velo_higher_pcg subroutine pcg_solver_standard_3d(nx, ny, & nz, parallel, & - indxA_3d, active_vertex, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -137,7 +138,10 @@ subroutine pcg_solver_standard_3d(nx, ny, & nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + parallel ! info for parallel communication + + integer, dimension(-1:1,-1:1), intent(in) :: & + indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 @@ -188,10 +192,11 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, k ! grid indices - integer :: iA, jA, kA ! grid offsets ranging from -1 to 1 - integer :: m ! matrix element index - integer :: iter ! iteration counter + integer :: i, j, k ! grid indices + integer :: iA, jA, kA ! grid offsets ranging from -1 to 1 + integer :: m ! 
matrix element index + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + integer :: iter ! iteration counter real(dp) :: & eta0, eta1, eta2, &! scalar inner product results @@ -215,6 +220,36 @@ subroutine pcg_solver_standard_3d(nx, ny, & real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & Muu, Mvv ! simplified SIA matrices for preconditioning + ! arrays for tridiagonal preconditioning + ! Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above + real(dp), dimension(:,:), allocatable :: & + Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning + Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning + + real(dp), dimension(:,:), allocatable :: & + omega_u, omega_v, & ! work arrays for tridiagonal solve + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v + + ! Note: These two matrices are global in the EW and NS dimensions, respectively. + ! Each holds 8 pieces of information for each task on each row or column. + ! Since only 2 of these 8 pieces of information change from one iteration to the next, + ! it is more efficient to gather the remaining information once and pass the arrays + ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d + ! and gather all the information every time the subroutine is called. + ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? + + real(dp), dimension(:,:), allocatable :: & + gather_data_row, & ! arrays for gathering data from every task on a row or column + gather_data_col + + integer :: & + tasks_row, & ! number of tasks per row and column for tridiagonal solve + tasks_col + + logical :: first_time ! true on the first subroutine call (iter = 1), false thereafter + integer :: & staggered_ilo, staggered_ihi, & ! 
bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi @@ -228,17 +263,29 @@ subroutine pcg_solver_standard_3d(nx, ny, & staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi + tasks_row = parallel%tasks_row + tasks_col = parallel%tasks_col + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (standard)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond endif + ! Compute array sizes for locally owned vertices + ilocal = staggered_ihi - staggered_ilo + 1 + jlocal = staggered_jhi - staggered_jlo + 1 + ! Set up matrices for preconditioning - !TODO - Add tridiagonal options call t_startf("pcg_precond_init") - if (precond == HO_PRECOND_DIAG) then + if (precond == HO_PRECOND_NONE) then ! no preconditioner + + if (verbose_pcg .and. this_rank == rtest) then + write(iulog,*) 'Using no preconditioner' + endif + + elseif (precond == HO_PRECOND_DIAG) then call setup_preconditioner_diag_3d(nx, ny, & nz, indxA_3d, & @@ -254,12 +301,6 @@ subroutine pcg_solver_standard_3d(nx, ny, & write(iulog,*) 'Avv diag =', Adiagv(:,i,j) endif - !TODO - Create a separate setup for tridiag_local - ! For this setup: Pass in Auu and Avv - ! Return Adiag/subdiag/supdiag for u and v in halo - ! Return omega and denom in halo - ! Then M*z = r can compute z in halo - elseif (precond == HO_PRECOND_SIA) then call setup_preconditioner_sia_3d(nx, ny, & @@ -279,11 +320,79 @@ subroutine pcg_solver_standard_3d(nx, ny, & enddo ! i endif - else ! no preconditioner + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Using no preconditioner' - endif + ! 
Allocate tridiagonal preconditioning matrices + allocate(Adiag_u (nx-1,ny-1)) + allocate(Asubdiag_u(nx-1,ny-1)) + allocate(Asupdiag_u(nx-1,ny-1)) + allocate(omega_u (nx-1,ny-1)) + allocate(denom_u (nx-1,ny-1)) + + allocate(Adiag_v (nx-1,ny-1)) + allocate(Asubdiag_v(nx-1,ny-1)) + allocate(Asupdiag_v(nx-1,ny-1)) + allocate(omega_v (nx-1,ny-1)) + allocate(denom_v (nx-1,ny-1)) + + ! Compute arrays for tridiagonal preconditioning + + call setup_preconditioner_tridiag_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v) + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + + ! Allocate tridiagonal preconditioning matrices + ! Note: (i,j) indices are switched for the A_v matrices to reduce striding. + allocate(Adiag_u (ilocal,jlocal)) + allocate(Asubdiag_u(ilocal,jlocal)) + allocate(Asupdiag_u(ilocal,jlocal)) + allocate(omega_u(ilocal,jlocal)) + allocate(denom_u(ilocal,jlocal)) + allocate(xuh_u(ilocal,jlocal)) + allocate(xlh_u(ilocal,jlocal)) + + allocate(Adiag_v (jlocal,ilocal)) + allocate(Asubdiag_v(jlocal,ilocal)) + allocate(Asupdiag_v(jlocal,ilocal)) + allocate(omega_v(jlocal,ilocal)) + allocate(denom_v(jlocal,ilocal)) + allocate(xuh_v(jlocal,ilocal)) + allocate(xlh_v(jlocal,ilocal)) + + ! These two matrices are for gathering data from all tasks on a given row or column. 
+ allocate(gather_data_row(8*tasks_row,jlocal)) + allocate(gather_data_col(8*tasks_col,ilocal)) + gather_data_row = 0.0d0 + gather_data_col = 0.0d0 + + call setup_preconditioner_tridiag_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + ilocal, jlocal, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v) endif ! precond @@ -393,6 +502,54 @@ subroutine pcg_solver_standard_3d(nx, ny, & active_vertex, & Mvv, rv, zv) ! solve Mvv*zv = rv for zv + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + ! Use a local tridiagonal solver to find an approximate solution of A*z = r + + call tridiag_solver_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + + ! Use a global tridiagonal solver to find an approximate solution of A*z = r + + if (iter == 1) then + first_time = .true. + else + first_time = .false. + endif + + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + first_time, & + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r + endif ! 
precond call t_stopf("pcg_precond") @@ -606,6 +763,17 @@ subroutine pcg_solver_standard_3d(nx, ny, & enddo iter_loop + ! Clean up + if (allocated(Adiag_u)) deallocate(Adiag_u, Adiag_v) + if (allocated(Asubdiag_u)) deallocate(Asubdiag_u, Asubdiag_v) + if (allocated(Asupdiag_u)) deallocate(Asupdiag_u, Asupdiag_v) + if (allocated(omega_u)) deallocate(omega_u, omega_v) + if (allocated(denom_u)) deallocate(denom_u, denom_v) + if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) + if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(gather_data_row)) deallocate(gather_data_row) + if (allocated(gather_data_col)) deallocate(gather_data_col) + end subroutine pcg_solver_standard_3d !**************************************************************************** @@ -707,7 +875,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j ! grid indices + integer :: i, j, ii, jj ! grid indices integer :: iter ! iteration counter real(dp) :: & @@ -729,10 +897,41 @@ subroutine pcg_solver_standard_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector b ! solver converges when L2_resid/L2_rhs < tolerance + ! tridiagonal matrix elements + real(dp), dimension(:,:), allocatable :: & + Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning + Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning + + real(dp), dimension(:,:), allocatable :: & + omega_u, omega_v, & ! work arrays for tridiagonal solve + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v + + real(dp), dimension(:,:), allocatable :: & + b_u, b_v, x_u, x_v + + real(dp), dimension(:,:), allocatable :: & + gather_data_row, & ! arrays for gathering data from every task on a row or column + gather_data_col + + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + + integer :: & + tasks_row, & ! 
number of tasks per row and column for tridiagonal solve + tasks_col + + logical :: first_time ! true on the first subroutine call (iter = 1), false thereafter + integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + !TODO - Apply the following for tridiag PCs? +! integer, parameter :: & +! maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, +! ! which generally leads to faster convergence than diagonal preconditioning + !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max @@ -742,24 +941,160 @@ subroutine pcg_solver_standard_2d(nx, ny, & staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi + tasks_row = parallel%tasks_row + tasks_col = parallel%tasks_col + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (standard)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond endif + ! Compute array sizes for locally owned vertices + ilocal = staggered_ihi - staggered_ilo + 1 + jlocal = staggered_jhi - staggered_jlo + 1 + ! Set up matrices for preconditioning - !TODO - Add tridiagonal option + call t_startf("pcg_precond_init") - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Using diagonal matrix for preconditioning' - endif ! verbose_pcg + if (precond == HO_PRECOND_NONE) then ! no preconditioner + + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Using no preconditioner' + endif + + elseif (precond == HO_PRECOND_DIAG) then + + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Using diagonal matrix for preconditioning' + endif ! verbose_pcg + + call setup_preconditioner_diag_2d(nx, ny, & + indxA_2d, & + Auu, Avv, & + Adiagu, Adiagv) + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + !WHL - debug + if (verbose_tridiag .and. 
this_rank==rtest) then + i = itest + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'r, i, j =', this_rank, i, j + write(iulog,*) 'Auu =', Auu(i,j,:) + write(iulog,*) 'Avv =', Avv(i,j,:) + endif + + allocate(Adiag_u (nx-1,ny-1)) + allocate(Asubdiag_u(nx-1,ny-1)) + allocate(Asupdiag_u(nx-1,ny-1)) + allocate(omega_u (nx-1,ny-1)) + allocate(denom_u (nx-1,ny-1)) + + allocate(Adiag_v (nx-1,ny-1)) + allocate(Asubdiag_v(nx-1,ny-1)) + allocate(Asupdiag_v(nx-1,ny-1)) + allocate(omega_v (nx-1,ny-1)) + allocate(denom_v (nx-1,ny-1)) + + call setup_preconditioner_tridiag_local_2d(& + nx, ny, & + parallel, indxA_2d, & + itest, jtest, rtest, & + Auu, Avv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v) + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + + ! Allocate tridiagonal matrices + ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. + + allocate(Adiag_u (ilocal,jlocal)) + allocate(Asubdiag_u(ilocal,jlocal)) + allocate(Asupdiag_u(ilocal,jlocal)) + allocate(omega_u(ilocal,jlocal)) + allocate(denom_u(ilocal,jlocal)) + allocate(xuh_u(ilocal,jlocal)) + allocate(xlh_u(ilocal,jlocal)) + allocate(b_u(ilocal,jlocal)) + allocate(x_u(ilocal,jlocal)) + + allocate(Adiag_v (jlocal,ilocal)) + allocate(Asubdiag_v(jlocal,ilocal)) + allocate(Asupdiag_v(jlocal,ilocal)) + allocate(omega_v(jlocal,ilocal)) + allocate(denom_v(jlocal,ilocal)) + allocate(xuh_v(jlocal,ilocal)) + allocate(xlh_v(jlocal,ilocal)) + allocate(b_v(jlocal,ilocal)) + allocate(x_v(jlocal,ilocal)) + + ! These two matrices are for gathering data from all tasks on a given row or column. + allocate(gather_data_row(8*tasks_row,jlocal)) + allocate(gather_data_col(8*tasks_col,ilocal)) + gather_data_row = 0.0d0 + gather_data_col = 0.0d0 + + ! Compute the entries of the tridiagonal matrices + + ! 
Extract tridiagonal matrix entries from Auu + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements + Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements + Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the u solve in each matrix row + call setup_preconditioner_tridiag_global_2d(& + ilocal, jlocal, & +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u) + + ! Extract tridiagonal matrix entries from Avv + + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements + Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements + Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the v solve in each matrix column + ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + + call setup_preconditioner_tridiag_global_2d(& + jlocal, ilocal, & +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v) + + endif ! precond + + if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' - call t_startf("pcg_precond_init") - call setup_preconditioner_diag_2d(nx, ny, & - indxA_2d, & - Auu, Avv, & - Adiagu, Adiagv) call t_stopf("pcg_precond_init") ! 
Compute initial residual and initialize the direction vector d @@ -858,6 +1193,143 @@ subroutine pcg_solver_standard_2d(nx, ny, & enddo ! i enddo ! j + elseif(precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local + + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest + write(iulog,*) 'Residual:' + write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) + write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif + + if (verbose_pcg .and. main_task) then + write(iulog,*) 'call tridiag_solver_local_2d' + endif + + ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) + !TODO - Test a local solver that can compute zu and zv in the halo + ! (to avoid the halo update below) + + call tridiag_solver_local_2d(& + nx, ny, & + parallel, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + ru, rv, & ! right hand side + zu, zv) ! solution + + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + + ! Use a global tridiagonal solver to find an approximate solution of A*z = r + if (iter == 1) then + first_time = .true. + else + first_time = .false. 
+ endif + + ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo + + ! Solve M*z = r, where M is a global tridiagonal matrix + + call tridiag_solver_global_2d(& + ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + first_time, & + gather_data_row) + + ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo + + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo + + call tridiag_solver_global_2d(& + jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column + !! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + first_time, & + gather_data_col) + + ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? 
+ call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + endif ! precond call t_stopf("pcg_precond") @@ -1049,6 +1521,19 @@ subroutine pcg_solver_standard_2d(nx, ny, & enddo iter_loop + ! Clean up + if (allocated(Adiag_u)) deallocate(Adiag_u, Adiag_v) + if (allocated(Asubdiag_u)) deallocate(Asubdiag_u, Asubdiag_v) + if (allocated(Asupdiag_u)) deallocate(Asupdiag_u, Asupdiag_v) + if (allocated(omega_u)) deallocate(omega_u, omega_v) + if (allocated(denom_u)) deallocate(denom_u, denom_v) + if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) + if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(b_u)) deallocate(b_u, b_v) + if (allocated(x_u)) deallocate(x_u, x_v) + if (allocated(gather_data_row)) deallocate(gather_data_row) + if (allocated(gather_data_col)) deallocate(gather_data_col) + end subroutine pcg_solver_standard_2d !**************************************************************************** @@ -1178,7 +1663,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + parallel ! info for parallel communication integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 @@ -1285,9 +1770,6 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & xuh_u, xuh_v, & xlh_u, xlh_v - real(dp), dimension(:,:), allocatable :: & - b_u, b_v, x_u, x_v - ! Note: These two matrices are global in the EW and NS dimensions, respectively. ! Each holds 8 pieces of information for each task on each row or column. ! 
Since only 2 of these 8 pieces of information change from one iteration to the next, @@ -2015,6 +2497,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(gather_data_row)) deallocate(gather_data_row) + if (allocated(gather_data_col)) deallocate(gather_data_col) end subroutine pcg_solver_chrongear_3d @@ -2115,7 +2599,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j ! grid indices + integer :: i, j, ii, jj ! grid indices integer :: m ! matrix element index integer :: iter ! iteration counter @@ -2130,24 +2614,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & real(dp), dimension(2) :: & gsum ! result of global sum for dot products - ! diagonal matrix elements - real(dp), dimension(nx-1,ny-1) :: & - Adiagu, Adiagv ! diagonal terms of matrices Auu and Avv - - ! tridiagonal matrix elements - real(dp), dimension(:,:), allocatable :: & - Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning - Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning - - real(dp), dimension(:,:), allocatable :: & - omega_u, omega_v, & ! work arrays for tridiagonal solve - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v - - real(dp), dimension(:,:), allocatable :: & - b_u, b_v, x_u, x_v - ! vectors (each of these is split into u and v components) real(dp), dimension(nx-1,ny-1) :: & ru, rv, &! residual vector (b-Ax) @@ -2167,13 +2633,30 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector = sqrt(b,b) ! solver is converged when L2_resid/L2_rhs < tolerance + ! diagonal matrix elements + real(dp), dimension(nx-1,ny-1) :: & + Adiagu, Adiagv ! diagonal terms of matrices Auu and Avv + + ! 
tridiagonal matrix elements + real(dp), dimension(:,:), allocatable :: & + Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning + Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning + + real(dp), dimension(:,:), allocatable :: & + omega_u, omega_v, & ! work arrays for tridiagonal solve + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v + + real(dp), dimension(:,:), allocatable :: & + b_u, b_v, x_u, x_v + real(dp), dimension(:,:), allocatable :: & gather_data_row, & ! arrays for gathering data from every task on a row or column gather_data_col integer :: ilocal, jlocal ! number of locally owned vertices in each direction - integer :: ii, jj integer :: maxiters_chrongear ! max number of linear iterations before quitting integer :: & @@ -2285,9 +2768,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & denom_u, denom_v) elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - !TODO - Figure out why these calculations depend on the number of cores. Halo bug? - ! Allocate tridiagonal matrices + ! Allocate tridiagonal preconditioning matrices ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. allocate(Adiag_u (ilocal,jlocal)) @@ -2310,6 +2792,12 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(b_v(jlocal,ilocal)) allocate(x_v(jlocal,ilocal)) + ! These two matrices are for gathering data from all tasks on a given row or column. + allocate(gather_data_row(8*tasks_row,jlocal)) + allocate(gather_data_col(8*tasks_col,ilocal)) + gather_data_row = 0.0d0 + gather_data_col = 0.0d0 + ! Compute the entries of the tridiagonal matrices ! Extract tridiagonal matrix entries from Auu @@ -2548,10 +3036,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & enddo enddo - ! Initialize the array for gathering information on each row of tasks - allocate(gather_data_row(8*tasks_row,jlocal)) - gather_data_row = 0.0d0 - ! 
Solve M*z = r, where M is a global tridiagonal matrix call tridiag_solver_global_2d(ilocal, jlocal, & @@ -2589,10 +3073,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & enddo enddo - ! Initialize the array for gathering information on each column of tasks - allocate(gather_data_col(8*tasks_col,ilocal)) - gather_data_col = 0.0d0 - call tridiag_solver_global_2d(jlocal, ilocal, & parallel, tasks_col, & 'col', & ! tridiagonal solve for each column From d2c46ae9268c4332c344e198b13f63e730d1b60d Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Thu, 22 Jan 2026 20:00:54 -0700 Subject: [PATCH 15/21] Reordered the PCG subroutines This commit simply moves the 2D subroutines before the 3D subroutines in glissade_velo_higher_pcg.F90. It seems more logical to put the 2D subroutines first. Might do something similar in glissade_velo_higher.F90. --- libglissade/glissade_velo_higher_pcg.F90 | 3061 +++++++++++----------- 1 file changed, 1531 insertions(+), 1530 deletions(-) diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 4a303b71..57d1cebe 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -53,8 +53,8 @@ module glissade_velo_higher_pcg implicit none private - public :: pcg_solver_standard_3d, pcg_solver_standard_2d, & - pcg_solver_chrongear_3d, pcg_solver_chrongear_2d, & + public :: pcg_solver_standard_2d, pcg_solver_standard_3d, & + pcg_solver_chrongear_2d, pcg_solver_chrongear_3d, & matvec_multiply_structured_3d logical, parameter :: verbose_pcg = .false. 
@@ -64,10 +64,9 @@ module glissade_velo_higher_pcg !**************************************************************************** - subroutine pcg_solver_standard_3d(nx, ny, & - nz, parallel, & - indxA_2d, indxA_3d, & - active_vertex, & + subroutine pcg_solver_standard_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -82,50 +81,25 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! to solve the equation $Ax=b$. ! Convergence is checked every {\em linear_solve_ncheck} steps. ! - ! It is based on the barotropic solver in the POP ocean model - ! (author Phil Jones, LANL). Input and output arrays are located - ! on a structured (i,j,k) grid as defined in the glissade_velo_higher - ! module. The global matrix is sparse, but its nonzero elements - ! are stored in four dense matrices called Auu, Avv, Auv, and Avu. - ! Each matrix has 3x3x3 = 27 potential nonzero elements per - ! node (i,j,k). + ! It is similar to subroutine pcg_solver_standard_3d, but modified + ! to solve for x and y at a single horizontal level, as in the + ! shallow-shelf approximation. See the comments in that subroutine + ! (above) for more details on data structure and solver methods. + ! + ! Input and output arrays are located on a structured (i,j) grid + ! as defined in the glissade_velo_higher module. The global matrix + ! is sparse, but its nonzero element are stored in four dense matrices + ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential + ! nonzero elements per node (i,j). ! ! The current preconditioning options are ! (0) no preconditioning ! (1) diagonal preconditioning - ! (2) preconditioning using a physics-based SIA solver - ! - ! For the dome test case with higher-order dynamics, option (2) is best. - ! - ! Here is a schematic of the method implemented below for solving Ax = b: - ! - ! halo_update(x0) - ! r0 = b - A*x0 - ! d0 = 0 - ! eta0 = 1 - ! - ! while (not converged) - ! solve Mz = r for z - ! eta1 = (r,z) - ! 
beta = eta1/eta0 - ! d = z + beta*d - ! halo_update(d) - ! eta0 = eta1 - ! q = Ad - ! eta2 = (d,q) - ! alpha = eta1/eta2 - ! x = x + alpha*d - ! r = r - alpha*q (or occasionally, r = b - Ax) - ! Check for convergence: err = sqrt(r,r)/sqrt(b,b) < tolerance - ! end while ! - ! where x = solution (initial value = x0) - ! d = conjugate direction vector (initial value = d0) - ! r = residual vector (initial value = r0) - ! M = preconditioning matrix - ! (r,z) = dot product of vectors r and z - ! and similarly for (d,q) - ! + ! The SIA-based preconditioning optional is not available for a 2D solve. + ! + ! TODO: Add a tridiagonal preconditioning option to this subroutine, + ! as for subroutine pcg_solver_chrongear_2d. !--------------------------------------------------------------- !--------------------------------------------------------------- @@ -133,70 +107,62 @@ subroutine pcg_solver_standard_3d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + parallel ! info for parallel communication integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 - integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & - indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 - logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! T for columns (i,j) where velocity is computed, else F + active_vertex ! T for vertices (i,j) where velocity is computed, else F - real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1,9), intent(in) :: & Auu, Auv, Avu, Avv ! 
four components of assembled matrix - ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) - ! other dimensions = (z,x,y) indices + ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) + ! 1st and 2nd dimensions = (x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! = 1 for diagonal preconditioning (best option for SSA-dominated flow) - ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, k ! grid indices - integer :: iA, jA, kA ! grid offsets ranging from -1 to 1 - integer :: m ! 
matrix element index - integer :: ilocal, jlocal ! number of locally owned vertices in each direction - integer :: iter ! iteration counter + integer :: i, j, ii, jj ! grid indices + integer :: iter ! iteration counter real(dp) :: & eta0, eta1, eta2, &! scalar inner product results @@ -204,7 +170,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & beta ! eta1/eta0 = term in expression for new direction vector ! vectors (each of these is split into u and v components) - real(dp), dimension(nz,nx-1,ny-1) :: & + real(dp), dimension(nx-1,ny-1) :: & Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector @@ -217,33 +183,26 @@ subroutine pcg_solver_standard_3d(nx, ny, & L2_rhs ! L2 norm of rhs vector b ! solver converges when L2_resid/L2_rhs < tolerance - real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & - Muu, Mvv ! simplified SIA matrices for preconditioning - - ! arrays for tridiagonal preconditioning - ! Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above + ! tridiagonal matrix elements real(dp), dimension(:,:), allocatable :: & Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning - + real(dp), dimension(:,:), allocatable :: & omega_u, omega_v, & ! work arrays for tridiagonal solve - denom_u, denom_v, & + denom_u, denom_v, & xuh_u, xuh_v, & xlh_u, xlh_v - ! Note: These two matrices are global in the EW and NS dimensions, respectively. - ! Each holds 8 pieces of information for each task on each row or column. - ! Since only 2 of these 8 pieces of information change from one iteration to the next, - ! it is more efficient to gather the remaining information once and pass the arrays - ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d - ! 
and gather all the information every time the subroutine is called. - ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? + real(dp), dimension(:,:), allocatable :: & + b_u, b_v, x_u, x_v real(dp), dimension(:,:), allocatable :: & gather_data_row, & ! arrays for gathering data from every task on a row or column gather_data_col + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + integer :: & tasks_row, & ! number of tasks per row and column for tridiagonal solve tasks_col @@ -254,6 +213,11 @@ subroutine pcg_solver_standard_3d(nx, ny, & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + !TODO - Apply the following for tridiag PCs? +! integer, parameter :: & +! maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, +! ! which generally leads to faster convergence than diagonal preconditioning + !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max @@ -281,48 +245,33 @@ subroutine pcg_solver_standard_3d(nx, ny, & if (precond == HO_PRECOND_NONE) then ! no preconditioner - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using no preconditioner' endif elseif (precond == HO_PRECOND_DIAG) then - call setup_preconditioner_diag_3d(nx, ny, & - nz, indxA_3d, & - Auu, Avv, & - Adiagu, Adiagv) - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - i = itest - j = jtest - write(iulog,*) 'i, j, r =', i, j, this_rank - write(iulog,*) 'Auu diag =', Adiagu(:,i,j) - write(iulog,*) 'Avv diag =', Adiagv(:,i,j) - endif + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Using diagonal matrix for preconditioning' + endif ! 
verbose_pcg - elseif (precond == HO_PRECOND_SIA) then + call setup_preconditioner_diag_2d(nx, ny, & + indxA_2d, & + Auu, Avv, & + Adiagu, Adiagv) - call setup_preconditioner_sia_3d(nx, ny, & - nz, indxA_3d, & - Auu, Avv, & - Muu, Mvv) + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - if (verbose_pcg .and. this_rank == rtest) then + !WHL - debug + if (verbose_tridiag .and. this_rank==rtest) then + i = itest j = jtest write(iulog,*) ' ' - write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' - do i = staggered_ihi, staggered_ilo, -1 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) - enddo - enddo ! i + write(iulog,*) 'r, i, j =', this_rank, i, j + write(iulog,*) 'Auu =', Auu(i,j,:) + write(iulog,*) 'Avv =', Avv(i,j,:) endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - - ! Allocate tridiagonal preconditioning matrices allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) allocate(Asupdiag_u(nx-1,ny-1)) @@ -335,26 +284,22 @@ subroutine pcg_solver_standard_3d(nx, ny, & allocate(omega_v (nx-1,ny-1)) allocate(denom_v (nx-1,ny-1)) - ! Compute arrays for tridiagonal preconditioning - - call setup_preconditioner_tridiag_local_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - indxA_2d, indxA_3d, & - itest, jtest, rtest, & - Auu, Avv, & - Muu, Mvv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & + call setup_preconditioner_tridiag_local_2d(& + nx, ny, & + parallel, indxA_2d, & + itest, jtest, rtest, & + Auu, Avv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & denom_u, denom_v) elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - ! Allocate tridiagonal preconditioning matrices - ! Note: (i,j) indices are switched for the A_v matrices to reduce striding. + ! Allocate tridiagonal matrices + ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. 
+ allocate(Adiag_u (ilocal,jlocal)) allocate(Asubdiag_u(ilocal,jlocal)) allocate(Asupdiag_u(ilocal,jlocal)) @@ -362,6 +307,8 @@ subroutine pcg_solver_standard_3d(nx, ny, & allocate(denom_u(ilocal,jlocal)) allocate(xuh_u(ilocal,jlocal)) allocate(xlh_u(ilocal,jlocal)) + allocate(b_u(ilocal,jlocal)) + allocate(x_u(ilocal,jlocal)) allocate(Adiag_v (jlocal,ilocal)) allocate(Asubdiag_v(jlocal,ilocal)) @@ -370,6 +317,8 @@ subroutine pcg_solver_standard_3d(nx, ny, & allocate(denom_v(jlocal,ilocal)) allocate(xuh_v(jlocal,ilocal)) allocate(xlh_v(jlocal,ilocal)) + allocate(b_v(jlocal,ilocal)) + allocate(x_v(jlocal,ilocal)) ! These two matrices are for gathering data from all tasks on a given row or column. allocate(gather_data_row(8*tasks_row,jlocal)) @@ -377,36 +326,72 @@ subroutine pcg_solver_standard_3d(nx, ny, & gather_data_row = 0.0d0 gather_data_col = 0.0d0 - call setup_preconditioner_tridiag_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - indxA_2d, indxA_3d, & - ilocal, jlocal, & - itest, jtest, rtest, & - Auu, Avv, & - Muu, Mvv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v) - - endif ! precond + ! Compute the entries of the tridiagonal matrices - call t_stopf("pcg_precond_init") + ! Extract tridiagonal matrix entries from Auu + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements + Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements + Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements + enddo + enddo - ! Compute initial residual and initialize the direction vector d - ! Note: The matrix A must be complete for all rows corresponding to locally - ! owned vertices, and x must have the correct values in - ! halo vertices bordering the locally owned vertices. - ! 
Then y = Ax will be correct for locally owned vertices. + ! compute work arrays for the u solve in each matrix row + call setup_preconditioner_tridiag_global_2d(& + ilocal, jlocal, & +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u) - ! Halo update for x (initial guess for velocity solution) + ! Extract tridiagonal matrix entries from Avv - call t_startf("pcg_halo_init") + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements + Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements + Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the v solve in each matrix column + ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + + call setup_preconditioner_tridiag_global_2d(& + jlocal, ilocal, & +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v) + + endif ! precond + + if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' + + call t_stopf("pcg_precond_init") + + ! Compute initial residual and initialize the direction vector d + ! Note: The matrix A must be complete for all rows corresponding to locally + ! owned vertices, and x must have the correct values in + ! halo vertices bordering the locally owned vertices. + ! Then y = Ax will be correct for locally owned vertices. + + ! 
Halo update for x (initial guess for velocity solution) + + call t_startf("pcg_halo_init") call staggered_parallel_halo(xu, parallel) call staggered_parallel_halo(xv, parallel) call t_stopf("pcg_halo_init") @@ -414,9 +399,9 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_init") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -427,8 +412,8 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! This will be correct for locally owned vertices. call t_startf("pcg_vecupdate_init") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate_init") ! Initialize scalars and vectors @@ -436,18 +421,18 @@ subroutine pcg_solver_standard_3d(nx, ny, & niters = maxiters eta0 = 1.d0 - du(:,:,:) = 0.d0 - dv(:,:,:) = 0.d0 + du(:,:) = 0.d0 + dv(:,:) = 0.d0 - zu(:,:,:) = 0.d0 - zv(:,:,:) = 0.d0 + zu(:,:) = 0.d0 + zv(:,:) = 0.d0 ! Compute the L2 norm of the RHS vectors ! (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - work0u(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) - work0v(:,:,:) = bv(:,:,:)*bv(:,:,:) + work0u(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) + work0v(:,:) = bv(:,:)*bv(:,:) call t_stopf("pcg_dotprod") ! find global sum of the squared L2 norm @@ -464,92 +449,173 @@ subroutine pcg_solver_standard_3d(nx, ny, & iter_loop: do iter = 1, maxiters + if (verbose_pcg .and. main_task) then +! write(iulog,*) 'iter =', iter + endif + call t_startf("pcg_precond") ! Compute PC(r) = solution z of Mz = r if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:,:) = ru(:,:,:) ! PC(r) = r - zv(:,:,:) = rv(:,:,:) ! PC(r) = r + zu(:,:) = ru(:,:) ! PC(r) = r + zv(:,:) = rv(:,:) ! 
PC(r) = r - elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning do j = 1, ny-1 do i = 1, nx-1 - do k = 1, nz - if (Adiagu(k,i,j) /= 0.d0) then - zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A + if (Adiagu(i,j) /= 0.d0) then + zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(k,i,j) = 0.d0 + zu(i,j) = 0.d0 endif - if (Adiagv(k,i,j) /= 0.d0) then - zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + if (Adiagv(i,j) /= 0.d0) then + zv(i,j) = rv(i,j) / Adiagv(i,j) else - zv(k,i,j) = 0.d0 + zv(i,j) = 0.d0 endif - enddo ! k enddo ! i enddo ! j - elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning - - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Muu, ru, zu) ! solve Muu*zu = ru for zu + elseif(precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Mvv, rv, zv) ! solve Mvv*zv = rv for zv + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest + write(iulog,*) 'Residual:' + write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) + write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'call tridiag_solver_local_2d' + endif - ! Use a local tridiagonal solver to find an approximate solution of A*z = r + ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) + !TODO - Test a local solver that can compute zu and zv in the halo + ! 
(to avoid the halo update below) - call tridiag_solver_local_3d(& + call tridiag_solver_local_2d(& nx, ny, & - nz, parallel, & - active_vertex, & + parallel, & itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Adiag_u, Adiag_v, & ! entries of preconditioning matrix Asubdiag_u, Asubdiag_v, & Asupdiag_u, Asupdiag_v, & omega_u, omega_v, & denom_u, denom_v, & - Muu, Mvv, & ! entries of SIA matrix - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + ru, rv, & ! right hand side + zu, zv) ! solution + + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) - ! Use a global tridiagonal solver to find an approximate solution of A*z = r + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + ! Use a global tridiagonal solver to find an approximate solution of A*z = r if (iter == 1) then first_time = .true. else first_time = .false. endif - call tridiag_solver_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - ilocal, jlocal, & - tasks_row, tasks_col, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v, & - Muu, Mvv, & ! entries of SIA matrix - gather_data_row, gather_data_col, & - first_time, & - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + ! 
convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo + + ! Solve M*z = r, where M is a global tridiagonal matrix + + call tridiag_solver_global_2d(& + ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + first_time, & + gather_data_row) + + ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo + + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo + + call tridiag_solver_global_2d(& + jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column + !! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + first_time, & + gather_data_col) + + ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? 
+ call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + endif ! precond call t_stopf("pcg_precond") @@ -557,8 +623,8 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute the dot product eta1 = (r, PC(r)) call t_startf("pcg_dotprod") - work0u(:,:,:) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r, PC(r)) - work0v(:,:,:) = rv(:,:,:)*zv(:,:,:) + work0u(:,:) = ru(:,:)*zu(:,:) ! terms of dot product (r, PC(r)) + work0v(:,:) = rv(:,:)*zv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") @@ -569,7 +635,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! then eta1 will be NaN. if (eta1 /= eta1) then ! eta1 is NaN - call write_log('PCG solver has failed, alpha = NaN', GM_FATAL) + call write_log('PCG solver has failed, eta1 = NaN', GM_FATAL) endif ! Update the conjugate direction vector d @@ -577,16 +643,16 @@ subroutine pcg_solver_standard_3d(nx, ny, & beta = eta1/eta0 call t_startf("pcg_vecupdate") - du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - ! Initially eta0 = 1 - ! For n >=2, eta0 = old eta1 - call t_stopf("pcg_vecupdate") - - ! Halo update for d + du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:) = zv(:,:) + beta*dv(:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! where beta_(i+1) = -------------------- + ! (r_i, PC(r_i)) + ! Initially eta0 = 1 + ! For n >=2, eta0 = old eta1 + call t_stopf("pcg_vecupdate") + + ! Halo update for d call t_startf("pcg_halo_iter") call staggered_parallel_halo(du, parallel) @@ -597,9 +663,9 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! 
This is the one matvec multiply required for each iteration call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -613,8 +679,8 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute the dot product eta2 = (d, A*d) call t_startf("pcg_dotprod") - work0u(:,:,:) = du(:,:,:) * qu(:,:,:) ! terms of dot product (d, Ad) - work0v(:,:,:) = dv(:,:,:) * qv(:,:,:) + work0u(:,:) = du(:,:) * qu(:,:) ! terms of dot product (d, Ad) + work0v(:,:) = dv(:,:) * qv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") @@ -636,26 +702,13 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute the new solution and residual call t_startf("pcg_vecupdate") - xu(:,:,:) = xu(:,:,:) + alpha * du(:,:,:) ! new solution, x_(i+1) = x_i + alpha*d - xv(:,:,:) = xv(:,:,:) + alpha * dv(:,:,:) + xu(:,:) = xu(:,:) + alpha * du(:,:) ! new solution, x_(i+1) = x_i + alpha*d + xv(:,:) = xv(:,:) + alpha * dv(:,:) - ru(:,:,:) = ru(:,:,:) - alpha * qu(:,:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) - rv(:,:,:) = rv(:,:,:) - alpha * qv(:,:,:) + ru(:,:) = ru(:,:) - alpha * qu(:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) + rv(:,:) = rv(:,:) - alpha * qv(:,:) call t_stopf("pcg_vecupdate") - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'iter =', iter - write(iulog,*) 'i, k, xu, xv, ru, rv:' - do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) - enddo - enddo ! i - endif - ! Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. 
@@ -664,11 +717,6 @@ subroutine pcg_solver_standard_3d(nx, ny, & if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then - if (verbose_pcg .and. main_task) then - write(iulog,*) ' ' - write(iulog,*) 'Check convergence, iter =', iter - endif - ! Halo update for x call t_startf("pcg_halo_resid") @@ -679,9 +727,9 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -691,15 +739,15 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute residual r = b - Ax call t_startf("pcg_vecupdate") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") ! Compute squared L2 norm of (r, r) call t_startf("pcg_dotprod") - work0u(:,:,:) = ru(:,:,:)*ru(:,:,:) ! terms of dot product (r, r) - work0v(:,:,:) = rv(:,:,:)*rv(:,:,:) + work0u(:,:) = ru(:,:)*ru(:,:) ! terms of dot product (r, r) + work0v(:,:) = rv(:,:)*rv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") @@ -712,10 +760,6 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! compute normalized error err = L2_resid/L2_rhs - if (verbose_pcg .and. main_task) then - write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err - endif - !WHL - debug if (verbose_pcg .and. 
this_rank == rtest) then ru_max = 0.d0 @@ -724,13 +768,13 @@ subroutine pcg_solver_standard_3d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (sum(abs(ru(:,i,j))) > ru_max) then - ru_max = sum(abs(ru(:,i,j))) + if (abs(ru(i,j)) > ru_max) then + ru_max = ru(i,j) iu_max = i ju_max = j endif - if (sum(abs(rv(:,i,j))) > rv_max) then - rv_max = sum(abs(rv(:,i,j))) + if (abs(rv(i,j)) > rv_max) then + rv_max = rv(i,j) iv_max = i jv_max = j endif @@ -771,16 +815,19 @@ subroutine pcg_solver_standard_3d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(b_u)) deallocate(b_u, b_v) + if (allocated(x_u)) deallocate(x_u, x_v) if (allocated(gather_data_row)) deallocate(gather_data_row) if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_standard_3d + end subroutine pcg_solver_standard_2d !**************************************************************************** - subroutine pcg_solver_standard_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + subroutine pcg_solver_standard_3d(nx, ny, & + nz, parallel, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -795,25 +842,50 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! to solve the equation $Ax=b$. ! Convergence is checked every {\em linear_solve_ncheck} steps. ! - ! It is similar to subroutine pcg_solver_standard_3d, but modified - ! to solve for x and y at a single horizontal level, as in the - ! shallow-shelf approximation. See the comments in that subroutine - ! (above) for more details on data structure and solver methods. - ! - ! Input and output arrays are located on a structured (i,j) grid - ! as defined in the glissade_velo_higher module. The global matrix - ! is sparse, but its nonzero element are stored in four dense matrices - ! called Auu, Avv, Auv, and Avu. 
Each matrix has 3x3 = 9 potential - ! nonzero elements per node (i,j). + ! It is based on the barotropic solver in the POP ocean model + ! (author Phil Jones, LANL). Input and output arrays are located + ! on a structured (i,j,k) grid as defined in the glissade_velo_higher + ! module. The global matrix is sparse, but its nonzero elements + ! are stored in four dense matrices called Auu, Avv, Auv, and Avu. + ! Each matrix has 3x3x3 = 27 potential nonzero elements per + ! node (i,j,k). ! ! The current preconditioning options are ! (0) no preconditioning ! (1) diagonal preconditioning - ! - ! The SIA-based preconditioning optional is not available for a 2D solve. + ! (2) preconditioning using a physics-based SIA solver ! - ! TODO: Add a tridiagonal preconditioning option to this subroutine, - ! as for subroutine pcg_solver_chrongear_2d. + ! For the dome test case with higher-order dynamics, option (2) is best. + ! + ! Here is a schematic of the method implemented below for solving Ax = b: + ! + ! halo_update(x0) + ! r0 = b - A*x0 + ! d0 = 0 + ! eta0 = 1 + ! + ! while (not converged) + ! solve Mz = r for z + ! eta1 = (r,z) + ! beta = eta1/eta0 + ! d = z + beta*d + ! halo_update(d) + ! eta0 = eta1 + ! q = Ad + ! eta2 = (d,q) + ! alpha = eta1/eta2 + ! x = x + alpha*d + ! r = r - alpha*q (or occasionally, r = b - Ax) + ! Check for convergence: err = sqrt(r,r)/sqrt(b,b) < tolerance + ! end while + ! + ! where x = solution (initial value = x0) + ! d = conjugate direction vector (initial value = d0) + ! r = residual vector (initial value = r0) + ! M = preconditioning matrix + ! (r,z) = dot product of vectors r and z + ! and similarly for (d,q) + ! !--------------------------------------------------------------- !--------------------------------------------------------------- @@ -821,62 +893,70 @@ subroutine pcg_solver_standard_2d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny ! 
horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) + nx, ny, & ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) + nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + parallel ! info for parallel communication integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 + integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & + indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 + logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! T for vertices (i,j) where velocity is computed, else F + active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nx-1,ny-1,9), intent(in) :: & + real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) - ! 1st and 2nd dimensions = (x,y) indices + ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) + ! other dimensions = (z,x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! = 1 for diagonal preconditioning (best option for SSA-dominated flow) + ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! 
number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, ii, jj ! grid indices - integer :: iter ! iteration counter + integer :: i, j, k ! grid indices + integer :: iA, jA, kA ! grid offsets ranging from -1 to 1 + integer :: m ! matrix element index + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + integer :: iter ! iteration counter real(dp) :: & eta0, eta1, eta2, &! scalar inner product results @@ -884,7 +964,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & beta ! eta1/eta0 = term in expression for new direction vector ! vectors (each of these is split into u and v components) - real(dp), dimension(nx-1,ny-1) :: & + real(dp), dimension(nz,nx-1,ny-1) :: & Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector @@ -897,26 +977,33 @@ subroutine pcg_solver_standard_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector b ! solver converges when L2_resid/L2_rhs < tolerance - ! tridiagonal matrix elements + real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & + Muu, Mvv ! simplified SIA matrices for preconditioning + + ! arrays for tridiagonal preconditioning + ! 
Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above real(dp), dimension(:,:), allocatable :: & Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning - + real(dp), dimension(:,:), allocatable :: & omega_u, omega_v, & ! work arrays for tridiagonal solve - denom_u, denom_v, & + denom_u, denom_v, & xuh_u, xuh_v, & xlh_u, xlh_v - real(dp), dimension(:,:), allocatable :: & - b_u, b_v, x_u, x_v + ! Note: These two matrices are global in the EW and NS dimensions, respectively. + ! Each holds 8 pieces of information for each task on each row or column. + ! Since only 2 of these 8 pieces of information change from one iteration to the next, + ! it is more efficient to gather the remaining information once and pass the arrays + ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d + ! and gather all the information every time the subroutine is called. + ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? real(dp), dimension(:,:), allocatable :: & gather_data_row, & ! arrays for gathering data from every task on a row or column gather_data_col - integer :: ilocal, jlocal ! number of locally owned vertices in each direction - integer :: & tasks_row, & ! number of tasks per row and column for tridiagonal solve tasks_col @@ -927,11 +1014,6 @@ subroutine pcg_solver_standard_2d(nx, ny, & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi - !TODO - Apply the following for tridiag PCs? -! integer, parameter :: & -! maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, -! ! 
which generally leads to faster convergence than diagonal preconditioning - !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max @@ -959,33 +1041,48 @@ subroutine pcg_solver_standard_2d(nx, ny, & if (precond == HO_PRECOND_NONE) then ! no preconditioner - if (verbose_pcg .and. main_task) then + if (verbose_pcg .and. this_rank == rtest) then write(iulog,*) 'Using no preconditioner' endif elseif (precond == HO_PRECOND_DIAG) then - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Using diagonal matrix for preconditioning' - endif ! verbose_pcg - - call setup_preconditioner_diag_2d(nx, ny, & - indxA_2d, & - Auu, Avv, & - Adiagu, Adiagv) - - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + call setup_preconditioner_diag_3d(nx, ny, & + nz, indxA_3d, & + Auu, Avv, & + Adiagu, Adiagv) !WHL - debug - if (verbose_tridiag .and. this_rank==rtest) then + if (verbose_pcg .and. this_rank == rtest) then i = itest + j = jtest + write(iulog,*) 'i, j, r =', i, j, this_rank + write(iulog,*) 'Auu diag =', Adiagu(:,i,j) + write(iulog,*) 'Avv diag =', Adiagv(:,i,j) + endif + + elseif (precond == HO_PRECOND_SIA) then + + call setup_preconditioner_sia_3d(nx, ny, & + nz, indxA_3d, & + Auu, Avv, & + Muu, Mvv) + + if (verbose_pcg .and. this_rank == rtest) then j = jtest write(iulog,*) ' ' - write(iulog,*) 'r, i, j =', this_rank, i, j - write(iulog,*) 'Auu =', Auu(i,j,:) - write(iulog,*) 'Avv =', Avv(i,j,:) + write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' + do i = staggered_ihi, staggered_ilo, -1 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) + enddo + enddo ! i endif + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + ! 
Allocate tridiagonal preconditioning matrices allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) allocate(Asupdiag_u(nx-1,ny-1)) @@ -998,22 +1095,26 @@ subroutine pcg_solver_standard_2d(nx, ny, & allocate(omega_v (nx-1,ny-1)) allocate(denom_v (nx-1,ny-1)) - call setup_preconditioner_tridiag_local_2d(& - nx, ny, & - parallel, indxA_2d, & - itest, jtest, rtest, & - Auu, Avv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & + ! Compute arrays for tridiagonal preconditioning + + call setup_preconditioner_tridiag_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & denom_u, denom_v) elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - ! Allocate tridiagonal matrices - ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. - + ! Allocate tridiagonal preconditioning matrices + ! Note: (i,j) indices are switched for the A_v matrices to reduce striding. allocate(Adiag_u (ilocal,jlocal)) allocate(Asubdiag_u(ilocal,jlocal)) allocate(Asupdiag_u(ilocal,jlocal)) @@ -1021,8 +1122,6 @@ subroutine pcg_solver_standard_2d(nx, ny, & allocate(denom_u(ilocal,jlocal)) allocate(xuh_u(ilocal,jlocal)) allocate(xlh_u(ilocal,jlocal)) - allocate(b_u(ilocal,jlocal)) - allocate(x_u(ilocal,jlocal)) allocate(Adiag_v (jlocal,ilocal)) allocate(Asubdiag_v(jlocal,ilocal)) @@ -1031,8 +1130,6 @@ subroutine pcg_solver_standard_2d(nx, ny, & allocate(denom_v(jlocal,ilocal)) allocate(xuh_v(jlocal,ilocal)) allocate(xlh_v(jlocal,ilocal)) - allocate(b_v(jlocal,ilocal)) - allocate(x_v(jlocal,ilocal)) ! These two matrices are for gathering data from all tasks on a given row or column. allocate(gather_data_row(8*tasks_row,jlocal)) @@ -1040,61 +1137,25 @@ subroutine pcg_solver_standard_2d(nx, ny, & gather_data_row = 0.0d0 gather_data_col = 0.0d0 - ! 
Compute the entries of the tridiagonal matrices - - ! Extract tridiagonal matrix entries from Auu - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements - Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements - Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements - enddo - enddo - - ! compute work arrays for the u solve in each matrix row - call setup_preconditioner_tridiag_global_2d(& - ilocal, jlocal, & -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u) - - ! Extract tridiagonal matrix entries from Avv - - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements - Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements - Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements - enddo - enddo - - ! compute work arrays for the v solve in each matrix column - ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides - - call setup_preconditioner_tridiag_global_2d(& - jlocal, ilocal, & -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! 
itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v) + call setup_preconditioner_tridiag_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + ilocal, jlocal, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v) endif ! precond - if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' - call t_stopf("pcg_precond_init") ! Compute initial residual and initialize the direction vector d @@ -1113,9 +1174,9 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_init") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -1126,8 +1187,8 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! This will be correct for locally owned vertices. call t_startf("pcg_vecupdate_init") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate_init") ! Initialize scalars and vectors @@ -1135,18 +1196,18 @@ subroutine pcg_solver_standard_2d(nx, ny, & niters = maxiters eta0 = 1.d0 - du(:,:) = 0.d0 - dv(:,:) = 0.d0 + du(:,:,:) = 0.d0 + dv(:,:,:) = 0.d0 - zu(:,:) = 0.d0 - zv(:,:) = 0.d0 + zu(:,:,:) = 0.d0 + zv(:,:,:) = 0.d0 ! Compute the L2 norm of the RHS vectors ! (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - work0u(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) - work0v(:,:) = bv(:,:)*bv(:,:) + work0u(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) + work0v(:,:,:) = bv(:,:,:)*bv(:,:,:) call t_stopf("pcg_dotprod") ! 
find global sum of the squared L2 norm @@ -1163,182 +1224,101 @@ subroutine pcg_solver_standard_2d(nx, ny, & iter_loop: do iter = 1, maxiters - if (verbose_pcg .and. main_task) then -! write(iulog,*) 'iter =', iter - endif - call t_startf("pcg_precond") ! Compute PC(r) = solution z of Mz = r if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:) = ru(:,:) ! PC(r) = r - zv(:,:) = rv(:,:) ! PC(r) = r + zu(:,:,:) = ru(:,:,:) ! PC(r) = r + zv(:,:,:) = rv(:,:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning do j = 1, ny-1 do i = 1, nx-1 - if (Adiagu(i,j) /= 0.d0) then - zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A + do k = 1, nz + if (Adiagu(k,i,j) /= 0.d0) then + zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(i,j) = 0.d0 + zu(k,i,j) = 0.d0 endif - if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) + if (Adiagv(k,i,j) /= 0.d0) then + zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) else - zv(i,j) = 0.d0 + zv(k,i,j) = 0.d0 endif + enddo ! k enddo ! i enddo ! j - elseif(precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local + elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning - if (verbose_tridiag .and. this_rank == rtest) then - i = itest - j = jtest - write(iulog,*) 'Residual:' - write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) - write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) - write(iulog,*) ' ' - write(iulog,*) 'jtest =', jtest - write(iulog,*) 'i, ru, rv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) - enddo - endif + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Muu, ru, zu) ! solve Muu*zu = ru for zu - if (verbose_pcg .and. 
main_task) then - write(iulog,*) 'call tridiag_solver_local_2d' - endif + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Mvv, rv, zv) ! solve Mvv*zv = rv for zv - ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - !TODO - Test a local solver that can compute zu and zv in the halo - ! (to avoid the halo update below) + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - call tridiag_solver_local_2d(& + ! Use a local tridiagonal solver to find an approximate solution of A*z = r + + call tridiag_solver_local_3d(& nx, ny, & - parallel, & + nz, parallel, & + active_vertex, & itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix Asubdiag_u, Asubdiag_v, & Asupdiag_u, Asupdiag_v, & omega_u, omega_v, & denom_u, denom_v, & - ru, rv, & ! right hand side - zu, zv) ! solution - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) - enddo - endif - - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! Use a global tridiagonal solver to find an approximate solution of A*z = r + if (iter == 1) then first_time = .true. else first_time = .false. endif - ! 
convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - b_u(i,j) = ru(ii,jj) - enddo - enddo + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + first_time, & + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r - ! Solve M*z = r, where M is a global tridiagonal matrix + endif ! precond - call tridiag_solver_global_2d(& - ilocal, jlocal, & - parallel, tasks_row, & - 'row', & ! tridiagonal solve for each row -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u, & - b_u, x_u, & - first_time, & - gather_data_row) + call t_stopf("pcg_precond") - ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) - zu(:,:) = 0.0d0 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - zu(ii,jj) = x_u(i,j) - enddo - enddo - - ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - b_v(j,i) = rv(ii,jj) - enddo - enddo - - call tridiag_solver_global_2d(& - jlocal, ilocal, & - parallel, tasks_col, & - 'col', & ! tridiagonal solve for each column - !! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! 
itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v, & - b_v, x_v, & - first_time, & - gather_data_col) - - ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) - zv(:,:) = 0.0d0 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - zv(ii,jj) = x_v(j,i) - enddo - enddo - - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) - - endif ! precond - - call t_stopf("pcg_precond") - - ! Compute the dot product eta1 = (r, PC(r)) + ! Compute the dot product eta1 = (r, PC(r)) call t_startf("pcg_dotprod") - work0u(:,:) = ru(:,:)*zu(:,:) ! terms of dot product (r, PC(r)) - work0v(:,:) = rv(:,:)*zv(:,:) + work0u(:,:,:) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r, PC(r)) + work0v(:,:,:) = rv(:,:,:)*zv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") @@ -1349,7 +1329,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! then eta1 will be NaN. if (eta1 /= eta1) then ! eta1 is NaN - call write_log('PCG solver has failed, eta1 = NaN', GM_FATAL) + call write_log('PCG solver has failed, alpha = NaN', GM_FATAL) endif ! Update the conjugate direction vector d @@ -1357,13 +1337,13 @@ subroutine pcg_solver_standard_2d(nx, ny, & beta = eta1/eta0 call t_startf("pcg_vecupdate") - du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:) = zv(:,:) + beta*dv(:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - ! Initially eta0 = 1 - ! For n >=2, eta0 = old eta1 + du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! 
where beta_(i+1) = -------------------- + ! (r_i, PC(r_i)) + ! Initially eta0 = 1 + ! For n >=2, eta0 = old eta1 call t_stopf("pcg_vecupdate") ! Halo update for d @@ -1377,9 +1357,9 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! This is the one matvec multiply required for each iteration call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -1393,8 +1373,8 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute the dot product eta2 = (d, A*d) call t_startf("pcg_dotprod") - work0u(:,:) = du(:,:) * qu(:,:) ! terms of dot product (d, Ad) - work0v(:,:) = dv(:,:) * qv(:,:) + work0u(:,:,:) = du(:,:,:) * qu(:,:,:) ! terms of dot product (d, Ad) + work0v(:,:,:) = dv(:,:,:) * qv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") @@ -1416,13 +1396,26 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute the new solution and residual call t_startf("pcg_vecupdate") - xu(:,:) = xu(:,:) + alpha * du(:,:) ! new solution, x_(i+1) = x_i + alpha*d - xv(:,:) = xv(:,:) + alpha * dv(:,:) + xu(:,:,:) = xu(:,:,:) + alpha * du(:,:,:) ! new solution, x_(i+1) = x_i + alpha*d + xv(:,:,:) = xv(:,:,:) + alpha * dv(:,:,:) - ru(:,:) = ru(:,:) - alpha * qu(:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) - rv(:,:) = rv(:,:) - alpha * qv(:,:) + ru(:,:,:) = ru(:,:,:) - alpha * qu(:,:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) + rv(:,:,:) = rv(:,:,:) - alpha * qv(:,:,:) call t_stopf("pcg_vecupdate") + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' + do i = itest-3, itest+3 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i + endif + ! 
Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -1431,6 +1424,11 @@ subroutine pcg_solver_standard_2d(nx, ny, & if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then + if (verbose_pcg .and. main_task) then + write(iulog,*) ' ' + write(iulog,*) 'Check convergence, iter =', iter + endif + ! Halo update for x call t_startf("pcg_halo_resid") @@ -1441,9 +1439,9 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -1453,15 +1451,15 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute residual r = b - Ax call t_startf("pcg_vecupdate") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate") ! Compute squared L2 norm of (r, r) call t_startf("pcg_dotprod") - work0u(:,:) = ru(:,:)*ru(:,:) ! terms of dot product (r, r) - work0v(:,:) = rv(:,:)*rv(:,:) + work0u(:,:,:) = ru(:,:,:)*ru(:,:,:) ! terms of dot product (r, r) + work0v(:,:,:) = rv(:,:,:)*rv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") @@ -1474,6 +1472,10 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! compute normalized error err = L2_resid/L2_rhs + if (verbose_pcg .and. main_task) then + write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err + endif + !WHL - debug if (verbose_pcg .and. 
this_rank == rtest) then ru_max = 0.d0 @@ -1482,13 +1484,13 @@ subroutine pcg_solver_standard_2d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs(ru(i,j)) > ru_max) then - ru_max = ru(i,j) + if (sum(abs(ru(:,i,j))) > ru_max) then + ru_max = sum(abs(ru(:,i,j))) iu_max = i ju_max = j endif - if (abs(rv(i,j)) > rv_max) then - rv_max = rv(i,j) + if (sum(abs(rv(:,i,j))) > rv_max) then + rv_max = sum(abs(rv(:,i,j))) iv_max = i jv_max = j endif @@ -1529,19 +1531,16 @@ subroutine pcg_solver_standard_2d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) - if (allocated(b_u)) deallocate(b_u, b_v) - if (allocated(x_u)) deallocate(x_u, x_v) if (allocated(gather_data_row)) deallocate(gather_data_row) if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_standard_2d + end subroutine pcg_solver_standard_3d !**************************************************************************** - subroutine pcg_solver_chrongear_3d(nx, ny, & - nz, parallel, & - indxA_2d, indxA_3d, & - active_vertex, & + subroutine pcg_solver_chrongear_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -1553,104 +1552,26 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !--------------------------------------------------------------- ! This subroutine uses a Chronopoulos-Gear preconditioned conjugate-gradient - ! algorithm to solve the equation $Ax=b$. - ! - ! It is based on the Chronopoulos-Gear PCG solver in the POP ocean model - ! (author Frank Bryan, NCAR). It is a rearranged conjugate gradient solver - ! that reduces the number of global reductions per iteration from two to one - ! (not counting the convergence check). Convergence is checked every - ! {\em linear_solve_ncheck} steps. - ! - ! References are: - ! - ! 
Chronopoulos, A.T., A Class of Parallel Iterative Methods Implemented on Multiprocessors,
- ! Ph.D. thesis, Technical Report UIUCDCS-R-86-1267, Department of Computer Science,
- ! University of Illinois, Urbana, Illinois, pp. 1-116, 1986.
- !
- ! Chronopoulos, A.T., and C.W. Gear. s-step iterative methods
- ! for symmetric linear systems. J. Comput. Appl. Math., 25(2),
- ! 153-168, 1989.
- !
- ! Dongarra, J. and V. Eijkhout. LAPACK Working Note 159.
- ! Finite-choice algorithm optimization in conjugate gradients.
- ! Tech. Rep. ut-cs-03-502. Computer Science Department.
- ! University of Tennessee, Knoxville. 2003.
+ ! algorithm to solve the equation $Ax=b$. (See references in subroutine above.)
 !
- ! D Azevedo, E.F., V.L. Eijkhout, and C.H. Romine. LAPACK Working
- ! Note 56. Conjugate gradient algorithms with reduced
- ! synchronization overhead on distributed memory multiprocessors.
- ! Tech. Rep. CS-93-185. Computer Science Department.
- ! University of Tennessee, Knoxville. 1993.
- !---------------------------------------------------------------
+ ! It is similar to subroutine pcg_solver_chrongear_3d, but modified
+ ! to solve for x and y at a single horizontal level, as in the
+ ! shallow-shelf approximation. See the comments in that subroutine
+ ! (above) for more details on data structure and solver methods.
 !
- ! The input and output arrays are located on a structured (i,j,k) grid
- ! as defined in the glissade_velo_higher module.
- ! The global matrix is sparse, but its nonzero elements are stored in
- ! four dense matrices called Auu, Avv, Auv, and Avu.
- ! Each matrix has 3x3x3 = 27 potential nonzero elements per node (i,j,k).
+ ! Input and output arrays are located on a structured (i,j) grid
+ ! as defined in the glissade_velo_higher module. The global matrix
+ ! is sparse, but its nonzero elements are stored in four dense matrices
+ ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential
+ ! nonzero elements per node (i,j).
 !
- !
The current preconditioning options are + ! The current preconditioning options for the solver are ! (0) no preconditioning ! (1) diagonal preconditioning - ! (2) preconditioning using a physics-based SIA solver - ! - ! For the dome test case with higher-order dynamics, option (2) is best. - ! - ! Here is a schematic of the method implemented below for solving Ax = b: - ! - ! Set up preconditioner M - ! work0 = (b,b) - ! bb = global_sum(work0) - ! - ! First pass of algorithm: - ! halo_update(x) - ! r = b - A*x - ! halo_update(r) - ! solve Mz = r for z - ! work(1) = (r,z) - ! d = z - ! q = A*d - ! work(2) = (d,q) - ! halo_update(q) - ! rho_old = global_sum(work(1)) - ! sigma = global_sum(work(2)) - ! alpha = rho_old/sigma - ! x = x + alpha*d - ! r = r - alpha*q - ! - ! Iterative loop: - ! while (not converged) - ! solve Mz = r for z - ! Az = A*z - ! work(1) = (r,z) - ! work(2) = (Az,z) - ! halo_update(Az) - ! rho = global_sum(work(1)) - ! delta = global_sum(work(2)) - ! beta = rho/rho_old - ! sigma = delta - beta^2 * sigma - ! alpha = rho/sigma - ! rho_old = rho - ! d = z + beta*d - ! q = Az + beta*q - ! x = x + alpha*d - ! r = r - alpha*q - ! if (time to check convergence) then - ! r = b - A*x - ! work0 = (r,r) - ! halo_update(r) - ! rr = global_sum(work0) - ! if (sqrt(r,r)/sqrt(b,b) < tolerance) exit - ! endif - ! end while + ! (3) local tridiagonal preconditioning + ! (4) global tridiagonal preconditioning + ! The SIA-based preconditioning option is not available for a 2D solve. ! - ! where x = solution vector - ! d = conjugate direction vector - ! r = residual vector - ! M = preconditioning matrix - ! (r,z) = dot product of vectors r and z - ! and similarly for (Az,z), etc. - ! 
!--------------------------------------------------------------- !--------------------------------------------------------------- @@ -1658,9 +1579,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -1668,60 +1588,54 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 - integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & - indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 - logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & - Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) - ! other dimensions = (z,x,y) indices + real(dp), dimension(nx-1,ny-1,9), intent(in) :: & + Auu, Auv, & ! four components of assembled matrix + Avu, Avv ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) + ! 1st and 2nd dimensions = (x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! 
= 1 for diagonal preconditioning (best option for SSA-dominated flow) - ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, k, m ! grid indices - integer :: ii, jj - integer :: ilocal, jlocal ! number of locally owned vertices in each direction - integer :: iter ! iteration counter - integer :: maxiters_chrongear ! max number of linear iterations before quitting + integer :: i, j, ii, jj ! grid indices + integer :: m ! matrix element index + integer :: iter ! iteration counter real(dp) :: & alpha, &! rho/sigma = term in expression for new residual and solution @@ -1735,8 +1649,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & gsum ! result of global sum for dot products ! vectors (each of these is split into u and v components) - real(dp), dimension(nz,nx-1,ny-1) :: & - Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv + real(dp), dimension(nx-1,ny-1) :: & ru, rv, &! residual vector (b-Ax) du, dv, &! 
conjugate direction vector zu, zv, &! solution of Mz = r @@ -1744,7 +1657,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & Azu, Azv, &! result of matvec multiply A*z worku, workv ! intermediate results - real(dp), dimension(nz,nx-1,ny-1,2) :: & + real(dp), dimension(nx-1,ny-1,2) :: & work2u, work2v ! intermediate results real(dp) :: & @@ -1754,12 +1667,11 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & L2_rhs ! L2 norm of rhs vector = sqrt(b,b) ! solver is converged when L2_resid/L2_rhs < tolerance - real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & - Muu, Mvv ! simplified SIA matrices for preconditioning - - ! arrays for tridiagonal preconditioning - ! Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above + ! diagonal matrix elements + real(dp), dimension(nx-1,ny-1) :: & + Adiagu, Adiagv ! diagonal terms of matrices Auu and Avv + ! tridiagonal matrix elements real(dp), dimension(:,:), allocatable :: & Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning @@ -1770,18 +1682,17 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & xuh_u, xuh_v, & xlh_u, xlh_v - ! Note: These two matrices are global in the EW and NS dimensions, respectively. - ! Each holds 8 pieces of information for each task on each row or column. - ! Since only 2 of these 8 pieces of information change from one iteration to the next, - ! it is more efficient to gather the remaining information once and pass the arrays - ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d - ! and gather all the information every time the subroutine is called. - ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? + real(dp), dimension(:,:), allocatable :: & + b_u, b_v, x_u, x_v real(dp), dimension(:,:), allocatable :: & gather_data_row, & ! 
arrays for gathering data from every task on a row or column gather_data_col + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + + integer :: maxiters_chrongear ! max number of linear iterations before quitting + integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi @@ -1790,9 +1701,17 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & tasks_row, & ! number of tasks per row and column for tridiagonal solve tasks_col + integer, parameter :: & + maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, + ! which generally leads to faster convergence than diagonal preconditioning + + !WHL - debug + real(dp) :: usum, usum_global, vsum, vsum_global + !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max + real(dp) :: sum_temp staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi @@ -1802,22 +1721,17 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & tasks_row = parallel%tasks_row tasks_col = parallel%tasks_col - ! Note: maxiters_tridiag commented out here, because the BP tridiagonal solver - ! tends not to converge as well as the 2D version. - ! TODO: Make maxiters a config option. - ! Set the maximum number of linear iterations. ! Typically allow up to 200 iterations with diagonal preconditioning, but only 100 ! with tridiagonal, which usually converges faster. - !TODO - Test whether maxiters_tridiag (currently = 100) is sufficient for convergence with 3D solver -!! if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. precond == HO_PRECOND_TRIDIAG_GLOBAL) then -!! maxiters_chrongear = maxiters_tridiag -!! else + if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. precond == HO_PRECOND_TRIDIAG_GLOBAL) then + maxiters_chrongear = maxiters_tridiag + else maxiters_chrongear = maxiters -!! endif + endif - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. 
main_task) then write(iulog,*) 'Using native PCG solver (Chronopoulos-Gear)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters_chrongear, precond endif @@ -1832,14 +1746,14 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (precond == HO_PRECOND_NONE) then ! no preconditioner - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using no preconditioner' endif elseif (precond == HO_PRECOND_DIAG) then - call setup_preconditioner_diag_3d(nx, ny, & - nz, indxA_3d, & + call setup_preconditioner_diag_2d(nx, ny, & + indxA_2d, & Auu, Avv, & Adiagu, Adiagv) @@ -1848,32 +1762,22 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & i = itest j = jtest write(iulog,*) 'i, j, r =', i, j, this_rank - write(iulog,*) 'Auu diag =', Adiagu(:,i,j) - write(iulog,*) 'Avv diag =', Adiagv(:,i,j) + write(iulog,*) 'Au diag =', Adiagu(i,j) + write(iulog,*) 'Av diag =', Adiagv(i,j) endif - elseif (precond == HO_PRECOND_SIA) then - - call setup_preconditioner_sia_3d(nx, ny, & - nz, indxA_3d, & - Auu, Avv, & - Muu, Mvv) + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - if (verbose_pcg .and. this_rank == rtest) then + !WHL - debug + if (verbose_tridiag .and. this_rank==rtest) then + i = itest j = jtest write(iulog,*) ' ' - write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' - do i = staggered_ihi, staggered_ilo, -1 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) - enddo - enddo ! i + write(iulog,*) 'r, i, j =', this_rank, i, j + write(iulog,*) 'Auu =', Auu(i,j,:) + write(iulog,*) 'Avv =', Avv(i,j,:) endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - - ! Allocate tridiagonal preconditioning matrices allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) allocate(Asupdiag_u(nx-1,ny-1)) @@ -1886,26 +1790,22 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & allocate(omega_v (nx-1,ny-1)) allocate(denom_v (nx-1,ny-1)) - ! 
Compute arrays for tridiagonal preconditioning
-
-       call setup_preconditioner_tridiag_local_3d(&
-            nx,                ny,            &
-            nz,                parallel,      &
-            active_vertex,                    &
-            indxA_2d,          indxA_3d,      &
-            itest, jtest,      rtest,         &
-            Auu,               Avv,           &
-            Muu,               Mvv,           &
-            Adiag_u,           Adiag_v,       &
-            Asubdiag_u,        Asubdiag_v,    &
-            Asupdiag_u,        Asupdiag_v,    &
-            omega_u,           omega_v,       &
+       call setup_preconditioner_tridiag_local_2d(&
+            nx,             ny,            &
+            parallel,       indxA_2d,      &
+            itest, jtest,   rtest,         &
+            Auu,            Avv,           &
+            Adiag_u,        Adiag_v,       &
+            Asubdiag_u,     Asubdiag_v,    &
+            Asupdiag_u,     Asupdiag_v,    &
+            omega_u,        omega_v,       &
             denom_u, denom_v)
-
+
    elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then
 
       ! Allocate tridiagonal preconditioning matrices
-      ! Note: (i,j) indices are switched for the A_v matrices to reduce striding.
+      ! Note: (i,j) indices are switched for the A_v matrices to reduce striding.
+
       allocate(Adiag_u   (ilocal,jlocal))
       allocate(Asubdiag_u(ilocal,jlocal))
       allocate(Asupdiag_u(ilocal,jlocal))
@@ -1913,6 +1813,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, &
       allocate(denom_u(ilocal,jlocal))
       allocate(xuh_u(ilocal,jlocal))
       allocate(xlh_u(ilocal,jlocal))
+      allocate(b_u(ilocal,jlocal))
+      allocate(x_u(ilocal,jlocal))
 
       allocate(Adiag_v   (jlocal,ilocal))
       allocate(Asubdiag_v(jlocal,ilocal))
@@ -1921,6 +1823,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, &
       allocate(denom_v(jlocal,ilocal))
       allocate(xuh_v(jlocal,ilocal))
       allocate(xlh_v(jlocal,ilocal))
+      allocate(b_v(jlocal,ilocal))
+      allocate(x_v(jlocal,ilocal))
 
       ! These two matrices are for gathering data from all tasks on a given row or column.
       allocate(gather_data_row(8*tasks_row,jlocal))
@@ -1928,53 +1832,87 @@ subroutine pcg_solver_chrongear_3d(nx, ny, &
       gather_data_row = 0.0d0
       gather_data_col = 0.0d0
 
-      ! Compute arrays for tridiagonal preconditioning
+      ! 
Compute the entries of the tridiagonal matrices - call setup_preconditioner_tridiag_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - indxA_2d, indxA_3d, & - ilocal, jlocal, & - itest, jtest, rtest, & - Auu, Avv, & - Muu, Mvv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v) + ! Extract tridiagonal matrix entries from Auu + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements + Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements + Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the u solve in each matrix row + call setup_preconditioner_tridiag_global_2d(& + ilocal, jlocal, & +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u) + + ! Extract tridiagonal matrix entries from Avv + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements + Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements + Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the v solve in each matrix column + ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + + call setup_preconditioner_tridiag_global_2d(& + jlocal, ilocal, & +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! 
itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v) endif ! precond + !WHL - debug + if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' + call t_stopf("pcg_precond_init") !---- Initialize scalars and vectors niters = maxiters_chrongear - ru(:,:,:) = 0.d0 - rv(:,:,:) = 0.d0 - du(:,:,:) = 0.d0 - dv(:,:,:) = 0.d0 - zu(:,:,:) = 0.d0 - zv(:,:,:) = 0.d0 - qu(:,:,:) = 0.d0 - qv(:,:,:) = 0.d0 - Azu(:,:,:) = 0.d0 - Azv(:,:,:) = 0.d0 - worku(:,:,:) = 0.d0 - workv(:,:,:) = 0.d0 - work2u(:,:,:,:) = 0.d0 - work2v(:,:,:,:) = 0.d0 + ru(:,:) = 0.d0 + rv(:,:) = 0.d0 + du(:,:) = 0.d0 + dv(:,:) = 0.d0 + zu(:,:) = 0.d0 + zv(:,:) = 0.d0 + qu(:,:) = 0.d0 + qv(:,:) = 0.d0 + Azu(:,:) = 0.d0 + Azv(:,:) = 0.d0 + worku(:,:) = 0.d0 + workv(:,:) = 0.d0 + work2u(:,:,:) = 0.d0 + work2v(:,:,:) = 0.d0 !---- Compute the L2 norm of the RHS vectors !---- (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - worku(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) - workv(:,:,:) = bv(:,:,:)*bv(:,:,:) + worku(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) + workv(:,:) = bv(:,:)*bv(:,:) call t_stopf("pcg_dotprod") ! find global sum of the squared L2 norm @@ -1991,8 +1929,6 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! First pass of algorithm !--------------------------------------------------------------- - iter = 1 - ! Note: The matrix A must be complete for all rows corresponding to locally ! owned nodes, and x must have the correct values in ! halo nodes bordering the locally owned nodes. 
@@ -2008,9 +1944,9 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_init") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -2021,8 +1957,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- This is correct for locally owned nodes. call t_startf("pcg_vecupdate") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") !---- Halo update for residual @@ -2038,81 +1974,169 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! From here on, call timers with 'iter' suffix because this can be considered the first iteration call t_startf("pcg_precond_iter") - if (precond == HO_PRECOND_NONE) then ! no preconditioning + if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:,:) = ru(:,:,:) ! PC(r) = r - zv(:,:,:) = rv(:,:,:) ! PC(r) = r + zu(:,:) = ru(:,:) ! PC(r) = r + zv(:,:) = rv(:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + ! Solve Mz = r, where M is a diagonal matrix do j = 1, ny-1 do i = 1, nx-1 - do k = 1, nz - if (Adiagu(k,i,j) /= 0.d0) then - zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A + if (Adiagu(i,j) /= 0.d0) then + zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(k,i,j) = 0.d0 + zu(i,j) = 0.d0 endif - if (Adiagv(k,i,j) /= 0.d0) then - zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + if (Adiagv(i,j) /= 0.d0) then + zv(i,j) = rv(i,j) / Adiagv(i,j) else - zv(k,i,j) = 0.d0 + zv(i,j) = 0.d0 endif - enddo ! k enddo ! i enddo ! j - elseif (precond == HO_PRECOND_SIA) then ! 
local vertical shallow-ice solver for preconditioning + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then +! i = itest +! write(iulog,*) ' ' +! write(iulog,*) 'zv solve with diagonal precond, this_rank, i =', this_rank, i +! write(iulog,*) 'j, active, Adiagv, rv, zv, xv:' +! do j = staggered_jhi, staggered_jlo, -1 +! write(iulog,'(i4, l4, 2f12.3, e12.3, f12.3)') j, active_vertex(i,j), Adiagv(i,j), rv(i,j), zv(i,j), xv(i,j) +! enddo + endif - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Muu, ru, zu) ! solve Muu*zu = ru for zu + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Mvv, rv, zv) ! solve Mvv*zv = rv for zv + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest + write(iulog,*) 'Residual:' + write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) + write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'call tridiag_solver_local_2d' + endif - ! Use a local tridiagonal solver to find an approximate solution of A*z = r + ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - call tridiag_solver_local_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - Muu, Mvv, & ! entries of SIA matrix - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + !TODO - Test a local solver that can compute zu and zv in the halo + ! 
(to avoid the halo update below) - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + call tridiag_solver_local_2d(nx, ny, & + parallel, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + ru, rv, & ! right hand side + zu, zv) ! solution - ! Use a global tridiagonal solver to find an approximate solution of A*z = r + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif - call tridiag_solver_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - ilocal, jlocal, & - tasks_row, tasks_col, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v, & - Muu, Mvv, & ! entries of SIA matrix - gather_data_row, gather_data_col, & - .true., & ! first_time = T (first iteration) - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + + ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo + + ! Solve M*z = r, where M is a global tridiagonal matrix + + call tridiag_solver_global_2d(ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! 
tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + .true., & ! first_time + gather_data_row) + + ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo + + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo + + call tridiag_solver_global_2d(jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + .true., & ! first_time + gather_data_col) + + ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) endif ! 
precond @@ -2121,23 +2145,23 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute intermediate result for dot product (r,z) call t_startf("pcg_dotprod") - work2u(:,:,:,1) = ru(:,:,:) * zu(:,:,:) - work2v(:,:,:,1) = rv(:,:,:) * zv(:,:,:) + work2u(:,:,1) = ru(:,:) * zu(:,:) + work2v(:,:,1) = rv(:,:) * zv(:,:) call t_stopf("pcg_dotprod") !---- Compute the conjugate direction vector d !---- Since z is correct in halo, so is d - du(:,:,:) = zu(:,:,:) - dv(:,:,:) = zv(:,:,:) + du(:,:) = zu(:,:) + dv(:,:) = zv(:,:) !---- Compute q = A*d - !---- q is correct for locally owned nodes + !---- q is correct for locally owned nodes, provided d extends one layer into the halo call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -2147,14 +2171,14 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute intermediate result for dot product (d,q) = (d,Ad) call t_startf("pcg_dotprod") - work2u(:,:,:,2) = du(:,:,:) * qu(:,:,:) - work2v(:,:,:,2) = dv(:,:,:) * qv(:,:,:) + work2u(:,:,2) = du(:,:) * qu(:,:) + work2v(:,:,2) = dv(:,:) * qv(:,:) call t_stopf("pcg_dotprod") !---- Find global sums of (r,z) and (d,q) call t_startf("pcg_glbsum_iter") - gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! nflds = 2 call t_stopf("pcg_glbsum_iter") !---- Halo update for q @@ -2178,112 +2202,214 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- These are correct in halo call t_startf("pcg_vecupdate") - xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) - xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) + xu(:,:) = xu(:,:) + alpha*du(:,:) + xv(:,:) = xv(:,:) + alpha*dv(:,:) - ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) ! q = A*d - rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) + ru(:,:) = ru(:,:) - alpha*qu(:,:) ! 
q = A*d + rv(:,:) = rv(:,:) - alpha*qv(:,:) call t_stopf("pcg_vecupdate") + !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest - write(iulog,*) ' ' - write(iulog,*) 'iter =', iter - write(iulog,*) 'i, k, xu, xv, ru, rv:' +!! write(iulog,*) ' ' +!! write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) - enddo +!! write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) enddo ! i endif + !--------------------------------------------------------------- ! Iterate to solution !--------------------------------------------------------------- iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above + if (verbose_pcg .and. main_task) then +! write(iulog,*) 'iter =', iter + endif + !---- Compute PC(r) = solution z of Mz = r !---- z is correct in halo call t_startf("pcg_precond_iter") - if (precond == HO_PRECOND_NONE) then ! no preconditioning + if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:,:) = ru(:,:,:) ! PC(r) = r - zv(:,:,:) = rv(:,:,:) ! PC(r) = r + zu(:,:) = ru(:,:) ! PC(r) = r + zv(:,:) = rv(:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning do j = 1, ny-1 do i = 1, nx-1 - do k = 1, nz - if (Adiagu(k,i,j) /= 0.d0) then - zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A + if (Adiagu(i,j) /= 0.d0) then + zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(k,i,j) = 0.d0 + zu(i,j) = 0.d0 endif - if (Adiagv(k,i,j) /= 0.d0) then - zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + if (Adiagv(i,j) /= 0.d0) then + zv(i,j) = rv(i,j) / Adiagv(i,j) else - zv(k,i,j) = 0.d0 + zv(i,j) = 0.d0 endif - enddo ! k enddo ! i enddo ! j - elseif (precond == HO_PRECOND_SIA) then ! 
local vertical shallow-ice solver for preconditioning + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! tridiagonal preconditioning with local solve - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Muu, ru, zu) ! solve Muu*zu = ru for zu + ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Mvv, rv, zv) ! solve Mvv*zv = rv for zv + !TODO - Test a local solver that can compute zu and zv in the halo + ! (to avoid the halo update below) - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + !WHL - debug + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest +! write(iulog,*) 'Residual:' +! write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) +! write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) +! write(iulog,*) ' ' +! write(iulog,*) 'jtest =', jtest +! write(iulog,*) 'i, ru, rv:' +! do i = staggered_ihi, staggered_ilo, -1 +! write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) +! enddo + endif - ! Use a local tridiagonal solver to find an approximate solution of A*z = r - call tridiag_solver_local_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - Muu, Mvv, & ! entries of SIA matrix - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + write(iulog,*) ' call tridiag_solver_local_2d' + endif + + call tridiag_solver_local_2d(nx, ny, & + parallel, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + ru, rv, & ! right hand side + zu, zv) ! 
solution + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - ! Use a global tridiagonal solver to find an approximate solution of A*z = r + !WHL - debug + if (verbose_tridiag .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif - call tridiag_solver_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - ilocal, jlocal, & - tasks_row, tasks_col, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v, & - Muu, Mvv, & ! entries of SIA matrix - gather_data_row, gather_data_col, & - .false., & ! first_time = F (iteration 2+) - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo + + !WHL - debug + if (verbose_pcg .and. 
this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'Before global tridiag PC u solve, r, j =', rtest, jtest + write(iulog,*) ' ' + write(iulog,*) 'i, Adiag_u, Asubdiag_u, Asupdiag_u, b_u:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4e16.8)') i, Adiag_u(i,j), Asubdiag_u(i,j), Asupdiag_u(i,j), b_u(i,j) + enddo + endif + + call tridiag_solver_global_2d(ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + .false., & ! first_time + gather_data_row) + + ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo + + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo + + call tridiag_solver_global_2d(jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + .false., & ! first_time + gather_data_col) + + ! 
convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) endif ! precond @@ -2291,27 +2417,26 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute Az = A*z !---- This is the one matvec multiply required per iteration - !---- Az is correct for local owned nodes and needs a halo update (below) + !---- Az is correct for locally owned nodes and needs a halo update (below) call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & zu, zv, & Azu, Azv) call t_stopf("pcg_matmult_iter") - !---- Compute intermediate results for the dot products (r,z) and (Az,z) call t_startf("pcg_dotprod") - work2u(:,:,:,1) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r,z) - work2v(:,:,:,1) = rv(:,:,:)*zv(:,:,:) + work2u(:,:,1) = ru(:,:)*zu(:,:) ! terms of dot product (r,z) + work2v(:,:,1) = rv(:,:)*zv(:,:) - work2u(:,:,:,2) = Azu(:,:,:)*zu(:,:,:) ! terms of dot product (A*z,z) - work2v(:,:,:,2) = Azv(:,:,:)*zv(:,:,:) + work2u(:,:,2) = Azu(:,:)*zu(:,:) ! terms of dot product (A*z,z) + work2v(:,:,2) = Azv(:,:)*zv(:,:) call t_stopf("pcg_dotprod") ! Take the global sums of (r,z) and (Az,z) @@ -2319,11 +2444,10 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! this is the one MPI global reduction per iteration. call t_startf("pcg_glbsum_iter") - gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! 
nflds = 2
      call t_stopf("pcg_glbsum_iter")
 
      !---- Halo update for Az
-     !---- This is the one halo update required per iteration
 
      call t_startf("pcg_halo_iter")
      call staggered_parallel_halo(Azu, parallel)
@@ -2342,7 +2466,7 @@
 
      if (alpha /= alpha) then  ! alpha is NaN
!!       write(iulog,*) 'rho, sigma, alpha:', rho, sigma, alpha
-        call write_log('Chron-Gear PCG solver has failed, alpha = NaN', GM_FATAL)
+        call write_log('Chron-Gear PCG solver has failed, alpha = NaN', GM_FATAL)
      endif
 
      !---- Update d and q
@@ -2350,38 +2474,25 @@
 
      call t_startf("pcg_vecupdate")
 
-        du(:,:,:) = zu(:,:,:) + beta*du(:,:,:)   ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i
-        dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:)   !
-                                                 !                     (r_(i+1), PC(r_(i+1)))
-                                                 ! where beta_(i+1) =  --------------------
-                                                 !                        (r_i, PC(r_i))
-        qu(:,:,:) = Azu(:,:,:) + beta*qu(:,:,:)
-        qv(:,:,:) = Azv(:,:,:) + beta*qv(:,:,:)
+        du(:,:) = zu(:,:) + beta*du(:,:)   ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i
+        dv(:,:) = zv(:,:) + beta*dv(:,:)   !
+                                           !                     (r_(i+1), PC(r_(i+1)))
+                                           ! where beta_(i+1) =  --------------------
+                                           !                        (r_i, PC(r_i))
+        qu(:,:) = Azu(:,:) + beta*qu(:,:)
+        qv(:,:) = Azv(:,:) + beta*qv(:,:)
 
      !---- Update solution and residual
      !---- These are correct in halo
 
-        xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:)
-        xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:)
+        xu(:,:) = xu(:,:) + alpha*du(:,:)
+        xv(:,:) = xv(:,:) + alpha*dv(:,:)
 
-        ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:)
-        rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:)
+        ru(:,:) = ru(:,:) - alpha*qu(:,:)
+        rv(:,:) = rv(:,:) - alpha*qv(:,:)
 
      call t_stopf("pcg_vecupdate")
 
-     if (verbose_pcg .and. this_rank == rtest) then
-        j = jtest
-        write(iulog,*) ' '
-        write(iulog,*) 'iter =', iter
-        write(iulog,*) 'i, k, xu, xv, ru, rv:'
-        do i = itest-3, itest+3
-           write(iulog,*) ' '
-           do k = 1, nz
-              write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j)
-           enddo
-        enddo  ! i
-     endif
-
      ! 
Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -2392,15 +2503,19 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (verbose_pcg .and. main_task) then write(iulog,*) ' ' - write(iulog,*) 'Check convergence, iter =', iter + write(iulog,*) ' check convergence, iter =', iter endif !---- Compute z = A*x (use z as a temp vector for A*x) + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(xu, parallel) +!! call staggered_parallel_halo(xv, parallel) + call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -2409,27 +2524,35 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute residual r = b - A*x + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(bu, parallel) +!! call staggered_parallel_halo(bv, parallel) + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(zu, parallel) +!! call staggered_parallel_halo(zv, parallel) + call t_startf("pcg_vecupdate") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(ru, parallel) +!! 
call staggered_parallel_halo(rv, parallel) + !---- Compute dot product (r, r) call t_startf("pcg_dotprod") - worku(:,:,:) = ru(:,:,:)*ru(:,:,:) - workv(:,:,:) = rv(:,:,:)*rv(:,:,:) + worku(:,:) = ru(:,:)*ru(:,:) + workv(:,:) = rv(:,:)*rv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") - ! take square root - L2_resid = sqrt(rr) - - ! compute normalized error - err = L2_resid/L2_rhs + L2_resid = sqrt(rr) ! L2 norm of residual + err = L2_resid/L2_rhs ! normalized error if (verbose_pcg .and. main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err @@ -2443,13 +2566,13 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (sum(abs(ru(:,i,j))) > ru_max) then - ru_max = sum(abs(ru(:,i,j))) + if (abs(ru(i,j)) > ru_max) then + ru_max = ru(i,j) iu_max = i ju_max = j endif - if (sum(abs(rv(:,i,j))) > rv_max) then - rv_max = sum(abs(rv(:,i,j))) + if (abs(rv(i,j)) > rv_max) then + rv_max = rv(i,j) iv_max = i jv_max = j endif @@ -2497,16 +2620,19 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(b_u)) deallocate(b_u, b_v) + if (allocated(x_u)) deallocate(x_u, x_v) if (allocated(gather_data_row)) deallocate(gather_data_row) if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_chrongear_3d + end subroutine pcg_solver_chrongear_2d !**************************************************************************** - - subroutine pcg_solver_chrongear_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + + subroutine pcg_solver_chrongear_3d(nx, ny, & + nz, parallel, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -2518,26 +2644,104 @@ 
subroutine pcg_solver_chrongear_2d(nx, ny, & !--------------------------------------------------------------- ! This subroutine uses a Chronopoulos-Gear preconditioned conjugate-gradient - ! algorithm to solve the equation $Ax=b$. (See references in subroutine above.) + ! algorithm to solve the equation $Ax=b$. ! - ! It is similar to subroutine pcg_solver_chrongear_3d, but modified - ! to solve for x and y at a single horizontal level, as in the - ! shallow-shelf approximation. See the comments in that subroutine - ! (above) for more details on data structure and solver methods. + ! It is based on the Chronopoulos-Gear PCG solver in the POP ocean model + ! (author Frank Bryan, NCAR). It is a rearranged conjugate gradient solver + ! that reduces the number of global reductions per iteration from two to one + ! (not counting the convergence check). Convergence is checked every + ! {\em linear_solve_ncheck} steps. ! - ! Input and output arrays are located on a structured (i,j) grid - ! as defined in the glissade_velo_higher module. The global matrix - ! is sparse, but its nonzero element are stored in four dense matrices - ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential - ! nonzero elements per node (i,j). + ! References are: ! - ! The current preconditioning options for the solver are + ! Chronopoulos, A.T., A Class of Parallel Iterative Methods Implemented on Multiprocessors, + ! Ph.D. thesis, Technical Report UIUCDCS-R-86-1267, Department of Computer Science, + ! University of Illinois, Urbana, Illinois, pp. 1-116, 1986. + ! + ! Chronopoulos, A.T., and C.W. Gear. s-step iterative methods + ! for symmetric linear systems. J. Comput. Appl. Math., 25(2), + ! 153-168, 1989. + ! + ! Dongarra, J. and V. Eijkhout. LAPACK Working Note 159. + ! Finite-choice algorithm optimization in conjugate gradients. + ! Tech. Rep. ut-cs-03-502. Computer Science Department. + ! University of Tennessee, Knoxville. 2003. + ! + ! D Azevedo, E.F., V.L. Eijkhout, and C.H. 
Romine. LAPACK Working + ! Note 56. Conjugate gradient algorithms with reduced + ! synchronization overhead on distributed memory multiprocessors. + ! Tech. Rep. CS-93-185. Computer Science Department. + ! University of Tennessee, Knoxville. 1993. + !--------------------------------------------------------------- + ! + ! The input and output arrays are located on a structured (i,j,k) grid + ! as defined in the glissade_velo_higher module. + ! The global matrix is sparse, but its nonzero elements are stored in + ! four dense matrices called Auu, Avv, Auv, and Avu. + ! Each matrix has 3x3x3 = 27 potential nonzero elements per node (i,j,k). + ! + ! The current preconditioning options are ! (0) no preconditioning ! (1) diagonal preconditioning - ! (3) local tridiagonal preconditioning - ! (4) global tridiagonal preconditioning - ! The SIA-based preconditioning option is not available for a 2D solve. + ! (2) preconditioning using a physics-based SIA solver + ! + ! For the dome test case with higher-order dynamics, option (2) is best. + ! + ! Here is a schematic of the method implemented below for solving Ax = b: + ! + ! Set up preconditioner M + ! work0 = (b,b) + ! bb = global_sum(work0) + ! + ! First pass of algorithm: + ! halo_update(x) + ! r = b - A*x + ! halo_update(r) + ! solve Mz = r for z + ! work(1) = (r,z) + ! d = z + ! q = A*d + ! work(2) = (d,q) + ! halo_update(q) + ! rho_old = global_sum(work(1)) + ! sigma = global_sum(work(2)) + ! alpha = rho_old/sigma + ! x = x + alpha*d + ! r = r - alpha*q + ! + ! Iterative loop: + ! while (not converged) + ! solve Mz = r for z + ! Az = A*z + ! work(1) = (r,z) + ! work(2) = (Az,z) + ! halo_update(Az) + ! rho = global_sum(work(1)) + ! delta = global_sum(work(2)) + ! beta = rho/rho_old + ! sigma = delta - beta^2 * sigma + ! alpha = rho/sigma + ! rho_old = rho + ! d = z + beta*d + ! q = Az + beta*q + ! x = x + alpha*d + ! r = r - alpha*q + ! if (time to check convergence) then + ! r = b - A*x + ! work0 = (r,r) + ! 
halo_update(r) + ! rr = global_sum(work0) + ! if (sqrt(r,r)/sqrt(b,b) < tolerance) exit + ! endif + ! end while ! + ! where x = solution vector + ! d = conjugate direction vector + ! r = residual vector + ! M = preconditioning matrix + ! (r,z) = dot product of vectors r and z + ! and similarly for (Az,z), etc. + ! !--------------------------------------------------------------- !--------------------------------------------------------------- @@ -2545,8 +2749,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) + nx, ny, & ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) + nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -2554,54 +2759,60 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 + integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & + indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 + logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nx-1,ny-1,9), intent(in) :: & - Auu, Auv, & ! four components of assembled matrix - Avu, Avv ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) - ! 1st and 2nd dimensions = (x,y) indices + real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & + Auu, Auv, Avu, Avv ! four components of assembled matrix + ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) + ! other dimensions = (z,x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! 
assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! = 1 for diagonal preconditioning (best option for SSA-dominated flow) + ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, ii, jj ! grid indices - integer :: m ! matrix element index - integer :: iter ! iteration counter + integer :: i, j, k, m ! grid indices + integer :: ii, jj + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + integer :: iter ! iteration counter + integer :: maxiters_chrongear ! max number of linear iterations before quitting real(dp) :: & alpha, &! rho/sigma = term in expression for new residual and solution @@ -2615,7 +2826,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & gsum ! 
result of global sum for dot products ! vectors (each of these is split into u and v components) - real(dp), dimension(nx-1,ny-1) :: & + real(dp), dimension(nz,nx-1,ny-1) :: & + Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector zu, zv, &! solution of Mz = r @@ -2623,7 +2835,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & Azu, Azv, &! result of matvec multiply A*z worku, workv ! intermediate results - real(dp), dimension(nx-1,ny-1,2) :: & + real(dp), dimension(nz,nx-1,ny-1,2) :: & work2u, work2v ! intermediate results real(dp) :: & @@ -2633,11 +2845,12 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector = sqrt(b,b) ! solver is converged when L2_resid/L2_rhs < tolerance - ! diagonal matrix elements - real(dp), dimension(nx-1,ny-1) :: & - Adiagu, Adiagv ! diagonal terms of matrices Auu and Avv + real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & + Muu, Mvv ! simplified SIA matrices for preconditioning + + ! arrays for tridiagonal preconditioning + ! Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above - ! tridiagonal matrix elements real(dp), dimension(:,:), allocatable :: & Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning @@ -2648,17 +2861,18 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & xuh_u, xuh_v, & xlh_u, xlh_v - real(dp), dimension(:,:), allocatable :: & - b_u, b_v, x_u, x_v + ! Note: These two matrices are global in the EW and NS dimensions, respectively. + ! Each holds 8 pieces of information for each task on each row or column. + ! Since only 2 of these 8 pieces of information change from one iteration to the next, + ! it is more efficient to gather the remaining information once and pass the arrays + ! 
with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d + ! and gather all the information every time the subroutine is called. + ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? real(dp), dimension(:,:), allocatable :: & gather_data_row, & ! arrays for gathering data from every task on a row or column gather_data_col - integer :: ilocal, jlocal ! number of locally owned vertices in each direction - - integer :: maxiters_chrongear ! max number of linear iterations before quitting - integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi @@ -2667,17 +2881,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & tasks_row, & ! number of tasks per row and column for tridiagonal solve tasks_col - integer, parameter :: & - maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, - ! which generally leads to faster convergence than diagonal preconditioning - - !WHL - debug - real(dp) :: usum, usum_global, vsum, vsum_global - !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max - real(dp) :: sum_temp staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi @@ -2687,17 +2893,22 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & tasks_row = parallel%tasks_row tasks_col = parallel%tasks_col + ! Note: maxiters_tridiag commented out here, because the BP tridiagonal solver + ! tends not to converge as well as the 2D version. + ! TODO: Make maxiters a config option. + ! Set the maximum number of linear iterations. ! Typically allow up to 200 iterations with diagonal preconditioning, but only 100 ! with tridiagonal, which usually converges faster. - if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. 
precond == HO_PRECOND_TRIDIAG_GLOBAL) then - maxiters_chrongear = maxiters_tridiag - else + !TODO - Test whether maxiters_tridiag (currently = 100) is sufficient for convergence with 3D solver +!! if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. precond == HO_PRECOND_TRIDIAG_GLOBAL) then +!! maxiters_chrongear = maxiters_tridiag +!! else maxiters_chrongear = maxiters - endif +!! endif - if (verbose_pcg .and. main_task) then + if (verbose_pcg .and. this_rank == rtest) then write(iulog,*) 'Using native PCG solver (Chronopoulos-Gear)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters_chrongear, precond endif @@ -2712,14 +2923,14 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (precond == HO_PRECOND_NONE) then ! no preconditioner - if (verbose_pcg .and. main_task) then + if (verbose_pcg .and. this_rank == rtest) then write(iulog,*) 'Using no preconditioner' endif elseif (precond == HO_PRECOND_DIAG) then - call setup_preconditioner_diag_2d(nx, ny, & - indxA_2d, & + call setup_preconditioner_diag_3d(nx, ny, & + nz, indxA_3d, & Auu, Avv, & Adiagu, Adiagv) @@ -2728,22 +2939,32 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & i = itest j = jtest write(iulog,*) 'i, j, r =', i, j, this_rank - write(iulog,*) 'Au diag =', Adiagu(i,j) - write(iulog,*) 'Av diag =', Adiagv(i,j) + write(iulog,*) 'Auu diag =', Adiagu(:,i,j) + write(iulog,*) 'Avv diag =', Adiagv(:,i,j) endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + elseif (precond == HO_PRECOND_SIA) then - !WHL - debug - if (verbose_tridiag .and. this_rank==rtest) then - i = itest + call setup_preconditioner_sia_3d(nx, ny, & + nz, indxA_3d, & + Auu, Avv, & + Muu, Mvv) + + if (verbose_pcg .and. 
this_rank == rtest) then j = jtest write(iulog,*) ' ' - write(iulog,*) 'r, i, j =', this_rank, i, j - write(iulog,*) 'Auu =', Auu(i,j,:) - write(iulog,*) 'Avv =', Avv(i,j,:) + write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' + do i = staggered_ihi, staggered_ilo, -1 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) + enddo + enddo ! i endif + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + ! Allocate tridiagonal preconditioning matrices allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) allocate(Asupdiag_u(nx-1,ny-1)) @@ -2756,22 +2977,26 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(omega_v (nx-1,ny-1)) allocate(denom_v (nx-1,ny-1)) - call setup_preconditioner_tridiag_local_2d(& - nx, ny, & - parallel, indxA_2d, & - itest, jtest, rtest, & - Auu, Avv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v) + ! Compute arrays for tridiagonal preconditioning + call setup_preconditioner_tridiag_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v) + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! Allocate tridiagonal preconditioning matrices - ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. - + ! Note: (i,j) indices are switched for the A_v matrices to reduce striding. 
allocate(Adiag_u (ilocal,jlocal)) allocate(Asubdiag_u(ilocal,jlocal)) allocate(Asupdiag_u(ilocal,jlocal)) @@ -2779,8 +3004,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(denom_u(ilocal,jlocal)) allocate(xuh_u(ilocal,jlocal)) allocate(xlh_u(ilocal,jlocal)) - allocate(b_u(ilocal,jlocal)) - allocate(x_u(ilocal,jlocal)) allocate(Adiag_v (jlocal,ilocal)) allocate(Asubdiag_v(jlocal,ilocal)) @@ -2789,8 +3012,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(denom_v(jlocal,ilocal)) allocate(xuh_v(jlocal,ilocal)) allocate(xlh_v(jlocal,ilocal)) - allocate(b_v(jlocal,ilocal)) - allocate(x_v(jlocal,ilocal)) ! These two matrices are for gathering data from all tasks on a given row or column. allocate(gather_data_row(8*tasks_row,jlocal)) @@ -2798,87 +3019,53 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & gather_data_row = 0.0d0 gather_data_col = 0.0d0 - ! Compute the entries of the tridiagonal matrices - - ! Extract tridiagonal matrix entries from Auu - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements - Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements - Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements - enddo - enddo - - ! compute work arrays for the u solve in each matrix row - call setup_preconditioner_tridiag_global_2d(& - ilocal, jlocal, & -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u) - - ! Extract tridiagonal matrix entries from Avv - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements - Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! 
diagonal elements - Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements - enddo - enddo - - ! compute work arrays for the v solve in each matrix column - ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + ! Compute arrays for tridiagonal preconditioning - call setup_preconditioner_tridiag_global_2d(& - jlocal, ilocal, & -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v) + call setup_preconditioner_tridiag_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + ilocal, jlocal, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v) endif ! precond - !WHL - debug - if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' - call t_stopf("pcg_precond_init") !---- Initialize scalars and vectors niters = maxiters_chrongear - ru(:,:) = 0.d0 - rv(:,:) = 0.d0 - du(:,:) = 0.d0 - dv(:,:) = 0.d0 - zu(:,:) = 0.d0 - zv(:,:) = 0.d0 - qu(:,:) = 0.d0 - qv(:,:) = 0.d0 - Azu(:,:) = 0.d0 - Azv(:,:) = 0.d0 - worku(:,:) = 0.d0 - workv(:,:) = 0.d0 - work2u(:,:,:) = 0.d0 - work2v(:,:,:) = 0.d0 + ru(:,:,:) = 0.d0 + rv(:,:,:) = 0.d0 + du(:,:,:) = 0.d0 + dv(:,:,:) = 0.d0 + zu(:,:,:) = 0.d0 + zv(:,:,:) = 0.d0 + qu(:,:,:) = 0.d0 + qv(:,:,:) = 0.d0 + Azu(:,:,:) = 0.d0 + Azv(:,:,:) = 0.d0 + worku(:,:,:) = 0.d0 + workv(:,:,:) = 0.d0 + work2u(:,:,:,:) = 0.d0 + work2v(:,:,:,:) = 0.d0 !---- Compute the L2 norm of the RHS vectors !---- (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - worku(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) - workv(:,:) = bv(:,:)*bv(:,:) + worku(:,:,:) = bu(:,:,:)*bu(:,:,:) ! 
terms of dot product (b, b) + workv(:,:,:) = bv(:,:,:)*bv(:,:,:) call t_stopf("pcg_dotprod") ! find global sum of the squared L2 norm @@ -2895,6 +3082,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! First pass of algorithm !--------------------------------------------------------------- + iter = 1 + ! Note: The matrix A must be complete for all rows corresponding to locally ! owned nodes, and x must have the correct values in ! halo nodes bordering the locally owned nodes. @@ -2910,9 +3099,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_init") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -2923,8 +3112,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- This is correct for locally owned nodes. call t_startf("pcg_vecupdate") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate") !---- Halo update for residual @@ -2940,169 +3129,81 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! From here on, call timers with 'iter' suffix because this can be considered the first iteration call t_startf("pcg_precond_iter") - if (precond == HO_PRECOND_NONE) then ! no preconditioning + if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:) = ru(:,:) ! PC(r) = r - zv(:,:) = rv(:,:) ! PC(r) = r + zu(:,:,:) = ru(:,:,:) ! PC(r) = r + zv(:,:,:) = rv(:,:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning - ! Solve Mz = r, where M is a diagonal matrix do j = 1, ny-1 do i = 1, nx-1 - if (Adiagu(i,j) /= 0.d0) then - zu(i,j) = ru(i,j) / Adiagu(i,j) ! 
PC(r), where PC is formed from diagonal elements of A + do k = 1, nz + if (Adiagu(k,i,j) /= 0.d0) then + zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(i,j) = 0.d0 + zu(k,i,j) = 0.d0 endif - if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) + if (Adiagv(k,i,j) /= 0.d0) then + zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) else - zv(i,j) = 0.d0 + zv(k,i,j) = 0.d0 endif + enddo ! k enddo ! i enddo ! j - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then -! i = itest -! write(iulog,*) ' ' -! write(iulog,*) 'zv solve with diagonal precond, this_rank, i =', this_rank, i -! write(iulog,*) 'j, active, Adiagv, rv, zv, xv:' -! do j = staggered_jhi, staggered_jlo, -1 -! write(iulog,'(i4, l4, 2f12.3, e12.3, f12.3)') j, active_vertex(i,j), Adiagv(i,j), rv(i,j), zv(i,j), xv(i,j) -! enddo - endif - - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local - - if (verbose_tridiag .and. this_rank == rtest) then - i = itest - j = jtest - write(iulog,*) 'Residual:' - write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) - write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) - write(iulog,*) ' ' - write(iulog,*) 'jtest =', jtest - write(iulog,*) 'i, ru, rv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) - enddo - endif - - if (verbose_pcg .and. main_task) then - write(iulog,*) 'call tridiag_solver_local_2d' - endif - - ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - - !TODO - Test a local solver that can compute zu and zv in the halo - ! (to avoid the halo update below) - - call tridiag_solver_local_2d(nx, ny, & - parallel, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - ru, rv, & ! right hand side - zu, zv) ! solution - - !WHL - debug - if (verbose_pcg .and. 
this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) - enddo - endif - - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) - - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - - ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - b_u(i,j) = ru(ii,jj) - enddo - enddo - - ! Solve M*z = r, where M is a global tridiagonal matrix + elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning - call tridiag_solver_global_2d(ilocal, jlocal, & - parallel, tasks_row, & - 'row', & ! tridiagonal solve for each row -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u, & - b_u, x_u, & - .true., & ! first_time - gather_data_row) + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Muu, ru, zu) ! solve Muu*zu = ru for zu - ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) - zu(:,:) = 0.0d0 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - zu(ii,jj) = x_u(i,j) - enddo - enddo + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Mvv, rv, zv) ! solve Mvv*zv = rv for zv - ! 
convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - b_v(j,i) = rv(ii,jj) - enddo - enddo + ! Use a local tridiagonal solver to find an approximate solution of A*z = r - call tridiag_solver_global_2d(jlocal, ilocal, & - parallel, tasks_col, & - 'col', & ! tridiagonal solve for each column -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v, & - b_v, x_v, & - .true., & ! first_time - gather_data_col) + call tridiag_solver_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r - ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - zv(:,:) = 0.0d0 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - zv(ii,jj) = x_v(j,i) - enddo - enddo + ! Use a global tridiagonal solver to find an approximate solution of A*z = r - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! 
entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + .true., & ! first_time = T (first iteration) + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r endif ! precond @@ -3111,23 +3212,23 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute intermediate result for dot product (r,z) call t_startf("pcg_dotprod") - work2u(:,:,1) = ru(:,:) * zu(:,:) - work2v(:,:,1) = rv(:,:) * zv(:,:) + work2u(:,:,:,1) = ru(:,:,:) * zu(:,:,:) + work2v(:,:,:,1) = rv(:,:,:) * zv(:,:,:) call t_stopf("pcg_dotprod") !---- Compute the conjugate direction vector d !---- Since z is correct in halo, so is d - du(:,:) = zu(:,:) - dv(:,:) = zv(:,:) + du(:,:,:) = zu(:,:,:) + dv(:,:,:) = zv(:,:,:) !---- Compute q = A*d - !---- q is correct for locally owned nodes, provided d extends one layer into the halo + !---- q is correct for locally owned nodes call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -3137,14 +3238,14 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute intermediate result for dot product (d,q) = (d,Ad) call t_startf("pcg_dotprod") - work2u(:,:,2) = du(:,:) * qu(:,:) - work2v(:,:,2) = dv(:,:) * qv(:,:) + work2u(:,:,:,2) = du(:,:,:) * qu(:,:,:) + work2v(:,:,:,2) = dv(:,:,:) * qv(:,:,:) call t_stopf("pcg_dotprod") !---- Find global sums of (r,z) and (d,q) call t_startf("pcg_glbsum_iter") - gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! 
nflds = 2 + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) call t_stopf("pcg_glbsum_iter") !---- Halo update for q @@ -3168,20 +3269,23 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- These are correct in halo call t_startf("pcg_vecupdate") - xu(:,:) = xu(:,:) + alpha*du(:,:) - xv(:,:) = xv(:,:) + alpha*dv(:,:) + xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) + xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) - ru(:,:) = ru(:,:) - alpha*qu(:,:) ! q = A*d - rv(:,:) = rv(:,:) - alpha*qv(:,:) + ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) ! q = A*d + rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) call t_stopf("pcg_vecupdate") - !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest -!! write(iulog,*) ' ' -!! write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' do i = itest-3, itest+3 -!! write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo enddo ! i endif @@ -3191,190 +3295,86 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - if (verbose_pcg .and. main_task) then -! write(iulog,*) 'iter =', iter - endif - !---- Compute PC(r) = solution z of Mz = r !---- z is correct in halo call t_startf("pcg_precond_iter") - if (precond == HO_PRECOND_NONE) then ! no preconditioning - - zu(:,:) = ru(:,:) ! PC(r) = r - zv(:,:) = rv(:,:) ! PC(r) = r - - elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning - - do j = 1, ny-1 - do i = 1, nx-1 - if (Adiagu(i,j) /= 0.d0) then - zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A - else - zu(i,j) = 0.d0 - endif - if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) - else - zv(i,j) = 0.d0 - endif - enddo ! i - enddo ! j - - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! 
tridiagonal preconditioning with local solve - - ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - - !TODO - Test a local solver that can compute zu and zv in the halo - ! (to avoid the halo update below) - - !WHL - debug - if (verbose_tridiag .and. this_rank == rtest) then - i = itest - j = jtest -! write(iulog,*) 'Residual:' -! write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) -! write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) -! write(iulog,*) ' ' -! write(iulog,*) 'jtest =', jtest -! write(iulog,*) 'i, ru, rv:' -! do i = staggered_ihi, staggered_ilo, -1 -! write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) -! enddo - endif - - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) ' call tridiag_solver_local_2d' - endif - - call tridiag_solver_local_2d(nx, ny, & - parallel, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - ru, rv, & ! right hand side - zu, zv) ! solution - - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) - - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) - enddo - endif - - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + if (precond == HO_PRECOND_NONE) then ! no preconditioning - !WHL - debug - if (verbose_tridiag .and. 
this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'jtest =', jtest - write(iulog,*) 'i, ru, rv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) - enddo - endif + zu(:,:,:) = ru(:,:,:) ! PC(r) = r + zv(:,:,:) = rv(:,:,:) ! PC(r) = r - ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - b_u(i,j) = ru(ii,jj) - enddo - enddo + do j = 1, ny-1 + do i = 1, nx-1 + do k = 1, nz + if (Adiagu(k,i,j) /= 0.d0) then + zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A + else + zu(k,i,j) = 0.d0 + endif + if (Adiagv(k,i,j) /= 0.d0) then + zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + else + zv(k,i,j) = 0.d0 + endif + enddo ! k + enddo ! i + enddo ! j - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'Before global tridiag PC u solve, r, j =', rtest, jtest - write(iulog,*) ' ' - write(iulog,*) 'i, Adiag_u, Asubdiag_u, Asupdiag_u, b_u:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4e16.8)') i, Adiag_u(i,j), Asubdiag_u(i,j), Asupdiag_u(i,j), b_u(i,j) - enddo - endif + elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning - call tridiag_solver_global_2d(ilocal, jlocal, & - parallel, tasks_row, & - 'row', & ! tridiagonal solve for each row -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u, & - b_u, x_u, & - .false., & ! first_time - gather_data_row) + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Muu, ru, zu) ! solve Muu*zu = ru for zu - ! 
convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) - zu(:,:) = 0.0d0 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - zu(ii,jj) = x_u(i,j) - enddo - enddo + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Mvv, rv, zv) ! solve Mvv*zv = rv for zv - ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - b_v(j,i) = rv(ii,jj) - enddo - enddo + ! Use a local tridiagonal solver to find an approximate solution of A*z = r - call tridiag_solver_global_2d(jlocal, ilocal, & - parallel, tasks_col, & - 'col', & ! tridiagonal solve for each column -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v, & - b_v, x_v, & - .false., & ! first_time - gather_data_col) + call tridiag_solver_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r - ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - zv(:,:) = 0.0d0 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - zv(ii,jj) = x_v(j,i) - enddo - enddo + ! 
Use a global tridiagonal solver to find an approximate solution of A*z = r - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + .false., & ! first_time = F (iteration 2+) + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r endif ! precond @@ -3382,26 +3382,27 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute Az = A*z !---- This is the one matvec multiply required per iteration - !---- Az is correct for locally owned nodes and needs a halo update (below) + !---- Az is correct for local owned nodes and needs a halo update (below) call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & zu, zv, & Azu, Azv) call t_stopf("pcg_matmult_iter") + !---- Compute intermediate results for the dot products (r,z) and (Az,z) call t_startf("pcg_dotprod") - work2u(:,:,1) = ru(:,:)*zu(:,:) ! terms of dot product (r,z) - work2v(:,:,1) = rv(:,:)*zv(:,:) + work2u(:,:,:,1) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r,z) + work2v(:,:,:,1) = rv(:,:,:)*zv(:,:,:) - work2u(:,:,2) = Azu(:,:)*zu(:,:) ! terms of dot product (A*z,z) - work2v(:,:,2) = Azv(:,:)*zv(:,:) + work2u(:,:,:,2) = Azu(:,:,:)*zu(:,:,:) ! 
terms of dot product (A*z,z) + work2v(:,:,:,2) = Azv(:,:,:)*zv(:,:,:) call t_stopf("pcg_dotprod") ! Take the global sums of (r,z) and (Az,z) @@ -3409,10 +3410,11 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! this is the one MPI global reduction per iteration. call t_startf("pcg_glbsum_iter") - gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! nflds = 2 + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) call t_stopf("pcg_glbsum_iter") !---- Halo update for Az + !---- This is the one halo update required per iteration call t_startf("pcg_halo_iter") call staggered_parallel_halo(Azu, parallel) @@ -3431,7 +3433,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (alpha /= alpha) then ! alpha is NaN !! write(iulog,*) 'rho, sigma, alpha:', rho, sigma, alpha - call write_log('Chron_Gear PCG solver has failed, alpha = NaN', GM_FATAL) + call write_log('Chron-Gear PCG solver has failed, alpha = NaN', GM_FATAL) endif !---- Update d and q @@ -3439,25 +3441,38 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & call t_startf("pcg_vecupdate") - du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:) = zv(:,:) + beta*dv(:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - qu(:,:) = Azu(:,:) + beta*qu(:,:) - qv(:,:) = Azv(:,:) + beta*qv(:,:) + du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! where beta_(i+1) = -------------------- + ! 
(r_i, PC(r_i)) + qu(:,:,:) = Azu(:,:,:) + beta*qu(:,:,:) + qv(:,:,:) = Azv(:,:,:) + beta*qv(:,:,:) !---- Update solution and residual !---- These are correct in halo - xu(:,:) = xu(:,:) + alpha*du(:,:) - xv(:,:) = xv(:,:) + alpha*dv(:,:) + xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) + xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) - ru(:,:) = ru(:,:) - alpha*qu(:,:) - rv(:,:) = rv(:,:) - alpha*qv(:,:) + ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) + rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) call t_stopf("pcg_vecupdate") + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' + do i = itest-3, itest+3 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i + endif + ! Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -3468,19 +3483,15 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (verbose_pcg .and. main_task) then write(iulog,*) ' ' - write(iulog,*) ' check convergence, iter =', iter + write(iulog,*) 'Check convergence, iter =', iter endif !---- Compute z = A*x (use z as a temp vector for A*x) - !WHL - debug - don't think this is needed, but try just in case -!! call staggered_parallel_halo(xu, parallel) -!! call staggered_parallel_halo(xv, parallel) - call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -3489,35 +3500,27 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute residual r = b - A*x - !WHL - debug - don't think this is needed, but try just in case -!! 
call staggered_parallel_halo(bu, parallel) -!! call staggered_parallel_halo(bv, parallel) - !WHL - debug - don't think this is needed, but try just in case -!! call staggered_parallel_halo(zu, parallel) -!! call staggered_parallel_halo(zv, parallel) - call t_startf("pcg_vecupdate") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate") - !WHL - debug - don't think this is needed, but try just in case -!! call staggered_parallel_halo(ru, parallel) -!! call staggered_parallel_halo(rv, parallel) - !---- Compute dot product (r, r) call t_startf("pcg_dotprod") - worku(:,:) = ru(:,:)*ru(:,:) - workv(:,:) = rv(:,:)*rv(:,:) + worku(:,:,:) = ru(:,:,:)*ru(:,:,:) + workv(:,:,:) = rv(:,:,:)*rv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") - L2_resid = sqrt(rr) ! L2 norm of residual - err = L2_resid/L2_rhs ! normalized error + ! take square root + L2_resid = sqrt(rr) + + ! compute normalized error + err = L2_resid/L2_rhs if (verbose_pcg .and. 
main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err @@ -3531,13 +3534,13 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs(ru(i,j)) > ru_max) then - ru_max = ru(i,j) + if (sum(abs(ru(:,i,j))) > ru_max) then + ru_max = sum(abs(ru(:,i,j))) iu_max = i ju_max = j endif - if (abs(rv(i,j)) > rv_max) then - rv_max = rv(i,j) + if (sum(abs(rv(:,i,j))) > rv_max) then + rv_max = sum(abs(rv(:,i,j))) iv_max = i jv_max = j endif @@ -3585,12 +3588,10 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) - if (allocated(b_u)) deallocate(b_u, b_v) - if (allocated(x_u)) deallocate(x_u, x_v) if (allocated(gather_data_row)) deallocate(gather_data_row) if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_chrongear_2d + end subroutine pcg_solver_chrongear_3d !**************************************************************************** From e60ae9bcc687fee89596f8990a4fa0f6baae7051 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Fri, 23 Jan 2026 15:20:41 -0700 Subject: [PATCH 16/21] Reproducible sums for norm of residual vector This commit implements reproducible sums for computations of the L2 norm (and related quantities) of the residual vector A*x - b in the velocity solver. The summed L2 norm is compared to a target value to check whether the solver has converged. Reproducible sums are needed only if the L2 norm is very close to the target, such that the norm exceeds the target for (say) 128 processors and falls below the target for 256 processors, due to roundoff differences. This is unlikely but not impossible, so it's better to be on the safe side. 
--- libglissade/glissade_velo_higher.F90 | 192 ++++++++++++++------------- 1 file changed, 97 insertions(+), 95 deletions(-) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index d68d3253..4bcf8755 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -9463,6 +9463,10 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & ! Compute the residual vector Ax - b and its L2 norm. ! This subroutine assumes that the matrix is stored in structured (x/y/z) format. + !---------------------------------------------------------------- + ! Input/output variables + !---------------------------------------------------------------- + integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions (for scalars) nz ! number of vertical levels where velocity is computed @@ -9502,8 +9506,13 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & real(dp), intent(out), optional :: & L2_norm_relative ! L2 norm of residual vector relative to rhs, |Ax - b| / |b| + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- + real(dp), dimension(nz,nx-1,ny-1) :: & - resid_sq ! resid_u^2 + resid_v^2 + worku, workv, & ! work arrays for global sums + resid_sq ! resid_u^2 + resid_v^2 real(dp) :: my_max_resid, global_max_resid @@ -9525,79 +9534,61 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & resid_u(:,:,:) = 0.d0 resid_v(:,:,:) = 0.d0 - !TODO - Replace the following by a call to matvec_multiply_structured_3d ! Loop over locally owned vertices - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - - if (active_vertex(i,j)) then - - do k = 1, nz - - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 - - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. 
j+jA <= ny-1) ) then - - m = indxA_3d(iA,jA,kA) - - resid_u(k,i,j) = resid_u(k,i,j) & - + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & - + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - - resid_v(k,i,j) = resid_v(k,i,j) & - + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & - + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - - endif ! in bounds - - enddo ! kA - enddo ! iA - enddo ! jA + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + do kA = -1,1 + do jA = -1,1 + do iA = -1,1 + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then - enddo ! k + m = indxA_3d(iA,jA,kA) - endif ! active_vertex + resid_u(k,i,j) = resid_u(k,i,j) & + + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & + + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - enddo ! i + resid_v(k,i,j) = resid_v(k,i,j) & + + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & + + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) + endif ! in bounds + enddo ! kA + enddo ! iA + enddo ! jA + enddo ! k + endif ! active_vertex + enddo ! i enddo ! j ! Subtract b to get A*x - b - ! Sum up squared L2 norm as we go - L2_norm = 0.d0 - resid_sq(:,:,:) = 0.0d0 + worku(:,:,:) = 0.0d0 + workv(:,:,:) = 0.0d0 ! Loop over locally owned vertices - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - do k = 1, nz - resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j) - resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j) - resid_sq(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) + resid_v(k,i,j)*resid_v(k,i,j) - L2_norm = L2_norm + resid_sq(k,i,j) - enddo ! k - endif ! active vertex - enddo ! i + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j) + resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j) + worku(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) + workv(k,i,j) = resid_v(k,i,j)*resid_v(k,i,j) + enddo ! k + endif ! active vertex + enddo ! i enddo ! j ! 
Take global sum, then take square root - L2_norm = parallel_reduce_sum(L2_norm) + L2_norm = parallel_global_sum_stagger(worku, parallel, workv) L2_norm = sqrt(L2_norm) -!! sum_resid_u_sq = parallel_global_sum(bu*bu, parallel, active_vertex) -!! sum_resid_v_sq = parallel_global_sum(bv*bv, parallel, active_vertex) -!! L2_norm = parallel_global_sum(resid_sq, parallel) -!! L2_norm = sqrt(L2_norm) - if (verbose_residual) then if (this_rank==rtest) then @@ -9633,26 +9624,28 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & if (present(L2_norm_relative)) then ! compute L2_norm relative to rhs - L2_norm_rhs = 0.d0 + worku(:,:,:) = 0.0d0 + workv(:,:,:) = 0.0d0 do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - do k = 1, nz - L2_norm_rhs = L2_norm_rhs + bu(k,i,j)*bu(k,i,j) + bv(k,i,j)*bv(k,i,j) - enddo ! k - endif ! active vertex - enddo ! i + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + worku(k,i,j) = bu(k,i,j)*bu(k,i,j) + workv(k,i,j) = bv(k,i,j)*bv(k,i,j) + enddo ! k + endif ! active vertex + enddo ! i enddo ! j ! Take global sum, then take square root - L2_norm_rhs = parallel_reduce_sum(L2_norm_rhs) + L2_norm_rhs = parallel_global_sum_stagger(worku, parallel, workv) L2_norm_rhs = sqrt(L2_norm_rhs) - if (L2_norm_rhs > 0.d0) then + if (L2_norm_rhs > 0.0d0) then L2_norm_relative = L2_norm / L2_norm_rhs else - L2_norm_relative = 0.d0 + L2_norm_relative = 0.0d0 endif endif @@ -9675,6 +9668,10 @@ subroutine compute_residual_vector_2d(nx, ny, & ! Compute the residual vector Ax - b and its L2 norm. ! This subroutine assumes that the matrix is stored in structured (x/y/z) format. + !---------------------------------------------------------------- + ! Input/output arguments + !---------------------------------------------------------------- + integer, intent(in) :: & nx, ny ! 
horizontal grid dimensions (for scalars) @@ -9713,8 +9710,13 @@ subroutine compute_residual_vector_2d(nx, ny, & real(dp), intent(out), optional :: & L2_norm_relative ! L2 norm of residual vector relative to rhs, |Ax - b| / |b| + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- + real(dp), dimension(nx-1,ny-1) :: & - resid_sq ! resid_u^2 + resid_v^2 + worku, workv, & ! work arrays for global sums + resid_sq ! resid_u^2 + resid_v^2 real(dp) :: my_max_resid, global_max_resid @@ -9760,27 +9762,23 @@ subroutine compute_residual_vector_2d(nx, ny, & enddo ! jA ! Subtract b to get A*x - b - ! Sum up squared L2 norm as we go - - L2_norm = 0.d0 - resid_sq(:,:) = 0.0d0 + worku(:,:) = 0.0d0 + workv(:,:) = 0.0d0 ! Loop over locally owned vertices - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - resid_u(i,j) = resid_u(i,j) - bu(i,j) - resid_v(i,j) = resid_v(i,j) - bv(i,j) - resid_sq(i,j) = resid_u(i,j)*resid_u(i,j) + resid_v(i,j)*resid_v(i,j) - L2_norm = L2_norm + resid_sq(i,j) - endif ! active vertex - enddo ! i - enddo ! j + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + resid_u(i,j) = resid_u(i,j) - bu(i,j) + resid_v(i,j) = resid_v(i,j) - bv(i,j) + worku(i,j) = resid_u(i,j)*resid_u(i,j) + workv(i,j) = resid_v(i,j)*resid_v(i,j) + endif ! active vertex + enddo ! i + enddo ! j ! Take global sum, then take square root - - L2_norm = parallel_reduce_sum(L2_norm) + L2_norm = parallel_global_sum_stagger(worku, parallel, workv) L2_norm = sqrt(L2_norm) if (verbose_residual) then @@ -9795,6 +9793,7 @@ subroutine compute_residual_vector_2d(nx, ny, & ! Compute max value of (squared) residual on this task. ! If this task owns the vertex with the global max residual, then print a diagnostic message. 
+ resid_sq(:,:) = worku(:,:) + workv(:,:) my_max_resid = maxval(resid_sq) global_max_resid = parallel_reduce_max(my_max_resid) @@ -9815,24 +9814,27 @@ subroutine compute_residual_vector_2d(nx, ny, & if (present(L2_norm_relative)) then ! compute L2_norm relative to rhs - L2_norm_rhs = 0.d0 + worku(:,:) = 0.0d0 + workv(:,:) = 0.0d0 + ! Loop over locally owned vertices do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - L2_norm_rhs = L2_norm_rhs + bu(i,j)*bu(i,j) + bv(i,j)*bv(i,j) - endif ! active vertex - enddo ! i - enddo ! j + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + worku(i,j) = bu(i,j)*bu(i,j) + workv(i,j) = bv(i,j)*bv(i,j) + endif ! active vertex + enddo ! i + enddo ! j ! Take global sum, then take square root - L2_norm_rhs = parallel_reduce_sum(L2_norm_rhs) + L2_norm_rhs = parallel_global_sum_stagger(worku, parallel, workv) L2_norm_rhs = sqrt(L2_norm_rhs) - if (L2_norm_rhs > 0.d0) then + if (L2_norm_rhs > 0.0d0) then L2_norm_relative = L2_norm / L2_norm_rhs else - L2_norm_relative = 0.d0 + L2_norm_relative = 0.0d0 endif endif From 4a89b90215a6a4b17a9a8acd712cd4436a60ffe1 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Fri, 23 Jan 2026 15:59:22 -0700 Subject: [PATCH 17/21] Reordered some subroutines in glissade_velo_higher This commit simply moves subroutines linked to the 2d velocity solver above subroutines linked to the 3d velocity solver, as done earlier for the PCG subroutine. Historically, Blatter-Pattyn was coded before SSA/L1L2/DIVA, which is why the 3d subroutines came first. 
--- libglissade/glissade_velo_higher.F90 | 3385 +++++++++++++------------- 1 file changed, 1651 insertions(+), 1734 deletions(-) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 4bcf8755..66074862 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -4898,26 +4898,38 @@ end subroutine lateral_shelf_bc !**************************************************************************** - subroutine assemble_stiffness_matrix_3d(nx, ny, & - nz, sigma, & + subroutine assemble_stiffness_matrix_2d(nx, ny, & + nz, & + sigma, stagsigma, & nhalo, & itest, jtest, rtest, & active_cell, & xVertex, yVertex, & - uvel, vvel, & + uvel_2d, vvel_2d, & stagusrf, stagthck, & - flwafact, whichapprox, & + flwa, flwafact, & + whichapprox, & + diva_slope_factor_x, diva_slope_factor_y, & whichefvs, efvs, & efvs_constant, effstrain_min, & Auu, Auv, & - Avu, Avv) + Avu, Avv, & + dusrf_dx, dusrf_dy, & + thck, & + btractx, btracty, & + omega_k, omega, & + efvs_qp_3d) !---------------------------------------------------------------- ! Assemble the stiffness matrix A in the linear system Ax = b. ! This subroutine is called for each nonlinear iteration if ! we are iterating on the effective viscosity. + ! The matrix A can be based on the shallow-shelf approximation or + ! the depth-integrated L1L2 approximation (Schoof and Hindmarsh, 2010). !---------------------------------------------------------------- + use glissade_grid_operators, only: glissade_vertical_average + !---------------------------------------------------------------- ! Input-output arguments !---------------------------------------------------------------- @@ -4925,37 +4937,45 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions nz, & ! number of vertical levels at which velocity is computed - ! Note: the number of elements per column is nz-1 + ! (used for flwafact) nhalo ! 
number of halo layers real(dp), dimension(nz), intent(in) :: & - sigma ! sigma vertical coordinate + sigma ! sigma vertical coordinate + + real(dp), dimension(nz-1), intent(in) :: & + stagsigma ! staggered sigma vertical coordinate integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point logical, dimension(nx,ny), intent(in) :: & - active_cell ! true if cell contains ice and borders a locally owned vertex + active_cell ! true if cell contains ice and borders a locally owned vertex real(dp), dimension(nx-1,ny-1), intent(in) :: & - xVertex, yVertex ! x and y coordinates of vertices + xVertex, yVertex ! x and y coordinates of vertices - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & - uvel, vvel ! velocity components (m/yr) + real(dp), dimension(nx-1,ny-1), intent(in) :: & + uvel_2d, vvel_2d ! 2D velocity components (m/yr) real(dp), dimension(nx-1,ny-1), intent(in) :: & stagusrf, & ! upper surface elevation on staggered grid (m) stagthck ! ice thickness on staggered grid (m) + !TODO - Pass in flwa only, and compute flwafact here? real(dp), dimension(nz-1,nx,ny), intent(in) :: & - flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), + flwa, &! temperature-based flow factor A, Pa^{-n} yr^{-1} + flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), Pa yr^(1/n) ! used to compute the effective viscosity - ! units: Pa yr^(1/n) integer, intent(in) :: & - whichapprox, & ! option for Stokes approximation (BP, SSA, SIA) + whichapprox, & ! option for Stokes approximation (BP, L1L2, SSA, SIA) whichefvs ! option for effective viscosity calculation + real(dp), dimension(nx,ny), intent(in) :: & + diva_slope_factor_x, & ! correction factor for DIVA in x direction, based on theta_slope_x + diva_slope_factor_y ! correction factor for DIVA in y direction, based on theta_slope_y + real(dp), dimension(nz-1,nx,ny), intent(out) :: & efvs ! 
effective viscosity (Pa yr) @@ -4963,36 +4983,61 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & efvs_constant, & ! constant value of effective viscosity (Pa yr) effstrain_min ! minimum value of effective strain rate (yr^-1) for computing viscosity - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(out) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(out) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts Avu, Avv + ! The following optional arguments are used for the L1L2 approximation only + + real(dp), dimension(nx-1,ny-1), intent(in), optional :: & + dusrf_dx, & ! upper surface elevation gradient on staggered grid (m/m) + dusrf_dy ! needed for L1L2 assembly only + + ! The following optional arguments are used for DIVA only + + real(dp), dimension(nx,ny), intent(in), optional :: & + thck ! ice thickness (m) + + real(dp), dimension(nx-1,ny-1), intent(in), optional :: & + btractx, btracty ! components of basal traction (Pa) + + real(dp), dimension(nz,nx,ny), intent(out), optional :: & + omega_k ! single integral, defined by Goldberg (2011) eq. 32 + + real(dp), dimension(nx,ny), intent(out), optional :: & + omega ! double integral, defined by Goldberg (2011) eq. 35 + ! Note: omega here = Goldberg's omega/H + + real(dp), dimension(nz-1,nQuadPoints_2d,nx,ny), intent(inout), optional :: & + efvs_qp_3d ! effective viscosity (Pa yr) + !--------------------------------------------------------- ! Local variables !--------------------------------------------------------- - real(dp), dimension(nQuadPoints_3d) :: & + real(dp), dimension(nQuadPoints_2d) :: & detJ ! determinant of J - real(dp), dimension(nNodesPerElement_3d) :: & - dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis function, evaluated at quad pt + real(dp), dimension(nNodesPerElement_2d) :: & + dphi_dx_2d, dphi_dy_2d, dphi_dz_2d ! derivatives of basis function, evaluated at quad pts + ! 
set dphi_dz = 0 for 2D problem !---------------------------------------------------------------- - ! Note: Kuu, Kuv, Kvu, and Kvv are 8x8 components of the stiffness matrix - ! for the local element. (The combined stiffness matrix is 16x16.) + ! Note: Kuu, Kuv, Kvu, and Kvv are 4x4 components of the stiffness matrix + ! for the local element. (The combined stiffness matrix is 8x8.) ! - ! Once these matrices are formed, their coefficients are summed into the assembled - ! matrices Auu, Auv, Avu, Avv. The A matrices each have as many rows as there are - ! active nodes, but only 27 columns, corresponding to the 27 vertices that belong to - ! the 8 elements sharing a given node. + ! Once these matrices are formed, their coefficients are summed into the global + ! matrices Auu_2d, Auv_2d, Avu_2d, Avv_2d. The global matrices each have as + ! many rows as there are active vertices, but only 9 columns, corresponding to + ! the 9 vertices of the 4 elements sharing a given node. ! ! The native structured PCG solver works with the dense A matrices in the form ! computed here. For the SLAP solver, the terms of the A matrices are put - ! in a sparse matrix during preprocessing. For the Trilinos solver, the terms - ! of the A matrices are passed to Trilinos one row at a time. + ! in a sparse matrix format during preprocessing. For the Trilinos solver, + ! the terms of the A matrices are passed to Trilinos one row at a time. !---------------------------------------------------------------- - real(dp), dimension(nNodesPerElement_3d, nNodesPerElement_3d) :: & ! + real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) :: & ! Kuu, & ! element stiffness matrix, divided into 4 parts as shown below Kuv, & ! Kvu, & ! @@ -5003,25 +5048,36 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & ! ! Kvu may not be needed if matrix is symmetric, but is included for now - real(dp), dimension(nNodesPerElement_3d) :: & - x, y, z, & ! Cartesian coordinates of nodes - u, v, & ! 
u and v at nodes - s ! upper surface elevation at nodes + real(dp), dimension(nNodesPerElement_2d) :: & + x, y, & ! Cartesian coordinates of vertices + u, v, & ! depth-integrated mean velocity at vertices (m/yr) + h, & ! thickness at vertices (m) + s, & ! upper surface elevation at vertices (m) + bx, by, & ! basal traction at vertices (Pa) (DIVA only) + dsdx, dsdy ! upper surface elevation gradient at vertices (m/m) (L1L2 only) - real(dp), dimension(nQuadPoints_3d) :: & - efvs_qp ! effective viscosity at a quad pt + real(dp), dimension(nQuadPoints_2d) :: & + efvs_qp_vertavg ! vertically averaged effective viscosity at a quad pt + + real(dp) :: & + h_qp ! thickness at a quad pt + + real(dp), dimension(nz-1,nQuadPoints_2d) :: & + efvs_qp ! effective viscosity at each layer in a cell column + ! corresponding to a quad pt logical, parameter :: & check_symmetry_element = .true. ! if true, then check symmetry of element matrix - !Note: Can speed up assembly a bit by setting to false for production + + real(dp), dimension(nx,ny) :: & + flwafact_2d ! vertically averaged flow factor integer :: i, j, k, n, p - integer :: iNode, jNode, kNode + integer :: iVertex, jVertex if (verbose_matrix .and. main_task) then write(iulog,*) ' ' - write(iulog,*) 'In assemble_stiffness_matrix_3d' - write(iulog,*) 'itest, jtest, ktest, rtest =', itest, jtest, ktest, rtest + write(iulog,*) 'In assemble_stiffness_matrix_2d' endif ! Initialize effective viscosity @@ -5029,10 +5085,17 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & ! Initialize global stiffness matrix - Auu(:,:,:,:) = 0.d0 - Auv(:,:,:,:) = 0.d0 - Avu(:,:,:,:) = 0.d0 - Avv(:,:,:,:) = 0.d0 + Auu(:,:,:) = 0.d0 + Auv(:,:,:) = 0.d0 + Avu(:,:,:) = 0.d0 + Avv(:,:,:) = 0.d0 + + ! Compute vertical average of flow factor (SSA only) + if (whichapprox == HO_APPROX_SSA) then + call glissade_vertical_average(nx, ny, & + nz, sigma, & + flwafact, flwafact_2d) + endif ! Sum over elements in active cells ! 
Loop over all cells that border locally owned vertices. @@ -5042,112 +5105,200 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & if (active_cell(i,j)) then - do k = 1, nz-1 ! loop over elements in this column - ! assume k increases from upper surface to bed - - ! Initialize element stiffness matrix - Kuu(:,:) = 0.d0 - Kuv(:,:) = 0.d0 - Kvu(:,:) = 0.d0 - Kvv(:,:) = 0.d0 + ! Initialize element stiffness matrix + Kuu(:,:) = 0.d0 + Kuv(:,:) = 0.d0 + Kvu(:,:) = 0.d0 + Kvv(:,:) = 0.d0 - ! compute spatial coordinates, velocity, and upper surface elevation for each node - - do n = 1, nNodesPerElement_3d + ! Compute spatial coordinates, velocity, thickness and surface elevation for each vertex + ! Also compute surface elevation gradient (for L1L2) and basal traction (for DIVA) + do n = 1, nNodesPerElement_2d - ! Determine (k,i,j) for this node - ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). - ! Indices for other nodes are computed relative to this node. - iNode = i + ishift(7,n) - jNode = j + jshift(7,n) - kNode = k + kshift(7,n) + ! Determine (i,j) for this vertex + ! The reason for the '3' is that node 3, in the NE corner of the grid cell, has index (i,j). + ! Indices for other nodes are computed relative to this vertex. + iVertex = i + ishift(3,n) + jVertex = j + jshift(3,n) - x(n) = xVertex(iNode,jNode) - y(n) = yVertex(iNode,jNode) - z(n) = stagusrf(iNode,jNode) - sigma(kNode)*stagthck(iNode,jNode) - u(n) = uvel(kNode,iNode,jNode) - v(n) = vvel(kNode,iNode,jNode) - s(n) = stagusrf(iNode,jNode) + x(n) = xVertex(iVertex,jVertex) + y(n) = yVertex(iVertex,jVertex) + u(n) = uvel_2d(iVertex,jVertex) + v(n) = vvel_2d(iVertex,jVertex) + s(n) = stagusrf(iVertex,jVertex) + h(n) = stagthck(iVertex,jVertex) + if (present(dusrf_dx) .and. present(dusrf_dy)) then ! L1L2 + dsdx(n) = dusrf_dx(iVertex,jVertex) + dsdy(n) = dusrf_dy(iVertex,jVertex) + endif + if (present(btractx) .and. present(btracty)) then ! 
DIVA + bx(n) = btractx(iVertex,jVertex) + by(n) = btracty(iVertex,jVertex) + endif - if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then - write(iulog,*) ' ' - write(iulog,*) 'i, j, k, n, x, y, z:', i, j, k, n, x(n), y(n), z(n) - write(iulog,*) 's, u, v:', s(n), u(n), v(n) - endif + if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'i, j, n, x, y:', i, j, n, x(n), y(n) + write(iulog,*) 's, h, u, v:', s(n), h(n), u(n), v(n) + if (present(btractx) .and. present(btracty)) write(iulog,*) 'bx, by:', bx(n), by(n) + endif - enddo ! nodes per element + enddo ! vertices per element - ! Loop over quadrature points for this element + ! Loop over quadrature points for this element - do p = 1, nQuadPoints_3d - - ! Evaluate the derivatives of the element basis functions at this quadrature point. - !WHL - Pass in i, j, k, and p to the following subroutines for debugging. + do p = 1, nQuadPoints_2d - call get_basis_function_derivatives_3d(x(:), y(:), z(:), & - dphi_dxr_3d(:,p), dphi_dyr_3d(:,p), dphi_dzr_3d(:,p), & - dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & - detJ(p), & - itest, jtest, rtest, & - i, j, k, p) + ! Evaluate the derivatives of the element basis functions at this quadrature point. -! call t_startf('glissade_effective_viscosity') - call compute_effective_viscosity(whichefvs, whichapprox, & - efvs_constant, effstrain_min, & - nNodesPerElement_3d, & - dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & - u(:), v(:), & - flwafact(k,i,j), efvs_qp(p), & - itest, jtest, rtest, & - i, j, k, p ) -! call t_stopf('glissade_effective_viscosity') + call get_basis_function_derivatives_2d(x(:), y(:), & + dphi_dxr_2d(:,p), dphi_dyr_2d(:,p), & + dphi_dx_2d(:), dphi_dy_2d(:), & + detJ(p), & + itest, jtest, rtest, & + i, j, p) - if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. 
p==ptest) then - write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(p) - endif + dphi_dz_2d(:) = 0.d0 - ! Increment the element stiffness matrix with the contribution from each quadrature point. + if (whichapprox == HO_APPROX_L1L2) then -! call t_startf('glissade_compute_element_matrix') - call compute_element_matrix(whichapprox, nNodesPerElement_3d, & - wqp_3d(p), detJ(p), efvs_qp(p), & - dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & - Kuu(:,:), Kuv(:,:), & - Kvu(:,:), Kvv(:,:), & - itest, jtest, rtest, & - i, j, k, p ) -! call t_stopf('glissade_compute_element_matrix') + ! Compute effective viscosity for each layer at this quadrature point + !TODO - sigma -> stagsigma for L1L2 viscosity? + call compute_effective_viscosity_L1L2(whichefvs, & + efvs_constant, effstrain_min, & + nz, sigma, & + nNodesPerElement_2d, phi_2d(:,p), & + dphi_dx_2d(:), dphi_dy_2d(:), & + u(:), v(:), & + h(:), & + dsdx(:), dsdy(:), & + flwa(:,i,j), flwafact(:,i,j), & + efvs_qp(:,p), & + itest, jtest, rtest, & + i, j, p) - enddo ! nQuadPoints_3d + ! Compute vertical average of effective viscosity + efvs_qp_vertavg(p) = 0.d0 + do k = 1, nz-1 + efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p) * (sigma(k+1) - sigma(k)) + enddo - ! Compute average of effective viscosity over quad pts - efvs(k,i,j) = 0.d0 + elseif (whichapprox == HO_APPROX_DIVA) then - do p = 1, nQuadPoints_3d - efvs(k,i,j) = efvs(k,i,j) + efvs_qp(p) + ! Copy efvs_qp from global array to local column array + efvs_qp(:,:) = efvs_qp_3d(:,:,i,j) + + ! Compute effective viscosity for each layer at this quadrature point + ! 
Note: efvs_qp_3d is intent(inout); old value is used to compute new value + call compute_effective_viscosity_diva(whichefvs, & + efvs_constant, effstrain_min, & + nz, stagsigma, & + nNodesPerElement_2d, phi_2d(:,p), & + dphi_dx_2d(:), dphi_dy_2d(:), & + u(:), v(:), & + bx(:), by(:), & + diva_slope_factor_x(i,j), & + diva_slope_factor_y(i,j), & + h(:), & + flwa(:,i,j), flwafact(:,i,j), & + efvs_qp(:,p), & + itest, jtest, rtest, & + i, j, p) + + if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then + write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(:,p) + endif + + !WHL - Copy local efvs_qp to the global array + efvs_qp_3d(:,:,i,j) = efvs_qp(:,:) + + ! Compute vertical average of effective viscosity + efvs_qp_vertavg(p) = 0.d0 + do k = 1, nz-1 + efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p)*(sigma(k+1) - sigma(k)) + enddo + + else ! SSA + + ! Compute vertically averaged effective viscosity at this quadrature point + !TODO - Why do we pass in dphi_dz_2d here and not elsewhere? + call compute_effective_viscosity(whichefvs, whichapprox, & + efvs_constant, effstrain_min, & + nNodesPerElement_2d, & + dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & + u(:), v(:), & + flwafact_2d(i,j), efvs_qp_vertavg(p), & + itest, jtest, rtest, & + i, j, 1, p) + + ! Copy vertically averaged value to all levels + efvs_qp(:,p) = efvs_qp_vertavg(p) + + endif ! whichapprox + + ! Compute ice thickness at this quadrature point + + h_qp = 0.d0 + do n = 1, nNodesPerElement_2d + h_qp = h_qp + phi_2d(n,p) * h(n) enddo - efvs(k,i,j) = efvs(k,i,j) / nQuadPoints_3d - - if (check_symmetry_element) then - call check_symmetry_element_matrix(nNodesPerElement_3d, & - Kuu, Kuv, Kvu, Kvv) - endif - ! Sum terms of element matrix K into dense assembled matrix A + ! Increment the element stiffness matrix with the contribution from each quadrature point. + ! Note: The effective viscosity is multiplied by thickness since the equation to be solved + ! 
is vertically integrated. - call element_to_global_matrix_3d(nx, ny, nz, & - i, j, k, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & - Avu, Avv) + call compute_element_matrix(whichapprox, nNodesPerElement_2d, & + wqp_2d(p), detJ(p), & + h_qp*efvs_qp_vertavg(p), & + dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & + Kuu(:,:), Kuv(:,:), & + Kvu(:,:), Kvv(:,:), & + itest, jtest, rtest, & + i, j, 1, p ) - enddo ! nz (loop over elements in this column) + enddo ! nQuadPoints_2d + + if (whichapprox == HO_APPROX_DIVA) then + + ! Compute vertical integrals needed for the 2D solve and 3D velocity reconstruction + call compute_integrals_diva(nz, sigma, & + itest, jtest, rtest, & + thck(i,j), efvs_qp(:,:), & + omega_k(:,i,j), omega(i,j), & + i, j) + + endif + + ! Compute average of effective viscosity over quad points. + ! For L1L2 and DIVA there is a different efvs in each layer. + ! For SSA, simply write the vertical average value to each layer. + + efvs(:,i,j) = 0.d0 + do p = 1, nQuadPoints_2d + do k = 1, nz-1 + efvs(k,i,j) = efvs(k,i,j) + efvs_qp(k,p) + enddo + enddo + efvs(:,i,j) = efvs(:,i,j) / nQuadPoints_2d + + if (check_symmetry_element) then + call check_symmetry_element_matrix(nNodesPerElement_2d, & + Kuu, Kuv, Kvu, Kvv) + endif + + ! Sum the terms of element matrix K into the dense assembled matrix A + + call element_to_global_matrix_2d(nx, ny, & + i, j, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & + Avu, Avv) if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest) then write(iulog,*) ' ' - write(iulog,*) 'Assembled 3D matrix, i, j =', i, j + write(iulog,*) 'Assembled 2D matrix, i, j =', i, j write(iulog,*) 'k, flwafact, efvs:' do k = 1, nz-1 write(iulog,*) k, flwafact(k,i,j), efvs(k,i,j) @@ -5159,42 +5310,30 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & enddo ! i enddo ! 
j - end subroutine assemble_stiffness_matrix_3d + end subroutine assemble_stiffness_matrix_2d !**************************************************************************** - subroutine assemble_stiffness_matrix_2d(nx, ny, & - nz, & - sigma, stagsigma, & + subroutine assemble_stiffness_matrix_3d(nx, ny, & + nz, sigma, & nhalo, & itest, jtest, rtest, & active_cell, & xVertex, yVertex, & - uvel_2d, vvel_2d, & + uvel, vvel, & stagusrf, stagthck, & - flwa, flwafact, & - whichapprox, & - diva_slope_factor_x, diva_slope_factor_y, & + flwafact, whichapprox, & whichefvs, efvs, & efvs_constant, effstrain_min, & Auu, Auv, & - Avu, Avv, & - dusrf_dx, dusrf_dy, & - thck, & - btractx, btracty, & - omega_k, omega, & - efvs_qp_3d) - + Avu, Avv) + !---------------------------------------------------------------- ! Assemble the stiffness matrix A in the linear system Ax = b. ! This subroutine is called for each nonlinear iteration if ! we are iterating on the effective viscosity. - ! The matrix A can be based on the shallow-shelf approximation or - ! the depth-integrated L1L2 approximation (Schoof and Hindmarsh, 2010). !---------------------------------------------------------------- - use glissade_grid_operators, only: glissade_vertical_average - !---------------------------------------------------------------- ! Input-output arguments !---------------------------------------------------------------- @@ -5202,45 +5341,37 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions nz, & ! number of vertical levels at which velocity is computed - ! (used for flwafact) + ! Note: the number of elements per column is nz-1 nhalo ! number of halo layers real(dp), dimension(nz), intent(in) :: & - sigma ! sigma vertical coordinate - - real(dp), dimension(nz-1), intent(in) :: & - stagsigma ! staggered sigma vertical coordinate + sigma ! sigma vertical coordinate integer, intent(in) :: & itest, jtest, rtest ! 
coordinates of diagnostic point logical, dimension(nx,ny), intent(in) :: & - active_cell ! true if cell contains ice and borders a locally owned vertex + active_cell ! true if cell contains ice and borders a locally owned vertex real(dp), dimension(nx-1,ny-1), intent(in) :: & - xVertex, yVertex ! x and y coordinates of vertices + xVertex, yVertex ! x and y coordinates of vertices - real(dp), dimension(nx-1,ny-1), intent(in) :: & - uvel_2d, vvel_2d ! 2D velocity components (m/yr) + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + uvel, vvel ! velocity components (m/yr) real(dp), dimension(nx-1,ny-1), intent(in) :: & stagusrf, & ! upper surface elevation on staggered grid (m) stagthck ! ice thickness on staggered grid (m) - !TODO - Pass in flwa only, and compute flwafact here? real(dp), dimension(nz-1,nx,ny), intent(in) :: & - flwa, &! temperature-based flow factor A, Pa^{-n} yr^{-1} - flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), Pa yr^(1/n) + flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), ! used to compute the effective viscosity + ! units: Pa yr^(1/n) integer, intent(in) :: & - whichapprox, & ! option for Stokes approximation (BP, L1L2, SSA, SIA) + whichapprox, & ! option for Stokes approximation (BP, SSA, SIA) whichefvs ! option for effective viscosity calculation - real(dp), dimension(nx,ny), intent(in) :: & - diva_slope_factor_x, & ! correction factor for DIVA in x direction, based on theta_slope_x - diva_slope_factor_y ! correction factor for DIVA in y direction, based on theta_slope_y - real(dp), dimension(nz-1,nx,ny), intent(out) :: & efvs ! effective viscosity (Pa yr) @@ -5248,61 +5379,36 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & efvs_constant, & ! constant value of effective viscosity (Pa yr) effstrain_min ! 
minimum value of effective strain rate (yr^-1) for computing viscosity - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(out) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(out) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv - - ! The following optional arguments are used for the L1L2 approximation only - - real(dp), dimension(nx-1,ny-1), intent(in), optional :: & - dusrf_dx, & ! upper surface elevation gradient on staggered grid (m/m) - dusrf_dy ! needed for L1L2 assembly only - - ! The following optional arguments are used for DIVA only - - real(dp), dimension(nx,ny), intent(in), optional :: & - thck ! ice thickness (m) - - real(dp), dimension(nx-1,ny-1), intent(in), optional :: & - btractx, btracty ! components of basal traction (Pa) - - real(dp), dimension(nz,nx,ny), intent(out), optional :: & - omega_k ! single integral, defined by Goldberg (2011) eq. 32 - - real(dp), dimension(nx,ny), intent(out), optional :: & - omega ! double integral, defined by Goldberg (2011) eq. 35 - ! Note: omega here = Goldberg's omega/H - - real(dp), dimension(nz-1,nQuadPoints_2d,nx,ny), intent(inout), optional :: & - efvs_qp_3d ! effective viscosity (Pa yr) + Avu, Avv !--------------------------------------------------------- ! Local variables !--------------------------------------------------------- - real(dp), dimension(nQuadPoints_2d) :: & + real(dp), dimension(nQuadPoints_3d) :: & detJ ! determinant of J - real(dp), dimension(nNodesPerElement_2d) :: & - dphi_dx_2d, dphi_dy_2d, dphi_dz_2d ! derivatives of basis function, evaluated at quad pts - ! set dphi_dz = 0 for 2D problem + real(dp), dimension(nNodesPerElement_3d) :: & + dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis function, evaluated at quad pt !---------------------------------------------------------------- - ! Note: Kuu, Kuv, Kvu, and Kvv are 4x4 components of the stiffness matrix - ! for the local element. (The combined stiffness matrix is 8x8.) 
+ ! Note: Kuu, Kuv, Kvu, and Kvv are 8x8 components of the stiffness matrix + ! for the local element. (The combined stiffness matrix is 16x16.) ! - ! Once these matrices are formed, their coefficients are summed into the global - ! matrices Auu_2d, Auv_2d, Avu_2d, Avv_2d. The global matrices each have as - ! many rows as there are active vertices, but only 9 columns, corresponding to - ! the 9 vertices of the 4 elements sharing a given node. + ! Once these matrices are formed, their coefficients are summed into the assembled + ! matrices Auu, Auv, Avu, Avv. The A matrices each have as many rows as there are + ! active nodes, but only 27 columns, corresponding to the 27 vertices that belong to + ! the 8 elements sharing a given node. ! ! The native structured PCG solver works with the dense A matrices in the form ! computed here. For the SLAP solver, the terms of the A matrices are put - ! in a sparse matrix format during preprocessing. For the Trilinos solver, - ! the terms of the A matrices are passed to Trilinos one row at a time. + ! in a sparse matrix during preprocessing. For the Trilinos solver, the terms + ! of the A matrices are passed to Trilinos one row at a time. !---------------------------------------------------------------- - real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) :: & ! + real(dp), dimension(nNodesPerElement_3d, nNodesPerElement_3d) :: & ! Kuu, & ! element stiffness matrix, divided into 4 parts as shown below Kuv, & ! Kvu, & ! @@ -5313,36 +5419,25 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & ! ! Kvu may not be needed if matrix is symmetric, but is included for now - real(dp), dimension(nNodesPerElement_2d) :: & - x, y, & ! Cartesian coordinates of vertices - u, v, & ! depth-integrated mean velocity at vertices (m/yr) - h, & ! thickness at vertices (m) - s, & ! upper surface elevation at vertices (m) - bx, by, & ! basal traction at vertices (Pa) (DIVA only) - dsdx, dsdy ! 
upper surface elevation gradient at vertices (m/m) (L1L2 only) - - real(dp), dimension(nQuadPoints_2d) :: & - efvs_qp_vertavg ! vertically averaged effective viscosity at a quad pt - - real(dp) :: & - h_qp ! thickness at a quad pt + real(dp), dimension(nNodesPerElement_3d) :: & + x, y, z, & ! Cartesian coordinates of nodes + u, v, & ! u and v at nodes + s ! upper surface elevation at nodes - real(dp), dimension(nz-1,nQuadPoints_2d) :: & - efvs_qp ! effective viscosity at each layer in a cell column - ! corresponding to a quad pt + real(dp), dimension(nQuadPoints_3d) :: & + efvs_qp ! effective viscosity at a quad pt logical, parameter :: & check_symmetry_element = .true. ! if true, then check symmetry of element matrix - - real(dp), dimension(nx,ny) :: & - flwafact_2d ! vertically averaged flow factor + !Note: Can speed up assembly a bit by setting to false for production integer :: i, j, k, n, p - integer :: iVertex, jVertex + integer :: iNode, jNode, kNode if (verbose_matrix .and. main_task) then write(iulog,*) ' ' - write(iulog,*) 'In assemble_stiffness_matrix_2d' + write(iulog,*) 'In assemble_stiffness_matrix_3d' + write(iulog,*) 'itest, jtest, ktest, rtest =', itest, jtest, ktest, rtest endif ! Initialize effective viscosity @@ -5350,220 +5445,125 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & ! Initialize global stiffness matrix - Auu(:,:,:) = 0.d0 - Auv(:,:,:) = 0.d0 - Avu(:,:,:) = 0.d0 - Avv(:,:,:) = 0.d0 - - ! Compute vertical average of flow factor (SSA only) - if (whichapprox == HO_APPROX_SSA) then - call glissade_vertical_average(nx, ny, & - nz, sigma, & - flwafact, flwafact_2d) - endif + Auu(:,:,:,:) = 0.d0 + Auv(:,:,:,:) = 0.d0 + Avu(:,:,:,:) = 0.d0 + Avv(:,:,:,:) = 0.d0 ! Sum over elements in active cells ! Loop over all cells that border locally owned vertices. do j = nhalo+1, ny-nhalo+1 do i = nhalo+1, nx-nhalo+1 - - if (active_cell(i,j)) then - - ! 
Initialize element stiffness matrix - Kuu(:,:) = 0.d0 - Kuv(:,:) = 0.d0 - Kvu(:,:) = 0.d0 - Kvv(:,:) = 0.d0 - - ! Compute spatial coordinates, velocity, thickness and surface elevation for each vertex - ! Also compute surface elevation gradient (for L1L2) and basal traction (for DIVA) - do n = 1, nNodesPerElement_2d - ! Determine (i,j) for this vertex - ! The reason for the '3' is that node 3, in the NE corner of the grid cell, has index (i,j). - ! Indices for other nodes are computed relative to this vertex. - iVertex = i + ishift(3,n) - jVertex = j + jshift(3,n) - - x(n) = xVertex(iVertex,jVertex) - y(n) = yVertex(iVertex,jVertex) - u(n) = uvel_2d(iVertex,jVertex) - v(n) = vvel_2d(iVertex,jVertex) - s(n) = stagusrf(iVertex,jVertex) - h(n) = stagthck(iVertex,jVertex) - if (present(dusrf_dx) .and. present(dusrf_dy)) then ! L1L2 - dsdx(n) = dusrf_dx(iVertex,jVertex) - dsdy(n) = dusrf_dy(iVertex,jVertex) - endif - if (present(btractx) .and. present(btracty)) then ! DIVA - bx(n) = btractx(iVertex,jVertex) - by(n) = btracty(iVertex,jVertex) - endif - - if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'i, j, n, x, y:', i, j, n, x(n), y(n) - write(iulog,*) 's, h, u, v:', s(n), h(n), u(n), v(n) - if (present(btractx) .and. present(btracty)) write(iulog,*) 'bx, by:', bx(n), by(n) - endif - - enddo ! vertices per element - - ! Loop over quadrature points for this element - - do p = 1, nQuadPoints_2d - - ! Evaluate the derivatives of the element basis functions at this quadrature point. + if (active_cell(i,j)) then - call get_basis_function_derivatives_2d(x(:), y(:), & - dphi_dxr_2d(:,p), dphi_dyr_2d(:,p), & - dphi_dx_2d(:), dphi_dy_2d(:), & - detJ(p), & - itest, jtest, rtest, & - i, j, p) + do k = 1, nz-1 ! loop over elements in this column + ! assume k increases from upper surface to bed - dphi_dz_2d(:) = 0.d0 + ! 
Initialize element stiffness matrix + Kuu(:,:) = 0.d0 + Kuv(:,:) = 0.d0 + Kvu(:,:) = 0.d0 + Kvv(:,:) = 0.d0 - if (whichapprox == HO_APPROX_L1L2) then + ! compute spatial coordinates, velocity, and upper surface elevation for each node - ! Compute effective viscosity for each layer at this quadrature point - !TODO - sigma -> stagsigma for L1L2 viscosity? - call compute_effective_viscosity_L1L2(whichefvs, & - efvs_constant, effstrain_min, & - nz, sigma, & - nNodesPerElement_2d, phi_2d(:,p), & - dphi_dx_2d(:), dphi_dy_2d(:), & - u(:), v(:), & - h(:), & - dsdx(:), dsdy(:), & - flwa(:,i,j), flwafact(:,i,j), & - efvs_qp(:,p), & - itest, jtest, rtest, & - i, j, p) + do n = 1, nNodesPerElement_3d - ! Compute vertical average of effective viscosity - efvs_qp_vertavg(p) = 0.d0 - do k = 1, nz-1 - efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p) * (sigma(k+1) - sigma(k)) - enddo + ! Determine (k,i,j) for this node + ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). + ! Indices for other nodes are computed relative to this node. + iNode = i + ishift(7,n) + jNode = j + jshift(7,n) + kNode = k + kshift(7,n) - elseif (whichapprox == HO_APPROX_DIVA) then + x(n) = xVertex(iNode,jNode) + y(n) = yVertex(iNode,jNode) + z(n) = stagusrf(iNode,jNode) - sigma(kNode)*stagthck(iNode,jNode) + u(n) = uvel(kNode,iNode,jNode) + v(n) = vvel(kNode,iNode,jNode) + s(n) = stagusrf(iNode,jNode) - ! Copy efvs_qp from global array to local column array - efvs_qp(:,:) = efvs_qp_3d(:,:,i,j) + if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then + write(iulog,*) ' ' + write(iulog,*) 'i, j, k, n, x, y, z:', i, j, k, n, x(n), y(n), z(n) + write(iulog,*) 's, u, v:', s(n), u(n), v(n) + endif - ! Compute effective viscosity for each layer at this quadrature point - ! 
Note: efvs_qp_3d is intent(inout); old value is used to compute new value - call compute_effective_viscosity_diva(whichefvs, & - efvs_constant, effstrain_min, & - nz, stagsigma, & - nNodesPerElement_2d, phi_2d(:,p), & - dphi_dx_2d(:), dphi_dy_2d(:), & - u(:), v(:), & - bx(:), by(:), & - diva_slope_factor_x(i,j), & - diva_slope_factor_y(i,j), & - h(:), & - flwa(:,i,j), flwafact(:,i,j), & - efvs_qp(:,p), & - itest, jtest, rtest, & - i, j, p) + enddo ! nodes per element - if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then - write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(:,p) - endif + ! Loop over quadrature points for this element - !WHL - Copy local efvs_qp to the global array - efvs_qp_3d(:,:,i,j) = efvs_qp(:,:) + do p = 1, nQuadPoints_3d - ! Compute vertical average of effective viscosity - efvs_qp_vertavg(p) = 0.d0 - do k = 1, nz-1 - efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p)*(sigma(k+1) - sigma(k)) - enddo + ! Evaluate the derivatives of the element basis functions at this quadrature point. + !WHL - Pass in i, j, k, and p to the following subroutines for debugging. - else ! SSA + call get_basis_function_derivatives_3d(x(:), y(:), z(:), & + dphi_dxr_3d(:,p), dphi_dyr_3d(:,p), dphi_dzr_3d(:,p), & + dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & + detJ(p), & + itest, jtest, rtest, & + i, j, k, p) - ! Compute vertically averaged effective viscosity at this quadrature point - !TODO - Why do we pass in dphi_dz_2d here and not elsewhere? +! call t_startf('glissade_effective_viscosity') call compute_effective_viscosity(whichefvs, whichapprox, & efvs_constant, effstrain_min, & - nNodesPerElement_2d, & - dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & + nNodesPerElement_3d, & + dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & u(:), v(:), & - flwafact_2d(i,j), efvs_qp_vertavg(p), & + flwafact(k,i,j), efvs_qp(p), & itest, jtest, rtest, & - i, j, 1, p) - - ! 
Copy vertically averaged value to all levels - efvs_qp(:,p) = efvs_qp_vertavg(p) - - endif ! whichapprox - - ! Compute ice thickness at this quadrature point - - h_qp = 0.d0 - do n = 1, nNodesPerElement_2d - h_qp = h_qp + phi_2d(n,p) * h(n) - enddo - - ! Increment the element stiffness matrix with the contribution from each quadrature point. - ! Note: The effective viscosity is multiplied by thickness since the equation to be solved - ! is vertically integrated. - - call compute_element_matrix(whichapprox, nNodesPerElement_2d, & - wqp_2d(p), detJ(p), & - h_qp*efvs_qp_vertavg(p), & - dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & - Kuu(:,:), Kuv(:,:), & - Kvu(:,:), Kvv(:,:), & - itest, jtest, rtest, & - i, j, 1, p ) + i, j, k, p ) +! call t_stopf('glissade_effective_viscosity') - enddo ! nQuadPoints_2d + if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then + write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(p) + endif - if (whichapprox == HO_APPROX_DIVA) then + ! Increment the element stiffness matrix with the contribution from each quadrature point. - ! Compute vertical integrals needed for the 2D solve and 3D velocity reconstruction - call compute_integrals_diva(nz, sigma, & - itest, jtest, rtest, & - thck(i,j), efvs_qp(:,:), & - omega_k(:,i,j), omega(i,j), & - i, j) +! call t_startf('glissade_compute_element_matrix') + call compute_element_matrix(whichapprox, nNodesPerElement_3d, & + wqp_3d(p), detJ(p), efvs_qp(p), & + dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & + Kuu(:,:), Kuv(:,:), & + Kvu(:,:), Kvv(:,:), & + itest, jtest, rtest, & + i, j, k, p ) +! call t_stopf('glissade_compute_element_matrix') - endif + enddo ! nQuadPoints_3d - ! Compute average of effective viscosity over quad points. - ! For L1L2 and DIVA there is a different efvs in each layer. - ! For SSA, simply write the vertical average value to each layer. + ! 
Compute average of effective viscosity over quad pts + efvs(k,i,j) = 0.d0 - efvs(:,i,j) = 0.d0 - do p = 1, nQuadPoints_2d - do k = 1, nz-1 - efvs(k,i,j) = efvs(k,i,j) + efvs_qp(k,p) + do p = 1, nQuadPoints_3d + efvs(k,i,j) = efvs(k,i,j) + efvs_qp(p) enddo - enddo - efvs(:,i,j) = efvs(:,i,j) / nQuadPoints_2d + efvs(k,i,j) = efvs(k,i,j) / nQuadPoints_3d + + if (check_symmetry_element) then + call check_symmetry_element_matrix(nNodesPerElement_3d, & + Kuu, Kuv, Kvu, Kvv) + endif - if (check_symmetry_element) then - call check_symmetry_element_matrix(nNodesPerElement_2d, & - Kuu, Kuv, Kvu, Kvv) - endif + ! Sum terms of element matrix K into dense assembled matrix A - ! Sum the terms of element matrix K into the dense assembled matrix A + call element_to_global_matrix_3d(nx, ny, nz, & + i, j, k, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & + Avu, Avv) - call element_to_global_matrix_2d(nx, ny, & - i, j, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & - Avu, Avv) + enddo ! nz (loop over elements in this column) if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest) then write(iulog,*) ' ' - write(iulog,*) 'Assembled 2D matrix, i, j =', i, j + write(iulog,*) 'Assembled 3D matrix, i, j =', i, j write(iulog,*) 'k, flwafact, efvs:' do k = 1, nz-1 write(iulog,*) k, flwafact(k,i,j), efvs(k,i,j) @@ -5575,7 +5575,7 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & enddo ! i enddo ! 
j - end subroutine assemble_stiffness_matrix_2d + end subroutine assemble_stiffness_matrix_3d !**************************************************************************** @@ -6274,127 +6274,104 @@ end subroutine compute_3d_velocity_L1L2 !**************************************************************************** - subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, & - dphi_dxr_3d, dphi_dyr_3d, dphi_dzr_3d, & - dphi_dx_3d, dphi_dy_3d, dphi_dz_3d, & - detJ, & - itest, jtest, rtest, & - i, j, k, p) + subroutine get_basis_function_derivatives_2d(xNode, yNode, & + dphi_dxr_2d, dphi_dyr_2d, & + dphi_dx_2d, dphi_dy_2d, & + detJ, & + itest, jtest, rtest, & + i, j, p) !------------------------------------------------------------------ - ! Evaluate the x, y and z derivatives of the element basis functions + ! Evaluate the x and y derivatives of 2D element basis functions ! at a particular quadrature point. ! ! Also determine the Jacobian of the transformation between the ! reference element and the true element. ! - ! This subroutine should work for any 3D element with any number of nodes. + ! This subroutine should work for any 2D element with any number of nodes. !------------------------------------------------------------------ - - real(dp), dimension(nNodesPerElement_3d), intent(in) :: & - xNode, yNode, zNode, &! nodal coordinates - dphi_dxr_3d, dphi_dyr_3d, dphi_dzr_3d ! derivatives of basis functions at quad pt - ! wrt x, y and z in reference element - real(dp), dimension(nNodesPerElement_3d), intent(out) :: & - dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis functions at quad pt - ! wrt x, y and z in true Cartesian coordinates + real(dp), dimension(nNodesPerElement_2d), intent(in) :: & + xNode, yNode, &! nodal coordinates + dphi_dxr_2d, dphi_dyr_2d ! derivatives of basis functions at quad pt + ! wrt x and y in reference element + + real(dp), dimension(nNodesPerElement_2d), intent(out) :: & + dphi_dx_2d, dphi_dy_2d ! 
derivatives of basis functions at quad pt + ! wrt x and y in true Cartesian coordinates real(dp), intent(out) :: & - detJ ! determinant of Jacobian matrix + detJ ! determinant of Jacobian matrix - real(dp), dimension(3,3) :: & - Jac, &! Jacobian matrix - Jinv, &! inverse Jacobian matrix - cofactor ! matrix of cofactors + real(dp), dimension(2,2) :: & + Jac, &! Jacobian matrix + Jinv ! inverse Jacobian matrix integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point - integer, intent(in) :: i, j, k, p ! indices passed in for debugging + integer, intent(in) :: i, j, p integer :: n, row, col logical, parameter :: Jac_bug_check = .false. ! set to true for debugging - real(dp), dimension(3,3) :: prod ! Jac * Jinv (should be identity matrix) + real(dp), dimension(2,2) :: prod ! Jac * Jinv (should be identity matrix) !------------------------------------------------------------------ ! Compute the Jacobian for the transformation from the reference ! coordinates to the true coordinates: ! - ! | | - ! | sum_n{dphi_n/dxr * xn} sum_n{dphi_n/dxr * yn} sum_n{dphi_n/dxr * zn} | - ! J(xr,yr,zr) = | | - ! | sum_n{dphi_n/dyr * xn} sum_n{dphi_n/dyr * yn} sum_n{dphi_n/dyr * zn} | - ! | | - ! | sum_n{dphi_n/dzr * xn} sum_n{dphi_n/dzr * yn} sum_n{dphi_n/dzr * zn} | - ! ! | + ! | | + ! | sum_n{dphi_n/dxr * xn} sum_n{dphi_n/dxr * yn} | + ! J(xr,yr) = | | + ! | sum_n{dphi_n/dyr * xn} sum_n{dphi_n/dyr * yn} | + ! | | ! - ! where (xn,yn,zn) are the true Cartesian nodal coordinates, - ! (xr,yr,zr) are the coordinates of the quad point in the reference element, + ! where (xn,yn) are the true Cartesian nodal coordinates, + ! (xr,yr) are the coordinates of the quad point in the reference element, ! and sum_n denotes a sum over nodes. !------------------------------------------------------------------ - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then + Jac(:,:) = 0.d0 + + if ((verbose_Jac .or. verbose_diva) .and. 
this_rank==rtest .and. i==itest .and. j==jtest) then write(iulog,*) ' ' - write(iulog,*) 'In get_basis_function_derivatives_3d: i, j, k, p =', i, j, k, p + write(iulog,*) 'In get_basis_function_derivatives_2d: i, j, p =', i, j, p endif - Jac(:,:) = 0.d0 - - do n = 1, nNodesPerElement_3d - Jac(1,1) = Jac(1,1) + dphi_dxr_3d(n) * xNode(n) - Jac(1,2) = Jac(1,2) + dphi_dxr_3d(n) * yNode(n) - Jac(1,3) = Jac(1,3) + dphi_dxr_3d(n) * zNode(n) - Jac(2,1) = Jac(2,1) + dphi_dyr_3d(n) * xNode(n) - Jac(2,2) = Jac(2,2) + dphi_dyr_3d(n) * yNode(n) - Jac(2,3) = Jac(2,3) + dphi_dyr_3d(n) * zNode(n) - Jac(3,1) = Jac(3,1) + dphi_dzr_3d(n) * xNode(n) - Jac(3,2) = Jac(3,2) + dphi_dzr_3d(n) * yNode(n) - Jac(3,3) = Jac(3,3) + dphi_dzr_3d(n) * zNode(n) + do n = 1, nNodesPerElement_2d + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'n, x, y:', n, xNode(n), yNode(n) + write(iulog,*) 'dphi_dxr_2d, dphi_dyr_2d:', dphi_dxr_2d(n), dphi_dyr_2d(n) + endif + Jac(1,1) = Jac(1,1) + dphi_dxr_2d(n) * xNode(n) + Jac(1,2) = Jac(1,2) + dphi_dxr_2d(n) * yNode(n) + Jac(2,1) = Jac(2,1) + dphi_dyr_2d(n) * xNode(n) + Jac(2,2) = Jac(2,2) + dphi_dyr_2d(n) * yNode(n) enddo !------------------------------------------------------------------ ! 
Compute the determinant and inverse of J !------------------------------------------------------------------ - cofactor(1,1) = Jac(2,2)*Jac(3,3) - Jac(2,3)*Jac(3,2) - cofactor(1,2) = -(Jac(2,1)*Jac(3,3) - Jac(2,3)*Jac(3,1)) - cofactor(1,3) = Jac(2,1)*Jac(3,2) - Jac(2,2)*Jac(3,1) - cofactor(2,1) = -(Jac(1,2)*Jac(3,3) - Jac(1,3)*Jac(3,2)) - cofactor(2,2) = Jac(1,1)*Jac(3,3) - Jac(1,3)*Jac(3,1) - cofactor(2,3) = -(Jac(1,1)*Jac(3,2) - Jac(1,2)*Jac(3,1)) - cofactor(3,1) = Jac(1,2)*Jac(2,3) - Jac(1,3)*Jac(2,2) - cofactor(3,2) = -(Jac(1,1)*Jac(2,3) - Jac(1,3)*Jac(2,1)) - cofactor(3,3) = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1) - - detJ = Jac(1,1)*cofactor(1,1) + Jac(1,2)*cofactor(1,2) + Jac(1,3)*cofactor(1,3) - - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then - write(iulog,*) ' ' - write(iulog,*) 'detJ1:', Jac(1,1)*cofactor(1,1) + Jac(1,2)*cofactor(1,2) + Jac(1,3)*cofactor(1,3) - write(iulog,*) 'detJ2:', Jac(2,1)*cofactor(2,1) + Jac(2,2)*cofactor(2,2) + Jac(2,3)*cofactor(2,3) - write(iulog,*) 'detJ3:', Jac(3,1)*cofactor(3,1) + Jac(3,2)*cofactor(3,2) + Jac(3,3)*cofactor(3,3) - endif + detJ = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1) if (abs(detJ) > 0.d0) then - do col = 1, 3 - do row = 1, 3 - Jinv(row,col) = cofactor(col,row) - enddo - enddo - Jinv(:,:) = Jinv(:,:) / detJ + Jinv(1,1) = Jac(2,2)/detJ + Jinv(1,2) = -Jac(1,2)/detJ + Jinv(2,1) = -Jac(2,1)/detJ + Jinv(2,2) = Jac(1,1)/detJ else write(iulog,*) 'stopping, det J = 0' - write(iulog,*) 'i, j, k, p:', i, j, k, p + write(iulog,*) 'i, j, p:', i, j, p write(iulog,*) 'Jacobian matrix:' write(iulog,*) Jac(1,:) write(iulog,*) Jac(2,:) - write(iulog,*) Jac(3,:) call write_log('Jacobian matrix is singular', GM_FATAL) endif - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then write(iulog,*) ' ' write(iulog,*) 'Jacobian calc, p =', p write(iulog,*) 'det J =', detJ @@ -6402,204 +6379,205 @@ subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, write(iulog,*) 'Jacobian matrix:' write(iulog,*) Jac(1,:) write(iulog,*) Jac(2,:) - write(iulog,*) Jac(3,:) - write(iulog,*) ' ' - write(iulog,*) 'cofactor matrix:' - write(iulog,*) cofactor(1,:) - write(iulog,*) cofactor(2,:) - write(iulog,*) cofactor(3,:) write(iulog,*) ' ' write(iulog,*) 'Inverse matrix:' write(iulog,*) Jinv(1,:) write(iulog,*) Jinv(2,:) - write(iulog,*) Jinv(3,:) write(iulog,*) ' ' prod = matmul(Jac, Jinv) write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) - write(iulog,*) prod(3,:) endif - ! Optional bug check: Verify that J * Jinv = I + ! Optional bug check - Verify that J * Jinv = I if (Jac_bug_check) then prod = matmul(Jac,Jinv) - do col = 1, 3 - do row = 1, 3 - if (abs(prod(row,col) - identity3(row,col)) > eps10) then - write(iulog,*) '3d Jacobian, stopping, Jac * Jinv /= identity' - write(iulog,*) 'rank, i, j, k, p:', this_rank, i, j, k, p + do col = 1, 2 + do row = 1, 2 + if (abs(prod(row,col) - identity3(row,col)) > 1.d-12) then + write(iulog,*) '2d Jacobian, stopping, Jac * Jinv /= identity' + write(iulog,*) 'rank, i, j, p:', this_rank, i, j, p write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) - write(iulog,*) prod(3,:) call write_log('Jacobian matrix was not correctly inverted', GM_FATAL) endif enddo enddo - endif ! Jac_bug_check + endif !------------------------------------------------------------------ ! Compute the contribution of this quadrature point to dphi/dx and dphi/dy ! for each basis function. ! ! | dphi_n/dx | | dphi_n/dxr | - ! | | | | - ! | dphi_n/dy | = Jinv * | dphi_n/dyr | - ! | | | | - ! | dphi_n/dz | | dphi_n/dzr | + ! | | = Jinv * | | + ! | dphi_n/dy | | dphi_n/dyr | ! 
!------------------------------------------------------------------ - dphi_dx_3d(:) = 0.d0 - dphi_dy_3d(:) = 0.d0 - dphi_dz_3d(:) = 0.d0 + dphi_dx_2d(:) = 0.d0 + dphi_dy_2d(:) = 0.d0 - do n = 1, nNodesPerElement_3d - dphi_dx_3d(n) = Jinv(1,1)*dphi_dxr_3d(n) & - + Jinv(1,2)*dphi_dyr_3d(n) & - + Jinv(1,3)*dphi_dzr_3d(n) - dphi_dy_3d(n) = Jinv(2,1)*dphi_dxr_3d(n) & - + Jinv(2,2)*dphi_dyr_3d(n) & - + Jinv(2,3)*dphi_dzr_3d(n) - dphi_dz_3d(n) = Jinv(3,1)*dphi_dxr_3d(n) & - + Jinv(3,2)*dphi_dyr_3d(n) & - + Jinv(3,3)*dphi_dzr_3d(n) + do n = 1, nNodesPerElement_2d + dphi_dx_2d(n) = dphi_dx_2d(n) + Jinv(1,1)*dphi_dxr_2d(n) & + + Jinv(1,2)*dphi_dyr_2d(n) + dphi_dy_2d(n) = dphi_dy_2d(n) + Jinv(2,1)*dphi_dxr_2d(n) & + + Jinv(2,2)*dphi_dyr_2d(n) enddo + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'dphi_dx_2d:', dphi_dx_2d(:) + write(iulog,*) 'dphi_dy_2d:', dphi_dy_2d(:) + endif + if (Jac_bug_check) then ! Check that the sum of dphi_dx, etc. 
is close to zero - - if (abs( sum(dphi_dx_3d)/maxval(dphi_dx_3d) ) > 1.d-11) then + if (abs( sum(dphi_dx_2d)/maxval(dphi_dx_2d) ) > 1.d-11) then write(iulog,*) 'stopping, sum over basis functions of dphi_dx > 0' - write(iulog,*) 'dphi_dx_3d =', dphi_dx_3d(:) - write(iulog,*) 'sum =', sum(dphi_dx_3d) - write(iulog,*) 'i, j, k, p =', i, j, k, p + write(iulog,*) 'dphi_dx_2d =', dphi_dx_2d(:) + write(iulog,*) 'i, j, p =', i, j, p call write_log('Sum over basis functions of dphi_dx /= 0', GM_FATAL) endif - if (abs( sum(dphi_dy_3d)/maxval(dphi_dy_3d) ) > 1.d-11) then + if (abs( sum(dphi_dy_2d)/maxval(dphi_dy_2d) ) > 1.d-11) then write(iulog,*) 'stopping, sum over basis functions of dphi_dy > 0' - write(iulog,*) 'dphi_dy_3d =', dphi_dy_3d(:) - write(iulog,*) 'sum =', sum(dphi_dy_3d) - write(iulog,*) 'i, j, k, p =', i, j, k, p + write(iulog,*) 'dphi_dy =', dphi_dy_2d(:) + write(iulog,*) 'i, j, p =', i, j, p call write_log('Sum over basis functions of dphi_dy /= 0', GM_FATAL) endif - if (abs( sum(dphi_dz_3d)/maxval(dphi_dz_3d) ) > 1.d-11) then - write(iulog,*) 'stopping, sum over basis functions of dphi_dz > 0' - write(iulog,*) 'dphi_dz_3d =', dphi_dz_3d(:) - write(iulog,*) 'sum =', sum(dphi_dz_3d) - write(iulog,*) 'i, j, k, p =', i, j, k, p - call write_log('Sum over basis functions of dphi_dz /= 0', GM_FATAL) - endif - - endif ! Jac_bug_check - - end subroutine get_basis_function_derivatives_3d + endif + end subroutine get_basis_function_derivatives_2d + !**************************************************************************** - subroutine get_basis_function_derivatives_2d(xNode, yNode, & - dphi_dxr_2d, dphi_dyr_2d, & - dphi_dx_2d, dphi_dy_2d, & - detJ, & - itest, jtest, rtest, & - i, j, p) + subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, & + dphi_dxr_3d, dphi_dyr_3d, dphi_dzr_3d, & + dphi_dx_3d, dphi_dy_3d, dphi_dz_3d, & + detJ, & + itest, jtest, rtest, & + i, j, k, p) !------------------------------------------------------------------ - ! 
Evaluate the x and y derivatives of 2D element basis functions + ! Evaluate the x, y and z derivatives of the element basis functions ! at a particular quadrature point. ! ! Also determine the Jacobian of the transformation between the ! reference element and the true element. ! - ! This subroutine should work for any 2D element with any number of nodes. + ! This subroutine should work for any 3D element with any number of nodes. !------------------------------------------------------------------ + + real(dp), dimension(nNodesPerElement_3d), intent(in) :: & + xNode, yNode, zNode, &! nodal coordinates + dphi_dxr_3d, dphi_dyr_3d, dphi_dzr_3d ! derivatives of basis functions at quad pt + ! wrt x, y and z in reference element - real(dp), dimension(nNodesPerElement_2d), intent(in) :: & - xNode, yNode, &! nodal coordinates - dphi_dxr_2d, dphi_dyr_2d ! derivatives of basis functions at quad pt - ! wrt x and y in reference element - - real(dp), dimension(nNodesPerElement_2d), intent(out) :: & - dphi_dx_2d, dphi_dy_2d ! derivatives of basis functions at quad pt - ! wrt x and y in true Cartesian coordinates + real(dp), dimension(nNodesPerElement_3d), intent(out) :: & + dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis functions at quad pt + ! wrt x, y and z in true Cartesian coordinates real(dp), intent(out) :: & - detJ ! determinant of Jacobian matrix + detJ ! determinant of Jacobian matrix - real(dp), dimension(2,2) :: & - Jac, &! Jacobian matrix - Jinv ! inverse Jacobian matrix + real(dp), dimension(3,3) :: & + Jac, &! Jacobian matrix + Jinv, &! inverse Jacobian matrix + cofactor ! matrix of cofactors integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point - integer, intent(in) :: i, j, p + integer, intent(in) :: i, j, k, p ! indices passed in for debugging integer :: n, row, col logical, parameter :: Jac_bug_check = .false. ! set to true for debugging - real(dp), dimension(2,2) :: prod ! 
Jac * Jinv (should be identity matrix) + real(dp), dimension(3,3) :: prod ! Jac * Jinv (should be identity matrix) !------------------------------------------------------------------ ! Compute the Jacobian for the transformation from the reference ! coordinates to the true coordinates: ! - ! | | - ! | sum_n{dphi_n/dxr * xn} sum_n{dphi_n/dxr * yn} | - ! J(xr,yr) = | | - ! | sum_n{dphi_n/dyr * xn} sum_n{dphi_n/dyr * yn} | - ! | | + ! | | + ! | sum_n{dphi_n/dxr * xn} sum_n{dphi_n/dxr * yn} sum_n{dphi_n/dxr * zn} | + ! J(xr,yr,zr) = | | + ! | sum_n{dphi_n/dyr * xn} sum_n{dphi_n/dyr * yn} sum_n{dphi_n/dyr * zn} | + ! | | + ! | sum_n{dphi_n/dzr * xn} sum_n{dphi_n/dzr * yn} sum_n{dphi_n/dzr * zn} | + ! ! | ! - ! where (xn,yn) are the true Cartesian nodal coordinates, - ! (xr,yr) are the coordinates of the quad point in the reference element, + ! where (xn,yn,zn) are the true Cartesian nodal coordinates, + ! (xr,yr,zr) are the coordinates of the quad point in the reference element, ! and sum_n denotes a sum over nodes. !------------------------------------------------------------------ - Jac(:,:) = 0.d0 - - if ((verbose_Jac .or. verbose_diva) .and. this_rank==rtest .and. i==itest .and. j==jtest) then + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then write(iulog,*) ' ' - write(iulog,*) 'In get_basis_function_derivatives_2d: i, j, p =', i, j, p + write(iulog,*) 'In get_basis_function_derivatives_3d: i, j, k, p =', i, j, k, p endif - do n = 1, nNodesPerElement_2d - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'n, x, y:', n, xNode(n), yNode(n) - write(iulog,*) 'dphi_dxr_2d, dphi_dyr_2d:', dphi_dxr_2d(n), dphi_dyr_2d(n) - endif - Jac(1,1) = Jac(1,1) + dphi_dxr_2d(n) * xNode(n) - Jac(1,2) = Jac(1,2) + dphi_dxr_2d(n) * yNode(n) - Jac(2,1) = Jac(2,1) + dphi_dyr_2d(n) * xNode(n) - Jac(2,2) = Jac(2,2) + dphi_dyr_2d(n) * yNode(n) + Jac(:,:) = 0.d0 + + do n = 1, nNodesPerElement_3d + Jac(1,1) = Jac(1,1) + dphi_dxr_3d(n) * xNode(n) + Jac(1,2) = Jac(1,2) + dphi_dxr_3d(n) * yNode(n) + Jac(1,3) = Jac(1,3) + dphi_dxr_3d(n) * zNode(n) + Jac(2,1) = Jac(2,1) + dphi_dyr_3d(n) * xNode(n) + Jac(2,2) = Jac(2,2) + dphi_dyr_3d(n) * yNode(n) + Jac(2,3) = Jac(2,3) + dphi_dyr_3d(n) * zNode(n) + Jac(3,1) = Jac(3,1) + dphi_dzr_3d(n) * xNode(n) + Jac(3,2) = Jac(3,2) + dphi_dzr_3d(n) * yNode(n) + Jac(3,3) = Jac(3,3) + dphi_dzr_3d(n) * zNode(n) enddo !------------------------------------------------------------------ ! Compute the determinant and inverse of J !------------------------------------------------------------------ - detJ = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1) + cofactor(1,1) = Jac(2,2)*Jac(3,3) - Jac(2,3)*Jac(3,2) + cofactor(1,2) = -(Jac(2,1)*Jac(3,3) - Jac(2,3)*Jac(3,1)) + cofactor(1,3) = Jac(2,1)*Jac(3,2) - Jac(2,2)*Jac(3,1) + cofactor(2,1) = -(Jac(1,2)*Jac(3,3) - Jac(1,3)*Jac(3,2)) + cofactor(2,2) = Jac(1,1)*Jac(3,3) - Jac(1,3)*Jac(3,1) + cofactor(2,3) = -(Jac(1,1)*Jac(3,2) - Jac(1,2)*Jac(3,1)) + cofactor(3,1) = Jac(1,2)*Jac(2,3) - Jac(1,3)*Jac(2,2) + cofactor(3,2) = -(Jac(1,1)*Jac(2,3) - Jac(1,3)*Jac(2,1)) + cofactor(3,3) = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1) + + detJ = Jac(1,1)*cofactor(1,1) + Jac(1,2)*cofactor(1,2) + Jac(1,3)*cofactor(1,3) + + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. 
k==ktest) then + write(iulog,*) ' ' + write(iulog,*) 'detJ1:', Jac(1,1)*cofactor(1,1) + Jac(1,2)*cofactor(1,2) + Jac(1,3)*cofactor(1,3) + write(iulog,*) 'detJ2:', Jac(2,1)*cofactor(2,1) + Jac(2,2)*cofactor(2,2) + Jac(2,3)*cofactor(2,3) + write(iulog,*) 'detJ3:', Jac(3,1)*cofactor(3,1) + Jac(3,2)*cofactor(3,2) + Jac(3,3)*cofactor(3,3) + endif if (abs(detJ) > 0.d0) then - Jinv(1,1) = Jac(2,2)/detJ - Jinv(1,2) = -Jac(1,2)/detJ - Jinv(2,1) = -Jac(2,1)/detJ - Jinv(2,2) = Jac(1,1)/detJ + do col = 1, 3 + do row = 1, 3 + Jinv(row,col) = cofactor(col,row) + enddo + enddo + Jinv(:,:) = Jinv(:,:) / detJ else write(iulog,*) 'stopping, det J = 0' - write(iulog,*) 'i, j, p:', i, j, p + write(iulog,*) 'i, j, k, p:', i, j, k, p write(iulog,*) 'Jacobian matrix:' write(iulog,*) Jac(1,:) write(iulog,*) Jac(2,:) + write(iulog,*) Jac(3,:) call write_log('Jacobian matrix is singular', GM_FATAL) endif - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then write(iulog,*) ' ' write(iulog,*) 'Jacobian calc, p =', p write(iulog,*) 'det J =', detJ @@ -6607,81 +6585,103 @@ subroutine get_basis_function_derivatives_2d(xNode, yNode, & write(iulog,*) 'Jacobian matrix:' write(iulog,*) Jac(1,:) write(iulog,*) Jac(2,:) + write(iulog,*) Jac(3,:) + write(iulog,*) ' ' + write(iulog,*) 'cofactor matrix:' + write(iulog,*) cofactor(1,:) + write(iulog,*) cofactor(2,:) + write(iulog,*) cofactor(3,:) write(iulog,*) ' ' write(iulog,*) 'Inverse matrix:' write(iulog,*) Jinv(1,:) write(iulog,*) Jinv(2,:) + write(iulog,*) Jinv(3,:) write(iulog,*) ' ' prod = matmul(Jac, Jinv) write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) + write(iulog,*) prod(3,:) endif - ! Optional bug check - Verify that J * Jinv = I + ! 
Optional bug check: Verify that J * Jinv = I if (Jac_bug_check) then prod = matmul(Jac,Jinv) - do col = 1, 2 - do row = 1, 2 - if (abs(prod(row,col) - identity3(row,col)) > 1.d-12) then - write(iulog,*) '2d Jacobian, stopping, Jac * Jinv /= identity' - write(iulog,*) 'rank, i, j, p:', this_rank, i, j, p + do col = 1, 3 + do row = 1, 3 + if (abs(prod(row,col) - identity3(row,col)) > eps10) then + write(iulog,*) '3d Jacobian, stopping, Jac * Jinv /= identity' + write(iulog,*) 'rank, i, j, k, p:', this_rank, i, j, k, p write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) + write(iulog,*) prod(3,:) call write_log('Jacobian matrix was not correctly inverted', GM_FATAL) endif enddo enddo - endif + endif ! Jac_bug_check !------------------------------------------------------------------ ! Compute the contribution of this quadrature point to dphi/dx and dphi/dy ! for each basis function. ! ! | dphi_n/dx | | dphi_n/dxr | - ! | | = Jinv * | | - ! | dphi_n/dy | | dphi_n/dyr | + ! | | | | + ! | dphi_n/dy | = Jinv * | dphi_n/dyr | + ! | | | | + ! | dphi_n/dz | | dphi_n/dzr | ! !------------------------------------------------------------------ - dphi_dx_2d(:) = 0.d0 - dphi_dy_2d(:) = 0.d0 + dphi_dx_3d(:) = 0.d0 + dphi_dy_3d(:) = 0.d0 + dphi_dz_3d(:) = 0.d0 - do n = 1, nNodesPerElement_2d - dphi_dx_2d(n) = dphi_dx_2d(n) + Jinv(1,1)*dphi_dxr_2d(n) & - + Jinv(1,2)*dphi_dyr_2d(n) - dphi_dy_2d(n) = dphi_dy_2d(n) + Jinv(2,1)*dphi_dxr_2d(n) & - + Jinv(2,2)*dphi_dyr_2d(n) + do n = 1, nNodesPerElement_3d + dphi_dx_3d(n) = Jinv(1,1)*dphi_dxr_3d(n) & + + Jinv(1,2)*dphi_dyr_3d(n) & + + Jinv(1,3)*dphi_dzr_3d(n) + dphi_dy_3d(n) = Jinv(2,1)*dphi_dxr_3d(n) & + + Jinv(2,2)*dphi_dyr_3d(n) & + + Jinv(2,3)*dphi_dzr_3d(n) + dphi_dz_3d(n) = Jinv(3,1)*dphi_dxr_3d(n) & + + Jinv(3,2)*dphi_dyr_3d(n) & + + Jinv(3,3)*dphi_dzr_3d(n) enddo - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'dphi_dx_2d:', dphi_dx_2d(:) - write(iulog,*) 'dphi_dy_2d:', dphi_dy_2d(:) - endif - if (Jac_bug_check) then ! Check that the sum of dphi_dx, etc. is close to zero - if (abs( sum(dphi_dx_2d)/maxval(dphi_dx_2d) ) > 1.d-11) then + + if (abs( sum(dphi_dx_3d)/maxval(dphi_dx_3d) ) > 1.d-11) then write(iulog,*) 'stopping, sum over basis functions of dphi_dx > 0' - write(iulog,*) 'dphi_dx_2d =', dphi_dx_2d(:) - write(iulog,*) 'i, j, p =', i, j, p + write(iulog,*) 'dphi_dx_3d =', dphi_dx_3d(:) + write(iulog,*) 'sum =', sum(dphi_dx_3d) + write(iulog,*) 'i, j, k, p =', i, j, k, p call write_log('Sum over basis functions of dphi_dx /= 0', GM_FATAL) endif - if (abs( sum(dphi_dy_2d)/maxval(dphi_dy_2d) ) > 1.d-11) then + if (abs( sum(dphi_dy_3d)/maxval(dphi_dy_3d) ) > 1.d-11) then write(iulog,*) 'stopping, sum over basis functions of dphi_dy > 0' - write(iulog,*) 'dphi_dy =', dphi_dy_2d(:) - write(iulog,*) 'i, j, p =', i, j, p + write(iulog,*) 'dphi_dy_3d =', dphi_dy_3d(:) + write(iulog,*) 'sum =', sum(dphi_dy_3d) + write(iulog,*) 'i, j, k, p =', i, j, k, p call write_log('Sum over basis functions of dphi_dy /= 0', GM_FATAL) endif - endif + if (abs( sum(dphi_dz_3d)/maxval(dphi_dz_3d) ) > 1.d-11) then + write(iulog,*) 'stopping, sum over basis functions of dphi_dz > 0' + write(iulog,*) 'dphi_dz_3d =', dphi_dz_3d(:) + write(iulog,*) 'sum =', sum(dphi_dz_3d) + write(iulog,*) 'i, j, k, p =', i, j, k, p + call write_log('Sum over basis functions of dphi_dz /= 0', GM_FATAL) + endif - end subroutine get_basis_function_derivatives_2d + endif ! 
Jac_bug_check + + end subroutine get_basis_function_derivatives_3d !**************************************************************************** @@ -8076,148 +8076,148 @@ end subroutine compute_element_matrix !**************************************************************************** - subroutine element_to_global_matrix_3d(nx, ny, nz, & - iElement, jElement, kElement, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & + subroutine element_to_global_matrix_2d(nx, ny, & + iElement, jElement, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & Avu, Avv) - + ! Sum terms of element matrix K into dense assembled matrix A ! K is partitioned into Kuu, Kuv, Kvu, and Kvv, and similarly for A. integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions integer, intent(in) :: & - iElement, jElement, kElement ! i, j and k indices for this element + iElement, jElement ! i and j indices for this element integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point - real(dp), dimension(nNodesPerElement_3d,nNodesPerElement_3d), intent(in) :: & + real(dp), dimension(nNodesPerElement_2d,nNodesPerElement_2d), intent(in) :: & Kuu, Kuv, Kvu, Kvv ! element matrix - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & Auu, Auv, Avu, Avv ! assembled matrix - integer :: i, j, k, m - integer :: iA, jA, kA + integer :: i, j, m + integer :: iA, jA integer :: n, nr, nc - if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest .and. kElement==ktest) then - write(iulog,*) 'Element i, j, k:', iElement, jElement, kElement + if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. 
jElement==jtest) then + write(iulog,*) 'Element i, j:', iElement, jElement write(iulog,*) 'Rows of Kuu:' - do n = 1, nNodesPerElement_3d + do n = 1, nNodesPerElement_2d write(iulog, '(8e12.4)') Kuu(n,:) enddo endif - !WHL - On a Mac I tried switching the loops to put nc on the outside, but - ! the one with nr on the outside is faster. - do nr = 1, nNodesPerElement_3d ! rows of K + do nr = 1, nNodesPerElement_2d ! rows of K - ! Determine row of A to be incremented by finding (k,i,j) for this node - ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). + ! Determine row of A to be incremented by finding (i,j) for this node + ! The reason for the '3' is that node 3, in the NE corner of this gridcell, has index (i,j). ! Indices for other nodes are computed relative to this node. - i = iElement + ishift(7,nr) - j = jElement + jshift(7,nr) - k = kElement + kshift(7,nr) + i = iElement + ishift(3,nr) + j = jElement + jshift(3,nr) - do nc = 1, nNodesPerElement_3d ! columns of K + do nc = 1, nNodesPerElement_2d ! columns of K ! Determine column of A to be incremented - kA = kshift(nr,nc) ! k index of A into which K(m,n) is summed - iA = ishift(nr,nc) ! similarly for i and j indices + iA = ishift(nr,nc) ! similarly for i and j indices jA = jshift(nr,nc) ! these indices can take values -1, 0 and 1 - m = indxA_3d(iA,jA,kA) + m = indxA_2d(iA,jA) ! Increment A - Auu(m,k,i,j) = Auu(m,k,i,j) + Kuu(nr,nc) - Auv(m,k,i,j) = Auv(m,k,i,j) + Kuv(nr,nc) - Avu(m,k,i,j) = Avu(m,k,i,j) + Kvu(nr,nc) - Avv(m,k,i,j) = Avv(m,k,i,j) + Kvv(nr,nc) + Auu(i,j,m) = Auu(i,j,m) + Kuu(nr,nc) + Auv(i,j,m) = Auv(i,j,m) + Kuv(nr,nc) + Avu(i,j,m) = Avu(i,j,m) + Kvu(nr,nc) + Avv(i,j,m) = Avv(i,j,m) + Kvv(nr,nc) - enddo ! nc + if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. 
jElement==jtest) then + write(iulog,*) 'Increment Auu, element i, j, nr, nc =', iElement, jElement, nr, nc + write(iulog,*) ' i, j, m, Kuu, new Auu:', i, j, m, Kuu(nr,nc), Auu(i,j,m) +!! write(iulog,*) 'Increment Avv, element i, j, nr, nc =', iElement, jElement, nr, nc +!! write(iulog,*) ' i, j, m, Kvv, new Avv:', i, j, m, Kvv(nr,nc), Avv(i,j,m) + endif + enddo ! nc enddo ! nr - end subroutine element_to_global_matrix_3d - + end subroutine element_to_global_matrix_2d + !**************************************************************************** - subroutine element_to_global_matrix_2d(nx, ny, & - iElement, jElement, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & + subroutine element_to_global_matrix_3d(nx, ny, nz, & + iElement, jElement, kElement, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & Avu, Avv) - + ! Sum terms of element matrix K into dense assembled matrix A ! K is partitioned into Kuu, Kuv, Kvu, and Kvv, and similarly for A. integer, intent(in) :: & - nx, ny ! horizontal grid dimensions + nx, ny, & ! horizontal grid dimensions + nz ! number of vertical levels where velocity is computed integer, intent(in) :: & - iElement, jElement ! i and j indices for this element + iElement, jElement, kElement ! i, j and k indices for this element integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point - real(dp), dimension(nNodesPerElement_2d,nNodesPerElement_2d), intent(in) :: & + real(dp), dimension(nNodesPerElement_3d,nNodesPerElement_3d), intent(in) :: & Kuu, Kuv, Kvu, Kvv ! element matrix - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) :: & Auu, Auv, Avu, Avv ! assembled matrix - integer :: i, j, m - integer :: iA, jA + integer :: i, j, k, m + integer :: iA, jA, kA integer :: n, nr, nc - if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. 
jElement==jtest) then - write(iulog,*) 'Element i, j:', iElement, jElement + if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest .and. kElement==ktest) then + write(iulog,*) 'Element i, j, k:', iElement, jElement, kElement write(iulog,*) 'Rows of Kuu:' - do n = 1, nNodesPerElement_2d + do n = 1, nNodesPerElement_3d write(iulog, '(8e12.4)') Kuu(n,:) enddo endif - do nr = 1, nNodesPerElement_2d ! rows of K + !WHL - On a Mac I tried switching the loops to put nc on the outside, but + ! the one with nr on the outside is faster. + do nr = 1, nNodesPerElement_3d ! rows of K - ! Determine row of A to be incremented by finding (i,j) for this node - ! The reason for the '3' is that node 3, in the NE corner of this gridcell, has index (i,j). + ! Determine row of A to be incremented by finding (k,i,j) for this node + ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). ! Indices for other nodes are computed relative to this node. - i = iElement + ishift(3,nr) - j = jElement + jshift(3,nr) + i = iElement + ishift(7,nr) + j = jElement + jshift(7,nr) + k = kElement + kshift(7,nr) - do nc = 1, nNodesPerElement_2d ! columns of K + do nc = 1, nNodesPerElement_3d ! columns of K ! Determine column of A to be incremented - iA = ishift(nr,nc) ! similarly for i and j indices + kA = kshift(nr,nc) ! k index of A into which K(m,n) is summed + iA = ishift(nr,nc) ! similarly for i and j indices jA = jshift(nr,nc) ! these indices can take values -1, 0 and 1 - m = indxA_2d(iA,jA) + m = indxA_3d(iA,jA,kA) ! Increment A - Auu(i,j,m) = Auu(i,j,m) + Kuu(nr,nc) - Auv(i,j,m) = Auv(i,j,m) + Kuv(nr,nc) - Avu(i,j,m) = Avu(i,j,m) + Kvu(nr,nc) - Avv(i,j,m) = Avv(i,j,m) + Kvv(nr,nc) - - if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. 
jElement==jtest) then - write(iulog,*) 'Increment Auu, element i, j, nr, nc =', iElement, jElement, nr, nc - write(iulog,*) ' i, j, m, Kuu, new Auu:', i, j, m, Kuu(nr,nc), Auu(i,j,m) -!! write(iulog,*) 'Increment Avv, element i, j, nr, nc =', iElement, jElement, nr, nc -!! write(iulog,*) ' i, j, m, Kvv, new Avv:', i, j, m, Kvv(nr,nc), Avv(i,j,m) - endif + Auu(m,k,i,j) = Auu(m,k,i,j) + Kuu(nr,nc) + Auv(m,k,i,j) = Auv(m,k,i,j) + Kuv(nr,nc) + Avu(m,k,i,j) = Avu(m,k,i,j) + Kvu(nr,nc) + Avv(m,k,i,j) = Avv(m,k,i,j) + Kvv(nr,nc) enddo ! nc + enddo ! nr - end subroutine element_to_global_matrix_2d + end subroutine element_to_global_matrix_3d !**************************************************************************** !WHL, May 2025: @@ -9018,65 +9018,237 @@ subroutine basal_sliding_bc_3d(nx, ny, & iA = ishift(nr,nc) ! iA index of A into which K(nr,nc) is summed jA = jshift(nr,nc) ! similarly for jA - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(iA,jA,0) - else ! 2D problem - m = indxA_2d(iA,jA) - endif + if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem + m = indxA_3d(iA,jA,0) + else ! 2D problem + m = indxA_2d(iA,jA) + endif + + Auu(m,ii,jj) = Auu(m,ii,jj) + Kuu(nr,nc) + Avv(m,ii,jj) = Avv(m,ii,jj) + Kvv(nr,nc) + + if (verbose_basal .and. this_rank==rtest .and. ii==itest .and. jj==jtest .and. m==5) then + ! m = 5 gives the influence of beta at vertex(i,j) on velocity at vertex(ii,jj). + ! For local assembly, Auu and Avv get nonzero increments only for m = 5. + write(iulog,*) 'Basal increment for Auu and Avv: source (i,j), Kuu, new Auu, ii, jj, m =', & + i, j, Kuu(nr,nc), Auu(m,ii,jj), ii, jj, m + endif + + enddo ! nc + enddo ! nr + + if (verbose_basal .and. this_rank==rtest .and. i==itest .and. j==jtest) then +! write(iulog,*) ' ' +! write(iulog,*) 'i, j =', i, j +! write(iulog,*) 'Kuu:' +! do nr = 1, nNodesPerElement_2d +! write(iulog,*) nr, Kuu(nr,:) +! enddo +! write(iulog,*) ' ' +! write(iulog,*) 'rowsum(Kuu):' +! 
do nr = 1, nNodesPerElement_2d +! write(iulog,*) nr, sum(Kuu(nr,:)) +! enddo +! write(iulog,*) ' ' +! write(iulog,*) 'sum(Kuu):', sum(Kuu(:,:)) + endif + + enddo ! nQuadPoints_2d + + endif ! active_cell + + enddo ! i + enddo ! j + + endif ! whichassemble_beta + + if (verbose_basal .and. this_rank==rtest) then + i = itest + j = jtest + if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem + m = indxA_3d(0,0,0) + else + m = indxA_2d(0,0) + endif + write(iulog,*) ' ' + write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m + write(iulog,*) 'New Auu diagonal:', Auu(m,i,j) + write(iulog,*) 'New Avv diagonal:', Avv(m,i,j) + endif + + end subroutine basal_sliding_bc_3d + +!**************************************************************************** + + subroutine dirichlet_boundary_conditions_2d(nx, ny, & + nhalo, & + active_vertex, & + umask_dirichlet, vmask_dirichlet, & + uvel, vvel, & + Auu, Auv, & + Avu, Avv, & + bu, bv) + + !---------------------------------------------------------------- + ! Modify the global matrix and RHS for Dirichlet boundary conditions, + ! where uvel and vvel are prescribed at certain nodes. + ! For each such node, we zero out the row, except for setting the diagonal term to 1. + ! We also zero out the column, moving terms containing uvel/vvel to the rhs. + !---------------------------------------------------------------- + + !---------------------------------------------------------------- + ! Input-output arguments + !---------------------------------------------------------------- + + integer, intent(in) :: & + nx, ny, & ! horizontal grid dimensions + nhalo ! number of halo layers + + logical, dimension(nx-1,ny-1), intent(in) :: & + active_vertex ! true for active vertices (vertices of active cells) + + integer, dimension(nx-1,ny-1), intent(in) :: & + umask_dirichlet, &! Dirichlet mask for velocity (if true, u is prescribed) + vmask_dirichlet ! 
Dirichlet mask for velocity (if true, v is prescribed) + + real(dp), dimension(nx-1,ny-1), intent(in) :: & + uvel, vvel ! velocity components + + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts + Avu, Avv + + real(dp), dimension(nx-1,ny-1), intent(inout) :: & + bu, bv ! assembled load vector, divided into 2 parts + + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- + + integer :: i, j ! Cartesian indices of nodes + integer :: iA, jA ! i and j offsets of neighboring nodes + integer :: m, mm + + ! Loop over all vertices that border locally owned vertices. + ! Locally owned vertices are (staggered_ilo:staggered_ihi, staggered_jlo_staggered_jhi). + ! OK to skip vertices outside the global domain (i < nhalo or j < nhalo). + ! Note: Need nhalo >= 2 so as not to step out of bounds. + + do jA = -1,1 + do iA = -1,1 + m = indxA_2d(iA,jA) + mm = indxA_2d(-iA,-jA) + + do j = nhalo, ny-nhalo+1 + do i = nhalo, nx-nhalo+1 + if (active_vertex(i,j)) then + + if (umask_dirichlet(i,j) == 1) then + + ! set the rhs to the prescribed velocity + bu(i,j) = uvel(i,j) + + ! loop through matrix values in the rows associated with this vertex + ! (Auu contains one row, Avu contains a second row) + + if ( (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then + + if (iA==0 .and. jA==0) then ! main diagonal + + ! Set Auu = 1 on the main diagonal + ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix + ! Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0) + Auu(i,j,indxA_2d(0,0)) = 1.d0 + Auv(i,j,indxA_2d(0,0)) = 1.d0 + Avu(i,j,indxA_2d(0,0)) = 1.d0 + + else ! not on the diagonal + + ! Zero out non-diagonal matrix terms in the row associated with this vertex + Auu(i,j,m) = 0.d0 + Auv(i,j,m) = 0.d0 + + ! 
Shift terms associated with this velocity to the rhs. + ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. + ! Recall mm = indxA_2d(-iA,-jA) + + if (umask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Auu term) * uvel to rhs + bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auu(i+iA, j+jA, mm) * uvel(i,j) + Auu(i+iA, j+jA, mm) = 0.d0 + endif + + if (vmask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Avu term) * uvel to rhs + bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avu(i+iA, j+jA, mm) * uvel(i,j) + Avu(i+iA, j+jA, mm) = 0.d0 + endif + + endif ! on the diagonal + + endif ! i+iA and j+jA in bounds + + endif ! umask_dirichlet + + if (vmask_dirichlet(i,j) == 1) then + + ! set the rhs to the prescribed velocity + bv(i,j) = vvel(i,j) + + ! loop through matrix values in the rows associated with this vertex + ! (Auv contains one row, Avv contains a second row) + + if ( (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then + + if (iA==0 .and. jA==0) then ! main diagonal + + ! Set Avv = 1 on the main diagonal + ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix + ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) + Auv(i,j,indxA_2d(0,0)) = 0.d0 + Avu(i,j,indxA_2d(0,0)) = 0.d0 + Avv(i,j,indxA_2d(0,0)) = 1.d0 + + else ! not on the diagonal - Auu(m,ii,jj) = Auu(m,ii,jj) + Kuu(nr,nc) - Avv(m,ii,jj) = Avv(m,ii,jj) + Kvv(nr,nc) + ! Zero out non-diagonal matrix terms in the rows associated with this vertex + Avu(i,j,m) = 0.d0 + Avv(i,j,m) = 0.d0 - if (verbose_basal .and. this_rank==rtest .and. ii==itest .and. jj==jtest .and. m==5) then - ! m = 5 gives the influence of beta at vertex(i,j) on velocity at vertex(ii,jj). - ! For local assembly, Auu and Avv get nonzero increments only for m = 5. - write(iulog,*) 'Basal increment for Auu and Avv: source (i,j), Kuu, new Auu, ii, jj, m =', & - i, j, Kuu(nr,nc), Auu(m,ii,jj), ii, jj, m - endif + ! 
Shift terms associated with this velocity to the rhs. + ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. + ! Recall mm = indxA_2d(-iA,-jA) - enddo ! nc - enddo ! nr + if (umask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Auv term) * vvel to rhs + bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auv(i+iA, j+jA, mm) * vvel(i,j) + Auv(i+iA, j+jA, mm) = 0.d0 + endif - if (verbose_basal .and. this_rank==rtest .and. i==itest .and. j==jtest) then -! write(iulog,*) ' ' -! write(iulog,*) 'i, j =', i, j -! write(iulog,*) 'Kuu:' -! do nr = 1, nNodesPerElement_2d -! write(iulog,*) nr, Kuu(nr,:) -! enddo -! write(iulog,*) ' ' -! write(iulog,*) 'rowsum(Kuu):' -! do nr = 1, nNodesPerElement_2d -! write(iulog,*) nr, sum(Kuu(nr,:)) -! enddo -! write(iulog,*) ' ' -! write(iulog,*) 'sum(Kuu):', sum(Kuu(:,:)) - endif + if (vmask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Avv term) * vvel to rhs + bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avv(i+iA, j+jA, mm) * vvel(i,j) + Avv(i+iA, j+jA, mm) = 0.d0 + endif - enddo ! nQuadPoints_2d + endif ! on the diagonal - endif ! active_cell + endif ! i+iA and j+jA in bounds - enddo ! i - enddo ! j + endif ! vmask_dirichlet - endif ! whichassemble_beta + endif ! active_vertex + enddo ! i + enddo ! j - if (verbose_basal .and. this_rank==rtest) then - i = itest - j = jtest - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(0,0,0) - else - m = indxA_2d(0,0) - endif - write(iulog,*) ' ' - write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m - write(iulog,*) 'New Auu diagonal:', Auu(m,i,j) - write(iulog,*) 'New Avv diagonal:', Avv(m,i,j) - endif + enddo ! iA + enddo ! jA - end subroutine basal_sliding_bc_3d + end subroutine dirichlet_boundary_conditions_2d !**************************************************************************** @@ -9217,239 +9389,67 @@ subroutine dirichlet_boundary_conditions_3d(nx, ny, & if ( (k+kA >= 1 .and. k+kA <= nz) & .and. & (i+iA >= 1 .and. i+iA <= nx-1) & - .and. 
& - (j+jA >= 1 .and. j+jA <= ny-1) ) then - - if (iA==0 .and. jA==0 .and. kA==0) then ! main diagonal - - ! Set Avv = 1 on the main diagonal - ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix - ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) - m = indxA_3d(0,0,0) - - Auv(m,k,i,j) = 0.d0 - Avu(m,k,i,j) = 0.d0 - Avv(m,k,i,j) = 1.d0 - - !TODO - Set bv above, outside iA/jA loop - ! Set the rhs to the prescribed velocity, forcing v = prescribed vvel for this node -!! bv(k,i,j) = vvel(k,i,j) - - else ! not on the diagonal - - ! Zero out non-diagonal matrix terms in the rows associated with this node - m = indxA_3d(iA,jA,kA) - Avu(m, k, i, j) = 0.d0 - Avv(m, k, i, j) = 0.d0 - - ! Shift terms associated with this velocity to the rhs. - ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. - m = indxA_3d(-iA,-jA,-kA) - - if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then - ! Move (Auv term) * vvel to rhs - bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) - Auv(m, k+kA, i+iA, j+jA) = 0.d0 - endif - - if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then - ! Move (Avv term) * vvel to rhs - bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) - Avv(m, k+kA, i+iA, j+jA) = 0.d0 - endif - - endif ! on the diagonal - - endif ! i+iA, j+jA, and k+kA in bounds - - enddo ! kA - enddo ! iA - enddo ! jA - - endif ! vmask_dirichlet - - enddo ! k - endif ! active_vertex - enddo ! i - enddo ! j - - end subroutine dirichlet_boundary_conditions_3d - -!**************************************************************************** - - subroutine dirichlet_boundary_conditions_2d(nx, ny, & - nhalo, & - active_vertex, & - umask_dirichlet, vmask_dirichlet, & - uvel, vvel, & - Auu, Auv, & - Avu, Avv, & - bu, bv) - - !---------------------------------------------------------------- - ! 
Modify the global matrix and RHS for Dirichlet boundary conditions, - ! where uvel and vvel are prescribed at certain nodes. - ! For each such node, we zero out the row, except for setting the diagonal term to 1. - ! We also zero out the column, moving terms containing uvel/vvel to the rhs. - !---------------------------------------------------------------- - - !---------------------------------------------------------------- - ! Input-output arguments - !---------------------------------------------------------------- - - integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions - nhalo ! number of halo layers - - logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! true for active vertices (vertices of active cells) - - integer, dimension(nx-1,ny-1), intent(in) :: & - umask_dirichlet, &! Dirichlet mask for velocity (if true, u is prescribed) - vmask_dirichlet ! Dirichlet mask for velocity (if true, v is prescribed) - - real(dp), dimension(nx-1,ny-1), intent(in) :: & - uvel, vvel ! velocity components - - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & - Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv - - real(dp), dimension(nx-1,ny-1), intent(inout) :: & - bu, bv ! assembled load vector, divided into 2 parts - - !---------------------------------------------------------------- - ! Local variables - !---------------------------------------------------------------- - - integer :: i, j ! Cartesian indices of nodes - integer :: iA, jA ! i and j offsets of neighboring nodes - integer :: m, mm - - ! Loop over all vertices that border locally owned vertices. - ! Locally owned vertices are (staggered_ilo:staggered_ihi, staggered_jlo_staggered_jhi). - ! OK to skip vertices outside the global domain (i < nhalo or j < nhalo). - ! Note: Need nhalo >= 2 so as not to step out of bounds. 
- - do jA = -1,1 - do iA = -1,1 - m = indxA_2d(iA,jA) - mm = indxA_2d(-iA,-jA) - - do j = nhalo, ny-nhalo+1 - do i = nhalo, nx-nhalo+1 - if (active_vertex(i,j)) then - - if (umask_dirichlet(i,j) == 1) then - - ! set the rhs to the prescribed velocity - bu(i,j) = uvel(i,j) - - ! loop through matrix values in the rows associated with this vertex - ! (Auu contains one row, Avu contains a second row) - - if ( (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then - - if (iA==0 .and. jA==0) then ! main diagonal - - ! Set Auu = 1 on the main diagonal - ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix - ! Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0) - Auu(i,j,indxA_2d(0,0)) = 1.d0 - Auv(i,j,indxA_2d(0,0)) = 1.d0 - Avu(i,j,indxA_2d(0,0)) = 1.d0 - - else ! not on the diagonal - - ! Zero out non-diagonal matrix terms in the row associated with this vertex - Auu(i,j,m) = 0.d0 - Auv(i,j,m) = 0.d0 - - ! Shift terms associated with this velocity to the rhs. - ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. - ! Recall mm = indxA_2d(-iA,-jA) - - if (umask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Auu term) * uvel to rhs - bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auu(i+iA, j+jA, mm) * uvel(i,j) - Auu(i+iA, j+jA, mm) = 0.d0 - endif - - if (vmask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Avu term) * uvel to rhs - bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avu(i+iA, j+jA, mm) * uvel(i,j) - Avu(i+iA, j+jA, mm) = 0.d0 - endif - - endif ! on the diagonal - - endif ! i+iA and j+jA in bounds - - endif ! umask_dirichlet - - if (vmask_dirichlet(i,j) == 1) then - - ! set the rhs to the prescribed velocity - bv(i,j) = vvel(i,j) - - ! loop through matrix values in the rows associated with this vertex - ! (Auv contains one row, Avv contains a second row) + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then - if ( (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. 
j+jA <= ny-1) ) then + if (iA==0 .and. jA==0 .and. kA==0) then ! main diagonal - if (iA==0 .and. jA==0) then ! main diagonal + ! Set Avv = 1 on the main diagonal + ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix + ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) + m = indxA_3d(0,0,0) - ! Set Avv = 1 on the main diagonal - ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix - ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) - Auv(i,j,indxA_2d(0,0)) = 0.d0 - Avu(i,j,indxA_2d(0,0)) = 0.d0 - Avv(i,j,indxA_2d(0,0)) = 1.d0 + Auv(m,k,i,j) = 0.d0 + Avu(m,k,i,j) = 0.d0 + Avv(m,k,i,j) = 1.d0 - else ! not on the diagonal + !TODO - Set bv above, outside iA/jA loop + ! Set the rhs to the prescribed velocity, forcing v = prescribed vvel for this node +!! bv(k,i,j) = vvel(k,i,j) + + else ! not on the diagonal - ! Zero out non-diagonal matrix terms in the rows associated with this vertex - Avu(i,j,m) = 0.d0 - Avv(i,j,m) = 0.d0 + ! Zero out non-diagonal matrix terms in the rows associated with this node + m = indxA_3d(iA,jA,kA) + Avu(m, k, i, j) = 0.d0 + Avv(m, k, i, j) = 0.d0 - ! Shift terms associated with this velocity to the rhs. - ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. - ! Recall mm = indxA_2d(-iA,-jA) + ! Shift terms associated with this velocity to the rhs. + ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. + m = indxA_3d(-iA,-jA,-kA) - if (umask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Auv term) * vvel to rhs - bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auv(i+iA, j+jA, mm) * vvel(i,j) - Auv(i+iA, j+jA, mm) = 0.d0 - endif + if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then + ! 
Move (Auv term) * vvel to rhs + bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) + Auv(m, k+kA, i+iA, j+jA) = 0.d0 + endif - if (vmask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Avv term) * vvel to rhs - bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avv(i+iA, j+jA, mm) * vvel(i,j) - Avv(i+iA, j+jA, mm) = 0.d0 - endif + if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then + ! Move (Avv term) * vvel to rhs + bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) + Avv(m, k+kA, i+iA, j+jA) = 0.d0 + endif - endif ! on the diagonal + endif ! on the diagonal - endif ! i+iA and j+jA in bounds + endif ! i+iA, j+jA, and k+kA in bounds - endif ! vmask_dirichlet + enddo ! kA + enddo ! iA + enddo ! jA - endif ! active_vertex - enddo ! i - enddo ! j + endif ! vmask_dirichlet - enddo ! iA - enddo ! jA + enddo ! k + endif ! active_vertex + enddo ! i + enddo ! j - end subroutine dirichlet_boundary_conditions_2d + end subroutine dirichlet_boundary_conditions_3d !**************************************************************************** - subroutine compute_residual_vector_3d(nx, ny, nz, & + subroutine compute_residual_vector_2d(nx, ny, & parallel, & itest, jtest, rtest, & active_vertex, & @@ -9464,12 +9464,11 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & ! This subroutine assumes that the matrix is stored in structured (x/y/z) format. !---------------------------------------------------------------- - ! Input/output variables + ! Input/output arguments !---------------------------------------------------------------- integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions (for scalars) type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -9480,25 +9479,25 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! 
T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 1st dimension = 3 (node and its nearest neighbors in x, y and z direction) - ! other dimensions = (z,x,y) indices + ! 3rd dimension = 9 (node and its nearest neighbors in x and y directions) + ! 1st and 2nd dimensions = (x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & uvel, vvel ! u and v components of velocity (m/yr) - real(dp), dimension(nz,nx-1,ny-1), intent(out) :: & + real(dp), dimension(nx-1,ny-1), intent(out) :: & resid_u, & ! residual vector, divided into 2 parts - resid_v ! + resid_v real(dp), intent(out) :: & L2_norm ! L2 norm of residual vector, |Ax - b| @@ -9510,13 +9509,13 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & ! Local variables !---------------------------------------------------------------- - real(dp), dimension(nz,nx-1,ny-1) :: & - worku, workv, & ! work arrays for global sums - resid_sq ! resid_u^2 + resid_v^2 + real(dp), dimension(nx-1,ny-1) :: & + worku, workv, & ! work arrays for global sums + resid_sq ! resid_u^2 + resid_v^2 real(dp) :: my_max_resid, global_max_resid - integer :: i, j, k, iA, jA, kA, m, iglobal, jglobal + integer :: i, j, iA, jA, m, iglobal, jglobal real(dp) :: L2_norm_rhs ! L2 norm of rhs vector, |b| @@ -9531,59 +9530,47 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & ! Compute u and v components of A*x - resid_u(:,:,:) = 0.d0 - resid_v(:,:,:) = 0.d0 + resid_u(:,:) = 0.d0 + resid_v(:,:) = 0.d0 ! 
Loop over locally owned vertices - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - do k = 1, nz - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then - - m = indxA_3d(iA,jA,kA) - - resid_u(k,i,j) = resid_u(k,i,j) & - + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & - + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - - resid_v(k,i,j) = resid_v(k,i,j) & - + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & - + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - endif ! in bounds - enddo ! kA - enddo ! iA - enddo ! jA - enddo ! k - endif ! active_vertex - enddo ! i - enddo ! j + do jA = -1,1 + do iA = -1,1 + m = indxA_2d(iA,jA) + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + if ( (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then + resid_u(i,j) = resid_u(i,j) & + + Auu(i,j,m)*uvel(i+iA,j+jA) & + + Auv(i,j,m)*vvel(i+iA,j+jA) + resid_v(i,j) = resid_v(i,j) & + + Avu(i,j,m)*uvel(i+iA,j+jA) & + + Avv(i,j,m)*vvel(i+iA,j+jA) + endif ! in bounds + endif ! active_vertex + enddo ! i + enddo ! j + enddo ! iA + enddo ! jA ! Subtract b to get A*x - b - - worku(:,:,:) = 0.0d0 - workv(:,:,:) = 0.0d0 + worku(:,:) = 0.0d0 + workv(:,:) = 0.0d0 ! Loop over locally owned vertices do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi if (active_vertex(i,j)) then - do k = 1, nz - resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j) - resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j) - worku(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) - workv(k,i,j) = resid_v(k,i,j)*resid_v(k,i,j) - enddo ! k + resid_u(i,j) = resid_u(i,j) - bu(i,j) + resid_v(i,j) = resid_v(i,j) - bv(i,j) + worku(i,j) = resid_u(i,j)*resid_u(i,j) + workv(i,j) = resid_v(i,j)*resid_v(i,j) endif ! active vertex - enddo ! i - enddo ! j + enddo ! i + enddo ! j ! 
Take global sum, then take square root L2_norm = parallel_global_sum_stagger(worku, parallel, workv) @@ -9594,28 +9581,26 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & if (this_rank==rtest) then i = itest j = jtest - k = ktest - write(iulog,*) 'In compute_residual_vector_3d: task, i, j, k =', this_rank, i, j, k + write(iulog,*) 'In compute_residual_vector_2d: task, i, j =', this_rank, i, j write(iulog, '(a16, 2f13.7, 2e13.5)') & - ' u, v, ru, rv: ', uvel(k,i,j), vvel(k,i,j), resid_u(k,i,j), resid_v(k,i,j) + ' u, v, ru, rv: ', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j) endif ! Compute max value of (squared) residual on this task. ! If this task owns the vertex with the global max residual, then print a diagnostic message. + resid_sq(:,:) = worku(:,:) + workv(:,:) my_max_resid = maxval(resid_sq) global_max_resid = parallel_reduce_max(my_max_resid) if (abs((my_max_resid - global_max_resid)/global_max_resid) < 1.0d-6) then do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - do k = 1, nz - if (abs((resid_sq(k,i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog, '(a24, 2i6, i4, 2e13.5, e16.8)') 'ig, jg, k, ru, rv, rmax:', & - iglobal, jglobal, k, resid_u(k,i,j), resid_v(k,i,j), sqrt(global_max_resid) - write(iulog,*) ' ' - endif - enddo + if (abs((resid_sq(i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, rmax:', & + iglobal, jglobal, resid_u(i,j), resid_v(i,j), sqrt(global_max_resid) + write(iulog,*) ' ' + endif enddo enddo endif @@ -9624,19 +9609,18 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & if (present(L2_norm_relative)) then ! compute L2_norm relative to rhs - worku(:,:,:) = 0.0d0 - workv(:,:,:) = 0.0d0 + worku(:,:) = 0.0d0 + workv(:,:) = 0.0d0 + ! 
Loop over locally owned vertices do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi if (active_vertex(i,j)) then - do k = 1, nz - worku(k,i,j) = bu(k,i,j)*bu(k,i,j) - workv(k,i,j) = bv(k,i,j)*bv(k,i,j) - enddo ! k + worku(i,j) = bu(i,j)*bu(i,j) + workv(i,j) = bv(i,j)*bv(i,j) endif ! active vertex - enddo ! i - enddo ! j + enddo ! i + enddo ! j ! Take global sum, then take square root L2_norm_rhs = parallel_global_sum_stagger(worku, parallel, workv) @@ -9650,11 +9634,11 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & endif - end subroutine compute_residual_vector_3d + end subroutine compute_residual_vector_2d !**************************************************************************** - subroutine compute_residual_vector_2d(nx, ny, & + subroutine compute_residual_vector_3d(nx, ny, nz, & parallel, & itest, jtest, rtest, & active_vertex, & @@ -9669,11 +9653,12 @@ subroutine compute_residual_vector_2d(nx, ny, & ! This subroutine assumes that the matrix is stored in structured (x/y/z) format. !---------------------------------------------------------------- - ! Input/output arguments + ! Input/output variables !---------------------------------------------------------------- integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) + nx, ny, & ! horizontal grid dimensions (for scalars) + nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -9684,25 +9669,25 @@ subroutine compute_residual_vector_2d(nx, ny, & logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 3rd dimension = 9 (node and its nearest neighbors in x and y directions) - ! 1st and 2nd dimensions = (x,y) indices + ! 
1st dimension = 3 (node and its nearest neighbors in x, y and z direction) + ! other dimensions = (z,x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & uvel, vvel ! u and v components of velocity (m/yr) - real(dp), dimension(nx-1,ny-1), intent(out) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(out) :: & resid_u, & ! residual vector, divided into 2 parts - resid_v + resid_v ! real(dp), intent(out) :: & L2_norm ! L2 norm of residual vector, |Ax - b| @@ -9714,13 +9699,13 @@ subroutine compute_residual_vector_2d(nx, ny, & ! Local variables !---------------------------------------------------------------- - real(dp), dimension(nx-1,ny-1) :: & - worku, workv, & ! work arrays for global sums - resid_sq ! resid_u^2 + resid_v^2 + real(dp), dimension(nz,nx-1,ny-1) :: & + worku, workv, & ! work arrays for global sums + resid_sq ! resid_u^2 + resid_v^2 real(dp) :: my_max_resid, global_max_resid - integer :: i, j, iA, jA, m, iglobal, jglobal + integer :: i, j, k, iA, jA, kA, m, iglobal, jglobal real(dp) :: L2_norm_rhs ! L2 norm of rhs vector, |b| @@ -9733,49 +9718,61 @@ subroutine compute_residual_vector_2d(nx, ny, & staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - ! Compute u and v components of A*x + ! Compute u and v components of A*x + + resid_u(:,:,:) = 0.d0 + resid_v(:,:,:) = 0.d0 + + ! Loop over locally owned vertices + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + do kA = -1,1 + do jA = -1,1 + do iA = -1,1 + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. 
j+jA <= ny-1) ) then + + m = indxA_3d(iA,jA,kA) - resid_u(:,:) = 0.d0 - resid_v(:,:) = 0.d0 + resid_u(k,i,j) = resid_u(k,i,j) & + + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & + + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - ! Loop over locally owned vertices - do jA = -1,1 - do iA = -1,1 - m = indxA_2d(iA,jA) - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - if ( (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then - resid_u(i,j) = resid_u(i,j) & - + Auu(i,j,m)*uvel(i+iA,j+jA) & - + Auv(i,j,m)*vvel(i+iA,j+jA) - resid_v(i,j) = resid_v(i,j) & - + Avu(i,j,m)*uvel(i+iA,j+jA) & - + Avv(i,j,m)*vvel(i+iA,j+jA) - endif ! in bounds - endif ! active_vertex - enddo ! i - enddo ! j - enddo ! iA - enddo ! jA + resid_v(k,i,j) = resid_v(k,i,j) & + + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & + + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) + endif ! in bounds + enddo ! kA + enddo ! iA + enddo ! jA + enddo ! k + endif ! active_vertex + enddo ! i + enddo ! j ! Subtract b to get A*x - b - worku(:,:) = 0.0d0 - workv(:,:) = 0.0d0 + + worku(:,:,:) = 0.0d0 + workv(:,:,:) = 0.0d0 ! Loop over locally owned vertices do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi if (active_vertex(i,j)) then - resid_u(i,j) = resid_u(i,j) - bu(i,j) - resid_v(i,j) = resid_v(i,j) - bv(i,j) - worku(i,j) = resid_u(i,j)*resid_u(i,j) - workv(i,j) = resid_v(i,j)*resid_v(i,j) + do k = 1, nz + resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j) + resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j) + worku(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) + workv(k,i,j) = resid_v(k,i,j)*resid_v(k,i,j) + enddo ! k endif ! active vertex - enddo ! i - enddo ! j + enddo ! i + enddo ! j ! 
Take global sum, then take square root L2_norm = parallel_global_sum_stagger(worku, parallel, workv) @@ -9786,26 +9783,28 @@ subroutine compute_residual_vector_2d(nx, ny, & if (this_rank==rtest) then i = itest j = jtest - write(iulog,*) 'In compute_residual_vector_2d: task, i, j =', this_rank, i, j + k = ktest + write(iulog,*) 'In compute_residual_vector_3d: task, i, j, k =', this_rank, i, j, k write(iulog, '(a16, 2f13.7, 2e13.5)') & - ' u, v, ru, rv: ', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j) + ' u, v, ru, rv: ', uvel(k,i,j), vvel(k,i,j), resid_u(k,i,j), resid_v(k,i,j) endif ! Compute max value of (squared) residual on this task. ! If this task owns the vertex with the global max residual, then print a diagnostic message. - resid_sq(:,:) = worku(:,:) + workv(:,:) my_max_resid = maxval(resid_sq) global_max_resid = parallel_reduce_max(my_max_resid) if (abs((my_max_resid - global_max_resid)/global_max_resid) < 1.0d-6) then do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs((resid_sq(i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, rmax:', & - iglobal, jglobal, resid_u(i,j), resid_v(i,j), sqrt(global_max_resid) - write(iulog,*) ' ' - endif + do k = 1, nz + if (abs((resid_sq(k,i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog, '(a24, 2i6, i4, 2e13.5, e16.8)') 'ig, jg, k, ru, rv, rmax:', & + iglobal, jglobal, k, resid_u(k,i,j), resid_v(k,i,j), sqrt(global_max_resid) + write(iulog,*) ' ' + endif + enddo enddo enddo endif @@ -9814,18 +9813,19 @@ subroutine compute_residual_vector_2d(nx, ny, & if (present(L2_norm_relative)) then ! compute L2_norm relative to rhs - worku(:,:) = 0.0d0 - workv(:,:) = 0.0d0 + worku(:,:,:) = 0.0d0 + workv(:,:,:) = 0.0d0 - ! 
Loop over locally owned vertices do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi if (active_vertex(i,j)) then - worku(i,j) = bu(i,j)*bu(i,j) - workv(i,j) = bv(i,j)*bv(i,j) + do k = 1, nz + worku(k,i,j) = bu(k,i,j)*bu(k,i,j) + workv(k,i,j) = bv(k,i,j)*bv(k,i,j) + enddo ! k endif ! active vertex - enddo ! i - enddo ! j + enddo ! i + enddo ! j ! Take global sum, then take square root L2_norm_rhs = parallel_global_sum_stagger(worku, parallel, workv) @@ -9839,26 +9839,30 @@ subroutine compute_residual_vector_2d(nx, ny, & endif - end subroutine compute_residual_vector_2d + end subroutine compute_residual_vector_3d !**************************************************************************** - subroutine evaluate_accelerated_picard_3d(& + subroutine evaluate_accelerated_picard_2d(& + nx, ny, & L2_norm, L2_norm_large, & L2_norm_alpha_sav, & alpha_accel, alpha_accel_max, & gamma_accel, resid_reduction_threshold, & - uvel, vvel, & - Auu, Auv, & - Avu, Avv, & - uvel_old, vvel_old, & - duvel, dvvel, & - uvel_sav, vvel_sav, & - Auu_sav, Auv_sav, & - Avu_sav, Avv_sav, & + uvel_2d, vvel_2d, & + Auu_2d, Auv_2d, & + Avu_2d, Avv_2d, & + uvel_2d_old, vvel_2d_old, & + duvel_2d, dvvel_2d, & + uvel_2d_sav, vvel_2d_sav, & + Auu_2d_sav, Auv_2d_sav, & + Avu_2d_sav, Avv_2d_sav, & beta_internal, beta_internal_sav, & assembly_is_done) + integer, intent(in) :: & + nx, ny ! number of grid cells in each direction + real(dp), intent(in) :: & L2_norm, & ! latest value of L2 norm of residual L2_norm_large, & ! large value for re-initializing the L2 norm @@ -9870,21 +9874,19 @@ subroutine evaluate_accelerated_picard_3d(& alpha_accel, & ! factor for extending the vector (duvel, dvvel) to reduce the residual L2_norm_alpha_sav ! value of L2 norm of residual, given the previous alpha_accel - real(dp), dimension(:,:), intent(inout) :: & + real(dp), dimension(nx-1,ny-1), intent(inout) :: & + uvel_2d, vvel_2d, & ! 
latest guess for the velocity solution + uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration + duvel_2d, dvvel_2d, & ! difference between old velocity solution and latest solution + uvel_2d_sav, vvel_2d_sav, & ! best velocity solution so far, based on the residual norm beta_internal, & ! beta_internal as a function of uvel_2d and vvel_2d beta_internal_sav ! beta_internal as a function of uvel_2d_sav and vvel_2d_sav - real(dp), dimension(:,:,:), intent(inout) :: & - uvel, vvel, & ! latest guess for the velocity solution - uvel_old, vvel_old, & ! velocity solution from previous nonlinear iteration - duvel, dvvel, & ! difference between old velocity solution and latest solution - uvel_sav, vvel_sav ! best velocity solution so far, based on the residual norm - - real(dp), dimension(:,:,:,:), intent(inout) :: & - Auu, Auv, & ! latest assembled matrices as a function of uvel_2d and vvel_2d - Avu, Avv, & - Auu_sav, Auv_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav - Avu_sav, Avv_sav + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + Auu_2d, Auv_2d, & ! latest assembled matrices as a function of uvel_2d and vvel_2d + Avu_2d, Avv_2d, & + Auu_2d_sav, Auv_2d_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav + Avu_2d_sav, Avv_2d_sav logical, intent(inout) :: & assembly_is_done ! if true, then accept the current assembled matrices and proceed to solution @@ -9897,12 +9899,12 @@ subroutine evaluate_accelerated_picard_3d(& ! Save the latest values of the solver inputs - uvel_sav = uvel - vvel_sav = vvel - Auu_sav = Auu - Auv_sav = Auv - Avu_sav = Avu - Avv_sav = Avv + uvel_2d_sav = uvel_2d + vvel_2d_sav = vvel_2d + Auu_2d_sav = Auu_2d + Auv_2d_sav = Auv_2d + Avu_2d_sav = Avu_2d + Avv_2d_sav = Avv_2d beta_internal_sav = beta_internal ! Increase alpha_accel and see if the residual keeps getting smaller. @@ -9927,8 +9929,8 @@ subroutine evaluate_accelerated_picard_3d(& endif ! 
Save this velocity as the starting point for the next nonlinear iteration - uvel_old = uvel - vvel_old = vvel + uvel_2d_old = uvel_2d + vvel_2d_old = vvel_2d ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 @@ -9941,17 +9943,17 @@ subroutine evaluate_accelerated_picard_3d(& ! The residual is larger than the previous value. ! Switch back to the previously saved velocity and matrix with the lower residual. - uvel = uvel_sav - vvel = vvel_sav - Auu = Auu_sav - Auv = Auv_sav - Avu = Avu_sav - Avv = Avv_sav + uvel_2d = uvel_2d_sav + vvel_2d = vvel_2d_sav + Auu_2d = Auu_2d_sav + Auv_2d = Auv_2d_sav + Avu_2d = Avu_2d_sav + Avv_2d = Avv_2d_sav beta_internal = beta_internal_sav ! Save this velocity as the starting point for the next nonlinear iteration - uvel_old = uvel - vvel_old = vvel + uvel_2d_old = uvel_2d + vvel_2d_old = vvel_2d if (verbose_picard .and. main_task) then write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel @@ -9967,30 +9969,26 @@ subroutine evaluate_accelerated_picard_3d(& endif ! L2_norm of residual has reduced - end subroutine evaluate_accelerated_picard_3d - + end subroutine evaluate_accelerated_picard_2d + !**************************************************************************** - subroutine evaluate_accelerated_picard_2d(& - nx, ny, & + subroutine evaluate_accelerated_picard_3d(& L2_norm, L2_norm_large, & L2_norm_alpha_sav, & alpha_accel, alpha_accel_max, & gamma_accel, resid_reduction_threshold, & - uvel_2d, vvel_2d, & - Auu_2d, Auv_2d, & - Avu_2d, Avv_2d, & - uvel_2d_old, vvel_2d_old, & - duvel_2d, dvvel_2d, & - uvel_2d_sav, vvel_2d_sav, & - Auu_2d_sav, Auv_2d_sav, & - Avu_2d_sav, Avv_2d_sav, & + uvel, vvel, & + Auu, Auv, & + Avu, Avv, & + uvel_old, vvel_old, & + duvel, dvvel, & + uvel_sav, vvel_sav, & + Auu_sav, Auv_sav, & + Avu_sav, Avv_sav, & beta_internal, beta_internal_sav, & assembly_is_done) - integer, intent(in) :: & - nx, ny ! 
number of grid cells in each direction - real(dp), intent(in) :: & L2_norm, & ! latest value of L2 norm of residual L2_norm_large, & ! large value for re-initializing the L2 norm @@ -10002,19 +10000,21 @@ subroutine evaluate_accelerated_picard_2d(& alpha_accel, & ! factor for extending the vector (duvel, dvvel) to reduce the residual L2_norm_alpha_sav ! value of L2 norm of residual, given the previous alpha_accel - real(dp), dimension(nx-1,ny-1), intent(inout) :: & - uvel_2d, vvel_2d, & ! latest guess for the velocity solution - uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration - duvel_2d, dvvel_2d, & ! difference between old velocity solution and latest solution - uvel_2d_sav, vvel_2d_sav, & ! best velocity solution so far, based on the residual norm + real(dp), dimension(:,:), intent(inout) :: & beta_internal, & ! beta_internal as a function of uvel_2d and vvel_2d beta_internal_sav ! beta_internal as a function of uvel_2d_sav and vvel_2d_sav - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & - Auu_2d, Auv_2d, & ! latest assembled matrices as a function of uvel_2d and vvel_2d - Avu_2d, Avv_2d, & - Auu_2d_sav, Auv_2d_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav - Avu_2d_sav, Avv_2d_sav + real(dp), dimension(:,:,:), intent(inout) :: & + uvel, vvel, & ! latest guess for the velocity solution + uvel_old, vvel_old, & ! velocity solution from previous nonlinear iteration + duvel, dvvel, & ! difference between old velocity solution and latest solution + uvel_sav, vvel_sav ! best velocity solution so far, based on the residual norm + + real(dp), dimension(:,:,:,:), intent(inout) :: & + Auu, Auv, & ! latest assembled matrices as a function of uvel_2d and vvel_2d + Avu, Avv, & + Auu_sav, Auv_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav + Avu_sav, Avv_sav logical, intent(inout) :: & assembly_is_done ! 
if true, then accept the current assembled matrices and proceed to solution @@ -10027,12 +10027,12 @@ subroutine evaluate_accelerated_picard_2d(& ! Save the latest values of the solver inputs - uvel_2d_sav = uvel_2d - vvel_2d_sav = vvel_2d - Auu_2d_sav = Auu_2d - Auv_2d_sav = Auv_2d - Avu_2d_sav = Avu_2d - Avv_2d_sav = Avv_2d + uvel_sav = uvel + vvel_sav = vvel + Auu_sav = Auu + Auv_sav = Auv + Avu_sav = Avu + Avv_sav = Avv beta_internal_sav = beta_internal ! Increase alpha_accel and see if the residual keeps getting smaller. @@ -10057,8 +10057,8 @@ subroutine evaluate_accelerated_picard_2d(& endif ! Save this velocity as the starting point for the next nonlinear iteration - uvel_2d_old = uvel_2d - vvel_2d_old = vvel_2d + uvel_old = uvel + vvel_old = vvel ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 @@ -10068,20 +10068,20 @@ subroutine evaluate_accelerated_picard_2d(& assembly_is_done = .true. else - - ! The residual is larger than the previous value. - ! Switch back to the previously saved velocity and matrix with the lower residual. - uvel_2d = uvel_2d_sav - vvel_2d = vvel_2d_sav - Auu_2d = Auu_2d_sav - Auv_2d = Auv_2d_sav - Avu_2d = Avu_2d_sav - Avv_2d = Avv_2d_sav + + ! The residual is larger than the previous value. + ! Switch back to the previously saved velocity and matrix with the lower residual. + uvel = uvel_sav + vvel = vvel_sav + Auu = Auu_sav + Auv = Auv_sav + Avu = Avu_sav + Avv = Avv_sav beta_internal = beta_internal_sav ! Save this velocity as the starting point for the next nonlinear iteration - uvel_2d_old = uvel_2d - vvel_2d_old = vvel_2d + uvel_old = uvel + vvel_old = vvel if (verbose_picard .and. main_task) then write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel @@ -10097,38 +10097,38 @@ subroutine evaluate_accelerated_picard_2d(& endif ! 
L2_norm of residual has reduced - end subroutine evaluate_accelerated_picard_2d + end subroutine evaluate_accelerated_picard_3d !**************************************************************************** - subroutine compute_residual_velocity_3d(whichresid, parallel, & - uvel, vvel, & - usav, vsav, & + subroutine compute_residual_velocity_2d(whichresid, parallel, & + uvel, vvel, & + usav, vsav, & resid_velo) integer, intent(in) :: & - whichresid ! option for method to use when calculating residual + whichresid ! option for method to use when calculating residual type(parallel_type), intent(in) :: & parallel ! info for parallel communication - real(dp), dimension(:,:,:), intent(in) :: & - uvel, vvel, & ! current guess for velocity - usav, vsav ! previous guess for velocity + real(dp), dimension(:,:), intent(in) :: & + uvel, vvel, & ! current guess for velocity + usav, vsav ! previous guess for velocity real(dp), intent(out) :: & - resid_velo ! quantity related to velocity convergence + resid_velo ! quantity related to velocity convergence integer :: & - imaxdiff, jmaxdiff, kmaxdiff ! location of maximum speed difference - ! currently computed but not used + imaxdiff, jmaxdiff ! location of maximum speed difference + ! currently computed but not used - integer :: i, j, k, count + integer :: i, j, count real(dp) :: & - speed, & ! current guess for ice speed - oldspeed, & ! previous guess for ice speed - diffspeed ! abs(speed-oldspeed) + speed, & ! current guess for ice speed + oldspeed, & ! previous guess for ice speed + diffspeed ! abs(speed-oldspeed) integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -10140,7 +10140,6 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & staggered_jhi = parallel%staggered_jhi ! Compute a residual quantity based on convergence of the velocity field. - !TODO - Remove some of these velocity residual methods? They are rarely if ever used. ! 
options for residual calculation method, as specified in configuration file ! case(0): use max of abs( vel_old - vel ) / vel ) @@ -10151,7 +10150,6 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & resid_velo = 0.d0 imaxdiff = 0 jmaxdiff = 0 - kmaxdiff = 0 select case (whichresid) @@ -10161,19 +10159,16 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - do k = 1, size(uvel,1) - 1 ! ignore bed velocity - speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2) - if (speed /= 0.d0) then - oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - if (diffspeed > resid_velo) then - resid_velo = diffspeed - imaxdiff = i - jmaxdiff = j - kmaxdiff = k - endif + speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) + if (speed /= 0.d0) then + oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + if (diffspeed > resid_velo) then + resid_velo = diffspeed + imaxdiff = i + jmaxdiff = j endif - enddo + endif enddo enddo @@ -10188,15 +10183,13 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - do k = 1, size(uvel,1) - 1 ! ignore bed velocity - speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2) - if (speed /= 0.d0) then - count = count+1 - oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - resid_velo = resid_velo + diffspeed - endif - enddo + speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) + if (speed /= 0.d0) then + count = count+1 + oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + resid_velo = resid_velo + diffspeed + endif enddo enddo @@ -10207,25 +10200,22 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & call not_parallel(__FILE__, __LINE__) case default ! max speed difference, including basal speeds - ! 
(case HO_RESID_MAXU or HO_RESID_L2NORM or HO_RESID_L2NORM_RELATIVE) + ! (case HO_RESID_MAXU or HO_RESID_L2NORM) ! Loop over locally owned vertices do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - do k = 1, size(uvel,1) - speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2) - if (speed /= 0.d0) then - oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - if (diffspeed > resid_velo) then - resid_velo = diffspeed - imaxdiff = i - jmaxdiff = j - kmaxdiff = k - endif + speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) + if (speed /= 0.d0) then + oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + if (diffspeed > resid_velo) then + resid_velo = diffspeed + imaxdiff = i + jmaxdiff = j endif - enddo + endif enddo enddo @@ -10233,38 +10223,38 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & end select - end subroutine compute_residual_velocity_3d + end subroutine compute_residual_velocity_2d !**************************************************************************** - subroutine compute_residual_velocity_2d(whichresid, parallel, & - uvel, vvel, & - usav, vsav, & + subroutine compute_residual_velocity_3d(whichresid, parallel, & + uvel, vvel, & + usav, vsav, & resid_velo) integer, intent(in) :: & - whichresid ! option for method to use when calculating residual + whichresid ! option for method to use when calculating residual type(parallel_type), intent(in) :: & parallel ! info for parallel communication - real(dp), dimension(:,:), intent(in) :: & - uvel, vvel, & ! current guess for velocity - usav, vsav ! previous guess for velocity + real(dp), dimension(:,:,:), intent(in) :: & + uvel, vvel, & ! current guess for velocity + usav, vsav ! previous guess for velocity real(dp), intent(out) :: & - resid_velo ! quantity related to velocity convergence + resid_velo ! quantity related to velocity convergence integer :: & - imaxdiff, jmaxdiff ! 
location of maximum speed difference - ! currently computed but not used + imaxdiff, jmaxdiff, kmaxdiff ! location of maximum speed difference + ! currently computed but not used - integer :: i, j, count + integer :: i, j, k, count real(dp) :: & - speed, & ! current guess for ice speed - oldspeed, & ! previous guess for ice speed - diffspeed ! abs(speed-oldspeed) + speed, & ! current guess for ice speed + oldspeed, & ! previous guess for ice speed + diffspeed ! abs(speed-oldspeed) integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -10276,6 +10266,7 @@ subroutine compute_residual_velocity_2d(whichresid, parallel, & staggered_jhi = parallel%staggered_jhi ! Compute a residual quantity based on convergence of the velocity field. + !TODO - Remove some of these velocity residual methods? They are rarely if ever used. ! options for residual calculation method, as specified in configuration file ! case(0): use max of abs( vel_old - vel ) / vel ) @@ -10286,6 +10277,7 @@ subroutine compute_residual_velocity_2d(whichresid, parallel, & resid_velo = 0.d0 imaxdiff = 0 jmaxdiff = 0 + kmaxdiff = 0 select case (whichresid) @@ -10295,16 +10287,19 @@ subroutine compute_residual_velocity_2d(whichresid, parallel, & do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) - if (speed /= 0.d0) then - oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - if (diffspeed > resid_velo) then - resid_velo = diffspeed - imaxdiff = i - jmaxdiff = j + do k = 1, size(uvel,1) - 1 ! 
ignore bed velocity + speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2) + if (speed /= 0.d0) then + oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + if (diffspeed > resid_velo) then + resid_velo = diffspeed + imaxdiff = i + jmaxdiff = j + kmaxdiff = k + endif endif - endif + enddo enddo enddo @@ -10319,13 +10314,15 @@ subroutine compute_residual_velocity_2d(whichresid, parallel, & do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) - if (speed /= 0.d0) then - count = count+1 - oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - resid_velo = resid_velo + diffspeed - endif + do k = 1, size(uvel,1) - 1 ! ignore bed velocity + speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2) + if (speed /= 0.d0) then + count = count+1 + oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + resid_velo = resid_velo + diffspeed + endif + enddo enddo enddo @@ -10336,30 +10333,96 @@ subroutine compute_residual_velocity_2d(whichresid, parallel, & call not_parallel(__FILE__, __LINE__) case default ! max speed difference, including basal speeds - ! (case HO_RESID_MAXU or HO_RESID_L2NORM) + ! (case HO_RESID_MAXU or HO_RESID_L2NORM or HO_RESID_L2NORM_RELATIVE) ! 
Loop over locally owned vertices do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) - if (speed /= 0.d0) then - oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - if (diffspeed > resid_velo) then - resid_velo = diffspeed - imaxdiff = i - jmaxdiff = j + do k = 1, size(uvel,1) + speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2) + if (speed /= 0.d0) then + oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + if (diffspeed > resid_velo) then + resid_velo = diffspeed + imaxdiff = i + jmaxdiff = j + kmaxdiff = k + endif endif - endif + enddo enddo enddo - resid_velo = parallel_reduce_max(resid_velo) - - end select + resid_velo = parallel_reduce_max(resid_velo) + + end select + + end subroutine compute_residual_velocity_3d + +!**************************************************************************** + + subroutine count_nonzeros_2d(nx, ny, & + parallel, & + Auu, Auv, & + Avu, Avv, & + active_vertex, & + nNonzeros) + + !---------------------------------------------------------------- + ! Input-output arguments + !---------------------------------------------------------------- + + integer, intent(in) :: & + nx, ny ! number of grid cells in each direction + + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & + Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts + Avu, Avv + + logical, dimension(nx-1,ny-1), intent(in) :: & + active_vertex ! true for vertices of active cells + + integer, intent(out) :: & + nNonzeros ! number of nonzero matrix elements + + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- + + integer :: i, j, m + + integer :: & + staggered_ilo, staggered_ihi, & ! 
bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + + nNonzeros = 0 + + do m = 1, nNodeNeighbors_2d + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + if (Auu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + if (Auv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + if (Avu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + if (Avv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + endif ! active_vertex + enddo ! i + enddo ! j + enddo ! m + + nNonzeros = parallel_reduce_sum(nNonzeros) - end subroutine compute_residual_velocity_2d + end subroutine count_nonzeros_2d !**************************************************************************** @@ -10431,69 +10494,6 @@ subroutine count_nonzeros_3d(nx, ny, nz, & end subroutine count_nonzeros_3d -!**************************************************************************** - - subroutine count_nonzeros_2d(nx, ny, & - parallel, & - Auu, Auv, & - Avu, Avv, & - active_vertex, & - nNonzeros) - - !---------------------------------------------------------------- - ! Input-output arguments - !---------------------------------------------------------------- - - integer, intent(in) :: & - nx, ny ! number of grid cells in each direction - - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication - - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & - Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv - - logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! true for vertices of active cells - - integer, intent(out) :: & - nNonzeros ! number of nonzero matrix elements - - !---------------------------------------------------------------- - ! 
Local variables - !---------------------------------------------------------------- - - integer :: i, j, m - - integer :: & - staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid - staggered_jlo, staggered_jhi - - staggered_ilo = parallel%staggered_ilo - staggered_ihi = parallel%staggered_ihi - staggered_jlo = parallel%staggered_jlo - staggered_jhi = parallel%staggered_jhi - - nNonzeros = 0 - - do m = 1, nNodeNeighbors_2d - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - if (Auu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - if (Auv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - if (Avu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - if (Avv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - endif ! active_vertex - enddo ! i - enddo ! j - enddo ! m - - nNonzeros = parallel_reduce_sum(nNonzeros) - - end subroutine count_nonzeros_2d - !**************************************************************************** subroutine check_symmetry_element_matrix(nNodesPerElement, & @@ -10533,31 +10533,237 @@ subroutine check_symmetry_element_matrix(nNodesPerElement, & enddo enddo - ! check that Kvv = (Kvv)^T + ! check that Kvv = (Kvv)^T + + do j = 1, nNodesPerElement + do i = j, nNodesPerElement + if (abs(Kvv(i,j) - Kvv(j,i)) > eps11) then + write(iulog,*) 'Kvv is not symmetric' + write(iulog,*) 'i, j, Kvv(i,j), Kvv(j,i):', i, j, Kvv(i,j), Kvv(j,i) + stop + endif + enddo + enddo + + ! 
Check that Kuv = (Kvu)^T + + do j = 1, nNodesPerElement + do i = 1, nNodesPerElement + if (abs(Kuv(i,j) - Kvu(j,i)) > eps11) then + write(iulog,*) 'Kuv /= (Kvu)^T' + write(iulog,*) 'i, j, Kuv(i,j), Kvu(j,i):', i, j, Kuv(i,j), Kvu(j,i) + stop + endif + enddo + enddo + + end subroutine check_symmetry_element_matrix + +!**************************************************************************** + + subroutine check_symmetry_assembled_matrix_2d(nx, ny, & + parallel, & + active_vertex, & + Auu, Auv, Avu, Avv) + + !------------------------------------------------------------------ + ! Check that the assembled stiffness matrix is symmetric. + ! This is true provided that (1) Auu = (Auu)^T + ! (2) Avv = (Avv)^T + ! (3) Auv = (Avu)^T + ! The A matrices are assembled in a dense fashion to save storage + ! and preserve the i/j/k structure of the grid. + ! + ! There can be small differences from perfect symmetry due to roundoff error. + ! These differences are fixed provided they are small enough. + !------------------------------------------------------------------ + + integer, intent(in) :: & + nx, ny ! horizontal grid dimensions + + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + + logical, dimension(nx-1,ny-1), intent(in) :: & + active_vertex ! T for columns (i,j) where velocity is computed, else F + + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + Auu, Auv, Avu, Avv ! components of assembled stiffness matrix + ! + ! Auu | Auv + ! _____|____ + ! | + ! Avu | Avv + + integer :: i, j, iA, jA, m, mm, iglobal, jglobal + + real(dp) :: val1, val2 ! values of matrix coefficients + + real(dp) :: maxdiff, global_maxdiff, diag_entry, avg_val + + integer :: rmax, imax, jmax, mmax + + integer :: & + staggered_ilo, staggered_ihi, & ! 
bounds of locally owned vertices on staggered grid
+ staggered_jlo, staggered_jhi
+
+ staggered_ilo = parallel%staggered_ilo
+ staggered_ihi = parallel%staggered_ihi
+ staggered_jlo = parallel%staggered_jlo
+ staggered_jhi = parallel%staggered_jhi
+
+ ! Check matrix for symmetry
+
+ ! Here we correct for small differences from symmetry due to roundoff error.
+ ! The maximum departure from symmetry is set to be a small fraction
+ ! of the diagonal entry for the row.
+ ! If the departure from symmetry is larger than this, then the model prints a warning
+ ! and/or aborts.
+
+ maxdiff = 0.d0
+ rmax = 0; imax = 0; jmax = 0; mmax = 0
+
+ ! Loop over locally owned vertices.
+ ! Each active vertex is associated with 2 matrix rows (one for u, one for v) belonging to this processor.
+
+ do jA = -1, 1
+ do iA = -1, 1
+ m = indxA_2d( iA, jA)
+ mm = indxA_2d(-iA,-jA)
+
+ do j = staggered_jlo, staggered_jhi
+ do i = staggered_ilo, staggered_ihi
+ if (active_vertex(i,j)) then
+
+ ! Check Auu and Auv for symmetry
+ diag_entry = Auu(i,j,indxA_2d(0,0))
+
+ !WHL - debug
+ if (diag_entry /= diag_entry) then
+ write(iulog,*) 'WARNING: Diagonal NaN: i, j =', i, j
+ call parallel_globalindex(i, j, iglobal, jglobal, parallel)
+ write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal
+!! stop
+ endif
+
+ ! Check that Auu = Auu^T
+ val1 = Auu(i, j, m ) ! value of Auu(row,col)
+ val2 = Auu(i+iA, j+jA, mm) ! value of Auu(col,row)
+ if (val2 /= val1) then
+ if (abs(val2 - val1) > maxdiff) then
+ maxdiff = abs(val2 - val1)
+ rmax = this_rank; imax = i; jmax = j; mmax = m
+ endif
+ ! if difference is small, then fix the asymmetry by averaging values
+ !WHL - Here and below, I commented out the code to average asymmetric values.
+ ! The hope is that the asymmetries are too small to matter.
+ ! else print a warning and abort
+ if ( abs(val2-val1) < eps11*abs(diag_entry) ) then
+! avg_val = 0.5d0 * (val1 + val2)
+! Auu(i, j, m ) = avg_val
+! 
Auu(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Auu is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Auu(row,col), Auu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif + endif ! val2 /= val1 + + ! Check that Auv = (Avu)^T + val1 = Auv(i, j, m ) ! value of Auv(row,col) + val2 = Avu(i+iA, j+jA, mm) ! value of Avu(col,row) + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Auv(i, j, m ) = avg_val +! Avu(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Auv is not equal to (Avu)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Auv(row,col), Avu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif + endif ! val2 /= val1 + + ! Now check Avu and Avv + diag_entry = Avv(i,j,indxA_2d(0,0)) + + ! check that Avv = (Avv)^T + val1 = Avv(i, j, m ) ! value of Avv(row,col) + val2 = Avv(i+iA, j+jA, mm) ! value of Avv(col,row) + + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Avv(i, j, m ) = avg_val +! 
Avv(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Avv is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Avv(row,col), Avv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif + + endif ! val2 /= val1 - do j = 1, nNodesPerElement - do i = j, nNodesPerElement - if (abs(Kvv(i,j) - Kvv(j,i)) > eps11) then - write(iulog,*) 'Kvv is not symmetric' - write(iulog,*) 'i, j, Kvv(i,j), Kvv(j,i):', i, j, Kvv(i,j), Kvv(j,i) - stop - endif - enddo - enddo + ! Check that Avu = (Auv)^T + val1 = Avu(i, j, m ) ! value of Avu(row,col) + val2 = Auv(i+iA, j+jA, mm) ! value of Auv(col,row) - ! Check that Kuv = (Kvu)^T + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Avu(i, j, m ) = avg_val +! Auv(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif - do j = 1, nNodesPerElement - do i = 1, nNodesPerElement - if (abs(Kuv(i,j) - Kvu(j,i)) > eps11) then - write(iulog,*) 'Kuv /= (Kvu)^T' - write(iulog,*) 'i, j, Kuv(i,j), Kvu(j,i):', i, j, Kuv(i,j), Kvu(j,i) - stop - endif - enddo - enddo + endif ! val2 /= val1 - end subroutine check_symmetry_element_matrix + endif ! active_vertex + enddo ! i + enddo ! j + enddo ! iA + enddo ! 
jA + + if (verbose_matrix) then + global_maxdiff = parallel_reduce_max(maxdiff) + if (global_maxdiff > 0.0d0 .and. maxdiff == global_maxdiff) then + ! maxdiff is on this processor; compute and broadcast the global index + call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) + write(iulog,*) 'Max asymmetry =', global_maxdiff + write(iulog,*) ' ig, jg, m =', iglobal, jglobal, mmax + endif + endif + + end subroutine check_symmetry_assembled_matrix_2d !**************************************************************************** @@ -10797,228 +11003,22 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & this_rank, i, j, k, iA, jA, kA write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - - endif ! val2 /= val1 - - endif ! k+kA in bounds - - enddo ! kA - enddo ! iA - enddo ! jA - - enddo ! k - endif ! active_vertex - enddo ! i - enddo ! j - - if (verbose_matrix) then - global_maxdiff = parallel_reduce_max(maxdiff) - if (global_maxdiff > 0.0d0 .and. maxdiff == global_maxdiff) then - ! maxdiff is on this processor; compute and broadcast the global index - call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) - write(iulog,*) 'Max asymmetry =', global_maxdiff - write(iulog,*) ' i, j, ig, jg, k, m =', imax, jmax, iglobal, jglobal, kmax, mmax - endif - endif - - end subroutine check_symmetry_assembled_matrix_3d - -!**************************************************************************** - - subroutine check_symmetry_assembled_matrix_2d(nx, ny, & - parallel, & - active_vertex, & - Auu, Auv, Avu, Avv) - - !------------------------------------------------------------------ - ! Check that the assembled stiffness matrix is symmetric. - ! This is true provided that (1) Auu = (Auu)^T - ! (2) Avv = (Avv)^T - ! (3) Auv = (Avu)^T - ! 
The A matrices are assembled in a dense fashion to save storage - ! and preserve the i/j/k structure of the grid. - ! - ! There can be small differences from perfect symmetry due to roundoff error. - ! These differences are fixed provided they are small enough. - !------------------------------------------------------------------ - - integer, intent(in) :: & - nx, ny ! horizontal grid dimensions - - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication - - logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! T for columns (i,j) where velocity is computed, else F - - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & - Auu, Auv, Avu, Avv ! components of assembled stiffness matrix - ! - ! Auu | Auv - ! _____|____ - ! | - ! Avu | Avv - - integer :: i, j, iA, jA, m, mm, iglobal, jglobal - - real(dp) :: val1, val2 ! values of matrix coefficients - - real(dp) :: maxdiff, global_maxdiff, diag_entry, avg_val - - integer :: rmax, imax, jmax, mmax - - integer :: & - staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid - staggered_jlo, staggered_jhi - - staggered_ilo = parallel%staggered_ilo - staggered_ihi = parallel%staggered_ihi - staggered_jlo = parallel%staggered_jlo - staggered_jhi = parallel%staggered_jhi - - ! Check matrix for symmetry - - ! Here we correct for small differences from symmetry due to roundoff error. - ! The maximum departure from symmetry is set to be a small fraction - ! of the diagonal entry for the row. - ! If the departure from symmetry is larger than this, then the model prints a warning - ! and/or aborts. - - maxdiff = 0.d0 - rmax = 0; imax = 0; jmax = 0; mmax = 0 - - ! Loop over locally owned vertices. - ! Each active vertex is associate with 2*nz matrix rows belonging to this processor. 
- - do jA = -1, 1 - do iA = -1, 1 - m = indxA_2d( iA, jA) - mm = indxA_2d(-iA,-jA) - - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - - ! Check Auu and Auv for symmetry - diag_entry = Auu(i,j,indxA_2d(0,0)) - - !WHL - debug - if (diag_entry /= diag_entry) then - write(iulog,*) 'WARNING: Diagonal NaN: i, j =', i, j - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - - ! Check that Auu = Auu^T - val1 = Auu(i, j, m ) ! value of Auu(row,col) - val2 = Auu(i+iA, j+jA, mm) ! value of Auu(col,row) - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) then - maxdiff = abs(val2 - val1) - rmax = this_rank; imax = i; jmax = j; mmax = m - endif - ! if difference is small, then fix the asymmetry by averaging values - !WHL - Here and below, I commented out the code to average asymmetric values. - ! The hope is that the asymmetries are too small to matter. - ! else print a warning and abort - if ( abs(val2-val1) < eps11*abs(diag_entry) ) then -! avg_val = 0.5d0 * (val1 + val2) -! Auu(i, j, m ) = avg_val -! Auu(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Auu is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Auu(row,col), Auu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - endif ! val2 /= val1 - - ! Check that Auv = (Avu)^T - val1 = Auv(i, j, m ) ! value of Auv(row,col) - val2 = Avu(i+iA, j+jA, mm) ! value of Avu(col,row) - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) then - maxdiff = abs(val2 - val1) - rmax = this_rank; imax = i; jmax = j; mmax = m - endif - ! if difference is small, then fix the asymmetry by averaging values - ! else print a warning and abort - if ( abs(val2-val1) < eps11*abs(diag_entry) ) then -! 
avg_val = 0.5d0 * (val1 + val2) -! Auv(i, j, m ) = avg_val -! Avu(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Auv is not equal to (Avu)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Auv(row,col), Avu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - endif ! val2 /= val1 - - ! Now check Avu and Avv - diag_entry = Avv(i,j,indxA_2d(0,0)) - - ! check that Avv = (Avv)^T - val1 = Avv(i, j, m ) ! value of Avv(row,col) - val2 = Avv(i+iA, j+jA, mm) ! value of Avv(col,row) - - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) then - maxdiff = abs(val2 - val1) - rmax = this_rank; imax = i; jmax = j; mmax = m - endif - ! if difference is small, then fix the asymmetry by averaging values - ! else print a warning and abort - if ( abs(val2-val1) < eps11*abs(diag_entry) ) then -! avg_val = 0.5d0 * (val1 + val2) -! Avv(i, j, m ) = avg_val -! Avv(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Avv is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Avv(row,col), Avv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - - endif ! val2 /= val1 + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif - ! Check that Avu = (Auv)^T - val1 = Avu(i, j, m ) ! value of Avu(row,col) - val2 = Auv(i+iA, j+jA, mm) ! value of Auv(col,row) + endif ! val2 /= val1 - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) then - maxdiff = abs(val2 - val1) - rmax = this_rank; imax = i; jmax = j; mmax = m - endif - ! if difference is small, then fix the asymmetry by averaging values - ! else print a warning and abort - if ( abs(val2-val1) < eps11*abs(diag_entry) ) then -! avg_val = 0.5d0 * (val1 + val2) -! 
Avu(i, j, m ) = avg_val -! Auv(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif + endif ! k+kA in bounds - endif ! val2 /= val1 + enddo ! kA + enddo ! iA + enddo ! jA - endif ! active_vertex - enddo ! i - enddo ! j - enddo ! iA - enddo ! jA + enddo ! k + endif ! active_vertex + enddo ! i + enddo ! j if (verbose_matrix) then global_maxdiff = parallel_reduce_max(maxdiff) @@ -11026,53 +11026,51 @@ subroutine check_symmetry_assembled_matrix_2d(nx, ny, & ! maxdiff is on this processor; compute and broadcast the global index call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) write(iulog,*) 'Max asymmetry =', global_maxdiff - write(iulog,*) ' ig, jg, m =', iglobal, jglobal, mmax + write(iulog,*) ' i, j, ig, jg, k, m =', imax, jmax, iglobal, jglobal, kmax, mmax endif endif - end subroutine check_symmetry_assembled_matrix_2d + end subroutine check_symmetry_assembled_matrix_3d !**************************************************************************** - subroutine write_matrix_elements_3d(nx, ny, nz, & - nNodesSolve, nodeID, & - iNodeIndex, jNodeIndex, & - kNodeIndex, & - Auu, Auv, & - Avu, Avv, & - bu, bv) + subroutine write_matrix_elements_2d(nx, ny, & + nVerticesSolve, vertexID, & + iVertexIndex, jVertexIndex, & + Auu, Auv, & + Avu, Avv, & + bu, bv) ! Write matrix elements to text files. ! Note: Does not work when running on more than one task. integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions - nz, & ! number of vertical levels at which velocity is computed - nNodesSolve ! number of nodes where we solve for velocity + nVerticesSolve ! 
number of vertices where we solve for velocity - integer, dimension(nz,nx-1,ny-1), intent(in) :: & - nodeID ! ID for each node + integer, dimension(nx-1,ny-1), intent(in) :: & + vertexID ! ID for each vertex integer, dimension(:), intent(in) :: & - iNodeIndex, jNodeIndex, kNodeIndex ! i, j and k indices of active nodes + iVertexIndex, jVertexIndex ! i and j indices of active vertices - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv ! 1st dimension = node and its nearest neighbors in x, y and z direction - ! other dimensions = (k,i,j) indices + Avu, Avv ! 1st dimension = vertex and its nearest neighbors in x and y direction + ! other dimensions = (i,j) indices - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts ! Local variables integer :: rowA, colA - integer :: i, j, k, m, iA, jA, kA + integer :: i, j, m, iA, jA - real(dp), dimension(nNodesSolve, nNodesSolve) :: & + real(dp), dimension(nVerticesSolve, nVerticesSolve) :: & Auu_val, Auv_val, Avu_val, Avv_val ! dense matrices - real(dp), dimension(nNodesSolve) :: nonzeros + real(dp), dimension(nVerticesSolve) :: nonzeros if (tasks > 1) then call write_log('Error: Cannot write matrix elements to files when tasks > 1', GM_FATAL) @@ -11083,35 +11081,29 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & Avu_val(:,:) = 0.d0 Avv_val(:,:) = 0.d0 - do rowA = 1, nNodesSolve - - i = iNodeIndex(rowA) - j = jNodeIndex(rowA) - k = kNodeIndex(rowA) + do rowA = 1, nVerticesSolve - do kA = -1, 1 + i = iVertexIndex(rowA) + j = jVertexIndex(rowA) do jA = -1, 1 do iA = -1, 1 - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & + if ( (i+iA >= 1 .and. i+iA <= nx-1) & .and. & (j+jA >= 1 .and. 
j+jA <= ny-1) ) then - colA = nodeID(k+kA, i+iA, j+jA) ! ID for neighboring node - m = indxA_3d(iA,jA,kA) + colA = vertexID(i+iA, j+jA) ! ID for neighboring vertex + m = indxA_2d(iA,jA) if (colA > 0) then - Auu_val(rowA, colA) = Auu(m,k,i,j) - Auv_val(rowA, colA) = Auv(m,k,i,j) - Avu_val(rowA, colA) = Avu(m,k,i,j) - Avv_val(rowA, colA) = Avv(m,k,i,j) + Auu_val(rowA, colA) = Auu(i,j,m) + Auv_val(rowA, colA) = Auv(i,j,m) + Avu_val(rowA, colA) = Avu(i,j,m) + Avv_val(rowA, colA) = Avv(i,j,m) endif - endif ! i+iA, j+jA, and k+kA in bounds + endif ! i+iA and j+jA in bounds - enddo ! kA enddo ! iA enddo ! jA @@ -11120,9 +11112,9 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & !WHL - bug check write(iulog,*) ' ' write(iulog,*) 'nonzeros per row:' - do rowA = 1, nNodesSolve + do rowA = 1, nVerticesSolve nonzeros(rowA) = 0 - do colA = 1, nNodesSolve + do colA = 1, nVerticesSolve if (abs(Auu_val(rowA,colA)) > 1.d-11) then nonzeros(rowA) = nonzeros(rowA) + 1 endif @@ -11139,12 +11131,12 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & open(unit=12, file='Avu.'//matrix_label, status='unknown') open(unit=13, file='Avv.'//matrix_label, status='unknown') - do rowA = 1, nNodesSolve + do rowA = 1, nVerticesSolve write(10,'(i6)',advance='no') rowA write(11,'(i6)',advance='no') rowA write(12,'(i6)',advance='no') rowA write(13,'(i6)',advance='no') rowA - do colA = 1, nNodesSolve + do colA = 1, nVerticesSolve write(10,'(e16.8)',advance='no') Auu_val(rowA,colA) write(11,'(e16.8)',advance='no') Auv_val(rowA,colA) write(12,'(e16.8)',advance='no') Avu_val(rowA,colA) @@ -11166,57 +11158,58 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & ! 
write load vectors to file open(unit=14, file='bu.'//matrix_label, status='unknown') open(unit=15, file='bv.'//matrix_label, status='unknown') - do rowA = 1, nNodesSolve - i = iNodeIndex(rowA) - j = jNodeIndex(rowA) - k = kNodeIndex(rowA) - write(14,'(i6, e16.8)') rowA, bu(k,i,j) - write(15,'(i6, e16.8)') rowA, bv(k,i,j) + do rowA = 1, nVerticesSolve + i = iVertexIndex(rowA) + j = jVertexIndex(rowA) + write(14,'(i6, e16.8)') rowA, bu(i,j) + write(15,'(i6, e16.8)') rowA, bv(i,j) enddo close(14) close(15) - end subroutine write_matrix_elements_3d - + end subroutine write_matrix_elements_2d + !**************************************************************************** - subroutine write_matrix_elements_2d(nx, ny, & - nVerticesSolve, vertexID, & - iVertexIndex, jVertexIndex, & - Auu, Auv, & - Avu, Avv, & - bu, bv) + subroutine write_matrix_elements_3d(nx, ny, nz, & + nNodesSolve, nodeID, & + iNodeIndex, jNodeIndex, & + kNodeIndex, & + Auu, Auv, & + Avu, Avv, & + bu, bv) ! Write matrix elements to text files. ! Note: Does not work when running on more than one task. integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions - nVerticesSolve ! number of vertices where we solve for velocity + nz, & ! number of vertical levels at which velocity is computed + nNodesSolve ! number of nodes where we solve for velocity - integer, dimension(nx-1,ny-1), intent(in) :: & - vertexID ! ID for each vertex + integer, dimension(nz,nx-1,ny-1), intent(in) :: & + nodeID ! ID for each node integer, dimension(:), intent(in) :: & - iVertexIndex, jVertexIndex ! i and j indices of active vertices + iNodeIndex, jNodeIndex, kNodeIndex ! i, j and k indices of active nodes - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv ! 1st dimension = vertex and its nearest neighbors in x and y direction - ! 
other dimensions = (i,j) indices + Avu, Avv ! 1st dimension = node and its nearest neighbors in x, y and z direction + ! other dimensions = (k,i,j) indices - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts ! Local variables integer :: rowA, colA - integer :: i, j, m, iA, jA + integer :: i, j, k, m, iA, jA, kA - real(dp), dimension(nVerticesSolve, nVerticesSolve) :: & + real(dp), dimension(nNodesSolve, nNodesSolve) :: & Auu_val, Auv_val, Avu_val, Avv_val ! dense matrices - real(dp), dimension(nVerticesSolve) :: nonzeros + real(dp), dimension(nNodesSolve) :: nonzeros if (tasks > 1) then call write_log('Error: Cannot write matrix elements to files when tasks > 1', GM_FATAL) @@ -11227,29 +11220,35 @@ subroutine write_matrix_elements_2d(nx, ny, & Avu_val(:,:) = 0.d0 Avv_val(:,:) = 0.d0 - do rowA = 1, nVerticesSolve + do rowA = 1, nNodesSolve - i = iVertexIndex(rowA) - j = jVertexIndex(rowA) + i = iNodeIndex(rowA) + j = jNodeIndex(rowA) + k = kNodeIndex(rowA) + + do kA = -1, 1 do jA = -1, 1 do iA = -1, 1 - if ( (i+iA >= 1 .and. i+iA <= nx-1) & + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & .and. & (j+jA >= 1 .and. j+jA <= ny-1) ) then - colA = vertexID(i+iA, j+jA) ! ID for neighboring vertex - m = indxA_2d(iA,jA) + colA = nodeID(k+kA, i+iA, j+jA) ! ID for neighboring node + m = indxA_3d(iA,jA,kA) if (colA > 0) then - Auu_val(rowA, colA) = Auu(i,j,m) - Auv_val(rowA, colA) = Auv(i,j,m) - Avu_val(rowA, colA) = Avu(i,j,m) - Avv_val(rowA, colA) = Avv(i,j,m) + Auu_val(rowA, colA) = Auu(m,k,i,j) + Auv_val(rowA, colA) = Auv(m,k,i,j) + Avu_val(rowA, colA) = Avu(m,k,i,j) + Avv_val(rowA, colA) = Avv(m,k,i,j) endif - endif ! i+iA and j+jA in bounds + endif ! i+iA, j+jA, and k+kA in bounds + enddo ! kA enddo ! iA enddo ! 
jA @@ -11258,9 +11257,9 @@ subroutine write_matrix_elements_2d(nx, ny, & !WHL - bug check write(iulog,*) ' ' write(iulog,*) 'nonzeros per row:' - do rowA = 1, nVerticesSolve + do rowA = 1, nNodesSolve nonzeros(rowA) = 0 - do colA = 1, nVerticesSolve + do colA = 1, nNodesSolve if (abs(Auu_val(rowA,colA)) > 1.d-11) then nonzeros(rowA) = nonzeros(rowA) + 1 endif @@ -11277,12 +11276,12 @@ subroutine write_matrix_elements_2d(nx, ny, & open(unit=12, file='Avu.'//matrix_label, status='unknown') open(unit=13, file='Avv.'//matrix_label, status='unknown') - do rowA = 1, nVerticesSolve + do rowA = 1, nNodesSolve write(10,'(i6)',advance='no') rowA write(11,'(i6)',advance='no') rowA write(12,'(i6)',advance='no') rowA write(13,'(i6)',advance='no') rowA - do colA = 1, nVerticesSolve + do colA = 1, nNodesSolve write(10,'(e16.8)',advance='no') Auu_val(rowA,colA) write(11,'(e16.8)',advance='no') Auv_val(rowA,colA) write(12,'(e16.8)',advance='no') Avu_val(rowA,colA) @@ -11304,100 +11303,18 @@ subroutine write_matrix_elements_2d(nx, ny, & ! write load vectors to file open(unit=14, file='bu.'//matrix_label, status='unknown') open(unit=15, file='bv.'//matrix_label, status='unknown') - do rowA = 1, nVerticesSolve - i = iVertexIndex(rowA) - j = jVertexIndex(rowA) - write(14,'(i6, e16.8)') rowA, bu(i,j) - write(15,'(i6, e16.8)') rowA, bv(i,j) + do rowA = 1, nNodesSolve + i = iNodeIndex(rowA) + j = jNodeIndex(rowA) + k = kNodeIndex(rowA) + write(14,'(i6, e16.8)') rowA, bu(k,i,j) + write(15,'(i6, e16.8)') rowA, bv(k,i,j) enddo close(14) close(15) - end subroutine write_matrix_elements_2d - -!**************************************************************************** - !TODO - Either delete this subroutine, or switch the indices. Not currently used. - subroutine compress_3d_to_2d(nx, ny, nz, & - Auu, Auv, & - Avu, Avv, & - bu, bv, & - Auu_2d, Auv_2d, & - Avu_2d, Avv_2d, & - bu_2d, bv_2d) - - !---------------------------------------------------------------- - ! 
Form the 2D matrix and rhs by combining terms from the 3D matrix and rhs. - ! This combination is based on the assumption of no vertical shear; - ! i.e., uvel and vvel have the same value at each level in a given column. - !---------------------------------------------------------------- - - !---------------------------------------------------------------- - ! Input-output arguments - !---------------------------------------------------------------- - - integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions - nz ! number of vertical levels where velocity is computed - - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & - Auu, Auv, & ! assembled 3D stiffness matrix, divided into 4 parts - Avu, Avv - - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & - bu, bv ! assembled 3D rhs vector, divided into 2 parts - - real(dp), dimension(nNodeNeighbors_2d,nx-1,ny-1), intent(out) :: & - Auu_2d, Auv_2d, &! assembled 2D (SSA) stiffness matrix, divided into 4 parts - Avu_2d, Avv_2d - - real(dp), dimension(nx-1,ny-1), intent(out) :: & - bu_2d, bv_2d ! assembled 2D (SSA) rhs vector, divided into 2 parts - - !---------------------------------------------------------------- - ! Local variables - !---------------------------------------------------------------- - - integer :: i, j, k, iA, jA, kA, m, m2 - - ! Initialize 2D matrix and rhs - - Auu_2d(:,:,:) = 0.d0 - Auv_2d(:,:,:) = 0.d0 - Avu_2d(:,:,:) = 0.d0 - Avv_2d(:,:,:) = 0.d0 - bu_2d(:,:) = 0.d0 - bv_2d(:,:) = 0.d0 - - ! Form 2D matrix and rhs - - do j = 1, ny-1 - do i = 1, nx-1 - do k = 1, nz - - ! matrix - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 - m = indxA_3d(iA,jA,kA) - m2 = indxA_2d(iA,jA) - Auu_2d(m2,i,j) = Auu_2d(m2,i,j) + Auu(m,k,i,j) - Auv_2d(m2,i,j) = Auv_2d(m2,i,j) + Auv(m,k,i,j) - Avu_2d(m2,i,j) = Avu_2d(m2,i,j) + Avu(m,k,i,j) - Avv_2d(m2,i,j) = Avv_2d(m2,i,j) + Avv(m,k,i,j) - enddo ! iA - enddo ! jA - enddo ! kA - - ! 
rhs - bu_2d(i,j) = bu_2d(i,j) + bu(k,i,j) - bv_2d(i,j) = bv_2d(i,j) + bv(k,i,j) - - enddo ! k - enddo ! i - enddo ! j - - end subroutine compress_3d_to_2d - + end subroutine write_matrix_elements_3d + !**************************************************************************** end module glissade_velo_higher From 40718e1997fd2d4ce6ff54be98fdc8a8968b34e8 Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Fri, 23 Jan 2026 18:25:27 -0700 Subject: [PATCH 18/21] Removed some verbose diagnostics --- libglissade/glissade_velo_higher.F90 | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 66074862..a4e7b86b 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -227,8 +227,8 @@ module glissade_velo_higher ! logical :: verbose_bfric = .true. logical :: verbose_trilinos = .false. ! logical :: verbose_trilinos = .true. -! logical :: verbose_beta = .false. - logical :: verbose_beta = .true. + logical :: verbose_beta = .false. +! logical :: verbose_beta = .true. logical :: verbose_efvs = .false. ! logical :: verbose_efvs = .true. logical :: verbose_tau = .false. @@ -6023,11 +6023,6 @@ subroutine compute_3d_velocity_L1L2(nx, ny, & uvel(1:nz-1,:,:) = 0.d0 vvel(1:nz-1,:,:) = 0.d0 - !WHL - debug - call point_diag(xVertex, 'xVertex', itest, jtest, rtest, 7, 7) - call point_diag(yVertex, 'yVertex', itest, jtest, rtest, 7, 7) - call point_diag(active_cell, 'active_cell', itest, jtest, rtest, 7, 7) - ! Compute viscosity integral and strain rates in elements. ! Loop over all cells that border locally owned vertices. From 69a919998759a116b8589e518832d17643f0247a Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Mon, 2 Feb 2026 17:01:27 -0700 Subject: [PATCH 19/21] Fixed a bug in the Picard acceleration Several years also I added the option which_ho_nonlinear for Picard acceleration. 
The basic idea is that after each velocity solve, CISM computes a vector equal to the difference between the new solution and the old solution. The model then tests whether extending that vector (i.e., multiplying it by a scalar > 1) reduces the residual A*x - B. If so, the vector is extended. The logic is such that if an extended solution vector fails to reduce the residual, we back off to the latest attempt that succeeded in reducing the residual. In other words, we discard the trial matrix arrays Auu, Avv, etc., and go back to the saved arrays Auu_sav, Avv_sav. In addition to the matrix arrays, several other arrays must be saved and restored. When investigating the slow convergence for certain grid cells, I realized that for a DIVA solve, some additional arrays need to be saved: beta_eff_x, beta_eff_y, and omega_k. Using the (bad) trial values instead of reverting to the (good) saved values means that the subsequent DIVA 3d velocity integration is not quite correct. This commit saves the additional arrays, which significantly improves convergence of the nonlinear solver for Antarctic runs. I tested the changes for a new modified Schoof sliding law. Before this bug fix, the solver often failed to converge in 50 or more iterations, even after spinning up for several thousand years. With the bug fix, the solver now converges in ~20 nonlinear iterations. Note: This commit is answer-changing for DIVA runs with Picard acceleration. --- libglissade/glissade_velo_higher.F90 | 240 +++++++++++++++------------ 1 file changed, 132 insertions(+), 108 deletions(-) diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index a4e7b86b..1ea5b2eb 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -227,8 +227,8 @@ module glissade_velo_higher ! logical :: verbose_bfric = .true. logical :: verbose_trilinos = .false. ! logical :: verbose_trilinos = .true. - logical :: verbose_beta = .false. -
logical :: verbose_beta = .true. +! logical :: verbose_beta = .false. + logical :: verbose_beta = .true. logical :: verbose_efvs = .false. ! logical :: verbose_efvs = .true. logical :: verbose_tau = .false. @@ -1058,11 +1058,13 @@ subroutine glissade_velo_higher_solve(model, & uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration duvel_2d, dvvel_2d, & ! difference between current and previous velocity solutions uvel_2d_sav, vvel_2d_sav, & ! current best value for velocity solution (smallest residual) - beta_internal_sav ! beta_internal associated with saved velocity + beta_internal_sav, & ! beta_internal associated with saved velocity + beta_eff_x_sav, beta_eff_y_sav real(dp), dimension(:,:,:), allocatable :: & Auu_2d_sav, Auv_2d_sav, & ! assembled matrices associated with (uvel_2d_sav, vvel_2d_sav) - Avu_2d_sav, Avv_2d_sav + Avu_2d_sav, Avv_2d_sav, & + omega_k_sav ! for an accelerated 3D solve: real(dp), dimension(:,:,:), allocatable :: & @@ -1288,7 +1290,15 @@ subroutine glissade_velo_higher_solve(model, & allocate(vsav_2d(nx-1,ny-1)) allocate(resid_u_2d(nx-1,ny-1)) allocate(resid_v_2d(nx-1,ny-1)) - if (accel_picard) then + else ! 
3d + allocate(Auu(nNodeNeighbors_3d,nz,nx-1,ny-1)) + allocate(Auv(nNodeNeighbors_3d,nz,nx-1,ny-1)) + allocate(Avu(nNodeNeighbors_3d,nz,nx-1,ny-1)) + allocate(Avv(nNodeNeighbors_3d,nz,nx-1,ny-1)) + endif + + if (accel_picard) then + if (solve_2d) then allocate(uvel_2d_old(nx-1,ny-1)) allocate(vvel_2d_old(nx-1,ny-1)) allocate(duvel_2d(nx-1,ny-1)) @@ -1299,14 +1309,7 @@ subroutine glissade_velo_higher_solve(model, & allocate(Auv_2d_sav(nx-1,ny-1,nNodeNeighbors_2d)) allocate(Avu_2d_sav(nx-1,ny-1,nNodeNeighbors_2d)) allocate(Avv_2d_sav(nx-1,ny-1,nNodeNeighbors_2d)) - allocate(beta_internal_sav(nx-1,ny-1)) - endif - else - allocate(Auu(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(Auv(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(Avu(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(Avv(nNodeNeighbors_3d,nz,nx-1,ny-1)) - if (accel_picard) then + else ! 3d allocate(uvel_old(nz,nx-1,ny-1)) allocate(vvel_old(nz,nx-1,ny-1)) allocate(duvel(nz,nx-1,ny-1)) @@ -1317,9 +1320,13 @@ subroutine glissade_velo_higher_solve(model, & allocate(Auv_sav(nNodeNeighbors_3d,nz,nx-1,ny-1)) allocate(Avu_sav(nNodeNeighbors_3d,nz,nx-1,ny-1)) allocate(Avv_sav(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(beta_internal_sav(nx-1,ny-1)) endif - endif + allocate(beta_internal_sav(nx-1,ny-1)) + !Note: The next three are used only for DIVA, but it's simpler to allocate them regardless + allocate(beta_eff_x_sav(nx-1,ny-1)) + allocate(beta_eff_y_sav(nx-1,ny-1)) + allocate(omega_k_sav(nz,nx,ny)) + endif ! accel_picard if (whichapprox == HO_APPROX_DIVA) then !! call parallel_halo(efvs, parallel) ! 
efvs halo update is in glissade_diagnostic_variable_solve @@ -2203,8 +2210,7 @@ subroutine glissade_velo_higher_solve(model, & itest, jtest, rtest, & active_vertex, diva_level_index, & ice_plus_land_mask, & - stag_omega, omega_k, & - beta_internal, & + omega_k, beta_internal, & beta_eff_x, beta_eff_y, & stag_theta_slope_x, stag_theta_slope_y, & stag_diva_slope_factor_x, & @@ -3016,6 +3022,7 @@ subroutine glissade_velo_higher_solve(model, & if (solve_2d) then + call t_startf('glissade_resid_vec') call compute_residual_vector_2d(nx, ny, & parallel, & @@ -3032,41 +3039,40 @@ subroutine glissade_velo_higher_solve(model, & call t_startf('glissade_accel_picard') if (accel_picard) then - if (verbose_picard) then - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'Saved L2 norm, new L2 norm:', L2_norm_alpha_sav, L2_norm - endif - call point_diag(resid_u_2d, 'resid_u_2d', itest, jtest, rtest, 7, 7, '(e10.3)') - call point_diag(uvel_2d, 'uvel_2d', itest, jtest, rtest, 7, 7, '(f10.3)') + if (verbose_picard .and. 
this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'Saved L2 norm, new L2 norm:', L2_norm_alpha_sav, L2_norm endif if (counter >= 2) then - call evaluate_accelerated_picard_2d(nx, ny, & - L2_norm, L2_norm_large, & - L2_norm_alpha_sav, & - alpha_accel, alpha_accel_max, & - gamma_accel, resid_reduction_threshold, & - uvel_2d, vvel_2d, & - Auu_2d, Auv_2d, & - Avu_2d, Avv_2d, & - uvel_2d_old, vvel_2d_old, & - duvel_2d, dvvel_2d, & - uvel_2d_sav, vvel_2d_sav, & - Auu_2d_sav, Auv_2d_sav, & - Avu_2d_sav, Avv_2d_sav, & - beta_internal, beta_internal_sav, & - assembly_is_done) + call evaluate_accelerated_picard_2d(& + whichapprox, rtest, & + L2_norm, L2_norm_large, & + L2_norm_alpha_sav, & + alpha_accel, alpha_accel_max, & + gamma_accel, resid_reduction_threshold, & + uvel_2d, vvel_2d, & + Auu_2d, Auv_2d, & + Avu_2d, Avv_2d, & + uvel_2d_old, vvel_2d_old, & + duvel_2d, dvvel_2d, & + uvel_2d_sav, vvel_2d_sav, & + Auu_2d_sav, Auv_2d_sav, & + Avu_2d_sav, Avv_2d_sav, & + beta_internal, beta_internal_sav, & + beta_eff_x, beta_eff_x_sav, & + beta_eff_y, beta_eff_y_sav, & + omega_k, omega_k_sav, & + assembly_is_done) else ! counter = 1 ! proceed to the matrix solution assembly_is_done = .true. - if (verbose_picard .and. main_task) then - write(iulog,*) 'nonlinear counter = 1; continue to matrix solver' - endif + if (verbose_picard .and. this_rank == rtest) & + write(iulog,*) 'nonlinear counter = 1; continue to matrix solver' endif ! 
counter >= 2 @@ -3107,20 +3113,22 @@ subroutine glissade_velo_higher_solve(model, & if (counter >= 2) then - call evaluate_accelerated_picard_3d(L2_norm, L2_norm_large, & - L2_norm_alpha_sav, & - alpha_accel, alpha_accel_max, & - gamma_accel, resid_reduction_threshold, & - uvel, vvel, & - Auu, Auv, & - Avu, Avv, & - uvel_old, vvel_old, & - duvel, dvvel, & - uvel_sav, vvel_sav, & - Auu_sav, Auv_sav, & - Avu_sav, Avv_sav, & - beta_internal, beta_internal_sav, & - assembly_is_done) + call evaluate_accelerated_picard_3d(& + rtest, & + L2_norm, L2_norm_large, & + L2_norm_alpha_sav, & + alpha_accel, alpha_accel_max, & + gamma_accel, resid_reduction_threshold, & + uvel, vvel, & + Auu, Auv, & + Avu, Avv, & + uvel_old, vvel_old, & + duvel, dvvel, & + uvel_sav, vvel_sav, & + Auu_sav, Auv_sav, & + Avu_sav, Avv_sav, & + beta_internal, beta_internal_sav, & + assembly_is_done) else ! counter = 1 @@ -3545,8 +3553,7 @@ subroutine glissade_velo_higher_solve(model, & itest, jtest, rtest, & active_vertex, diva_level_index, & ice_plus_land_mask, & - stag_omega, omega_k, & - beta_internal, & + omega_k, beta_internal, & beta_eff_x, beta_eff_y, & stag_theta_slope_x, stag_theta_slope_y, & stag_diva_slope_factor_x, & @@ -3840,6 +3847,8 @@ subroutine glissade_velo_higher_solve(model, & deallocate(uvel_2d_sav, vvel_2d_sav) deallocate(Auu_2d_sav, Auv_2d_sav, Avu_2d_sav, Avv_2d_sav) deallocate(beta_internal_sav) + deallocate(beta_eff_x_sav, beta_eff_y_sav) + deallocate(omega_k_sav) endif else deallocate(Auu, Auv, Avu, Avv) @@ -5717,8 +5726,7 @@ subroutine compute_3d_velocity_diva(& itest, jtest, rtest, & active_vertex, diva_level_index, & ice_plus_land_mask, & - stag_omega, omega_k, & - beta, & + omega_k, beta, & beta_eff_x, beta_eff_y, & stag_theta_slope_x, stag_theta_slope_y, & stag_diva_slope_factor_x, & @@ -5768,9 +5776,6 @@ subroutine compute_3d_velocity_diva(& stag_theta_slope_y, & ! slope angle (radians) in y direction at vertices stag_diva_slope_factor_x, & ! 
slope correction factor in x direction stag_diva_slope_factor_y, & ! slope correction factor in y direction - stag_omega, & ! double integral, defined by Goldberg eq. 35 (m^2/(Pa yr)) - ! already interpolated to staggered grid - ! Note: omega here = Goldberg's omega/H uvel_2d, vvel_2d ! depth-integrated mean velocity; solution of 2D velocity solve (m/yr) real(dp), dimension(nx-1,ny-1), intent(out) :: & @@ -5787,12 +5792,6 @@ subroutine compute_3d_velocity_diva(& stag_omega_k ! single integral, defined by Goldberg eq. 32 (m^2/(Pa yr)) ! interpolated to staggered grid - real(dp), dimension(nx-1,ny-1) :: & - stag_integral ! integral that relates bed velocity to uvel_2d and vvel_2d - ! = stag_omega for diva_level_index = 0 - ! = stag_omega_k(k,:,:) for other values of diva_level_index - - real(dp) :: & slope_correction_x, & ! slope-based correction for vertical shear in x direction slope_correction_y ! slope-based correction for vertical shear in y direction @@ -5812,15 +5811,6 @@ subroutine compute_3d_velocity_diva(& stagger_margin_in = 1) enddo - ! Identify the appropriate integral for relating uvel_2d/vvel_2d to the bed velocity - - if (diva_level_index == 0) then ! solved for mean velocity - stag_integral(:,:) = stag_omega(:,:) - else - k = diva_level_index - stag_integral(:,:) = stag_omega_k(k,:,:) - endif - !---------------------------------------------------------------- ! Compute the 3D velocity field !---------------------------------------------------------------- @@ -7967,7 +7957,7 @@ subroutine compute_element_matrix(whichapprox, nNodesPerElement, & !---------------------------------------------------------------- integer, intent(in) :: & - whichapprox ! which Stokes approximation to use (BP, SIA, SSA) + whichapprox ! which Stokes approximation to use (SIA, SSA, DIVA, L1L2, BP) integer, intent(in) :: nNodesPerElement ! 
number of nodes per element @@ -9576,9 +9566,11 @@ subroutine compute_residual_vector_2d(nx, ny, & if (this_rank==rtest) then i = itest j = jtest - write(iulog,*) 'In compute_residual_vector_2d: task, i, j =', this_rank, i, j - write(iulog, '(a16, 2f13.7, 2e13.5)') & - ' u, v, ru, rv: ', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j) + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' ' + write(iulog,*) 'In compute_residual_vector_2d: test ig, jg =', i, j + write(iulog, '(a15, 2f12.5, 2e13.5)') & + ' u, v, ru, rv:', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j) endif ! Compute max value of (squared) residual on this task. @@ -9592,7 +9584,7 @@ subroutine compute_residual_vector_2d(nx, ny, & do i = staggered_ilo, staggered_ihi if (abs((resid_sq(i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, rmax:', & + write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, global rmax:', & iglobal, jglobal, resid_u(i,j), resid_v(i,j), sqrt(global_max_resid) write(iulog,*) ' ' endif @@ -9839,7 +9831,7 @@ end subroutine compute_residual_vector_3d !**************************************************************************** subroutine evaluate_accelerated_picard_2d(& - nx, ny, & + whichapprox, rtest, & L2_norm, L2_norm_large, & L2_norm_alpha_sav, & alpha_accel, alpha_accel_max, & @@ -9853,10 +9845,14 @@ subroutine evaluate_accelerated_picard_2d(& Auu_2d_sav, Auv_2d_sav, & Avu_2d_sav, Avv_2d_sav, & beta_internal, beta_internal_sav, & + beta_eff_x, beta_eff_x_sav, & + beta_eff_y, beta_eff_y_sav, & + omega_k, omega_k_sav, & assembly_is_done) integer, intent(in) :: & - nx, ny ! number of grid cells in each direction + rtest, & ! rank for diagnostic point + whichapprox ! which Stokes approximation to use (SIA, SSA, DIVA, L1L2, BP) real(dp), intent(in) :: & L2_norm, & ! 
latest value of L2 norm of residual @@ -9869,7 +9865,7 @@ subroutine evaluate_accelerated_picard_2d(& alpha_accel, & ! factor for extending the vector (duvel, dvvel) to reduce the residual L2_norm_alpha_sav ! value of L2 norm of residual, given the previous alpha_accel - real(dp), dimension(nx-1,ny-1), intent(inout) :: & + real(dp), dimension(:,:), intent(inout) :: & uvel_2d, vvel_2d, & ! latest guess for the velocity solution uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration duvel_2d, dvvel_2d, & ! difference between old velocity solution and latest solution @@ -9877,7 +9873,14 @@ subroutine evaluate_accelerated_picard_2d(& beta_internal, & ! beta_internal as a function of uvel_2d and vvel_2d beta_internal_sav ! beta_internal as a function of uvel_2d_sav and vvel_2d_sav - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + real(dp), dimension(:,:), intent(inout) :: & + beta_eff_x, beta_eff_x_sav, & + beta_eff_y, beta_eff_y_sav + + real(dp), dimension(:,:,:), intent(inout) :: & + omega_k, omega_k_sav + + real(dp), dimension(:,:,:), intent(inout) :: & Auu_2d, Auv_2d, & ! latest assembled matrices as a function of uvel_2d and vvel_2d Avu_2d, Avv_2d, & Auu_2d_sav, Auv_2d_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav @@ -9886,6 +9889,10 @@ subroutine evaluate_accelerated_picard_2d(& logical, intent(inout) :: & assembly_is_done ! if true, then accept the current assembled matrices and proceed to solution + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Evaluate Picard acceleration: L2_norm_sav, L2_norm:', L2_norm_alpha_sav, L2_norm + endif + if (L2_norm < resid_reduction_threshold*L2_norm_alpha_sav .and. & alpha_accel + gamma_accel <= alpha_accel_max) then @@ -9893,7 +9900,6 @@ subroutine evaluate_accelerated_picard_2d(& ! ("Substantially" is defined by the factor resid_reduction_threshold < 1.) ! 
Save the latest values of the solver inputs - uvel_2d_sav = uvel_2d vvel_2d_sav = vvel_2d Auu_2d_sav = Auu_2d @@ -9901,28 +9907,29 @@ subroutine evaluate_accelerated_picard_2d(& Avu_2d_sav = Avu_2d Avv_2d_sav = Avv_2d beta_internal_sav = beta_internal + if (whichapprox == HO_APPROX_DIVA) then + beta_eff_x_sav = beta_eff_x + beta_eff_y_sav = beta_eff_y + omega_k_sav = omega_k + endif ! Increase alpha_accel and see if the residual keeps getting smaller. ! If not, we will back off to the saved values above. alpha_accel = alpha_accel + gamma_accel L2_norm_alpha_sav = L2_norm - if (verbose_picard .and. main_task) then - write(iulog,*) 'Keep going, alpha =', alpha_accel + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Increase alpha to', alpha_accel endif ! Since assembly_is_done = F, we now return to the start of the loop: - ! do while (.not.assembly_is_done) + ! do while (.not.assembly_is_done) elseif (L2_norm < L2_norm_alpha_sav) then ! The residual norm decreased only a little (or we have reached alpha_accel_max). ! Call it good and move on to the solver. - if (verbose_picard .and. main_task) then - write(iulog,*) 'Hold, alpha =', alpha_accel - endif - ! Save this velocity as the starting point for the next nonlinear iteration uvel_2d_old = uvel_2d vvel_2d_old = vvel_2d @@ -9934,8 +9941,12 @@ subroutine evaluate_accelerated_picard_2d(& ! proceed to the matrix solution assembly_is_done = .true. - else + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Hold alpha at', alpha_accel + write(iulog,*) ' Continue to matrix solver' + endif + else ! The residual is larger than the previous value. ! Switch back to the previously saved velocity and matrix with the lower residual. 
uvel_2d = uvel_2d_sav @@ -9945,16 +9956,16 @@ subroutine evaluate_accelerated_picard_2d(& Avu_2d = Avu_2d_sav Avv_2d = Avv_2d_sav beta_internal = beta_internal_sav + if (whichapprox == HO_APPROX_DIVA) then + beta_eff_x = beta_eff_x_sav + beta_eff_y = beta_eff_y_sav + omega_k = omega_k_sav + endif ! Save this velocity as the starting point for the next nonlinear iteration uvel_2d_old = uvel_2d vvel_2d_old = vvel_2d - if (verbose_picard .and. main_task) then - write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel - write(iulog,*) 'Continue to matrix solver' - endif - ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 L2_norm_alpha_sav = L2_norm_large @@ -9962,6 +9973,11 @@ subroutine evaluate_accelerated_picard_2d(& ! proceed to the matrix solution assembly_is_done = .true. + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Reduce alpha to', alpha_accel - gamma_accel + write(iulog,*) 'Continue to matrix solver' + endif + endif ! L2_norm of residual has reduced end subroutine evaluate_accelerated_picard_2d @@ -9969,6 +9985,7 @@ end subroutine evaluate_accelerated_picard_2d !**************************************************************************** subroutine evaluate_accelerated_picard_3d(& + rtest, & L2_norm, L2_norm_large, & L2_norm_alpha_sav, & alpha_accel, alpha_accel_max, & @@ -9984,6 +10001,8 @@ subroutine evaluate_accelerated_picard_3d(& beta_internal, beta_internal_sav, & assembly_is_done) + integer, intent(in) :: rtest ! rank for diagnostic point + real(dp), intent(in) :: & L2_norm, & ! latest value of L2 norm of residual L2_norm_large, & ! large value for re-initializing the L2 norm @@ -10035,8 +10054,8 @@ subroutine evaluate_accelerated_picard_3d(& alpha_accel = alpha_accel + gamma_accel L2_norm_alpha_sav = L2_norm - if (verbose_picard .and. main_task) then - write(iulog,*) 'Keep going, alpha =', alpha_accel + if (verbose_picard .and. 
this_rank == rtest) then + write(iulog,*) 'Increase alpha to', alpha_accel endif ! Since assembly_is_done = F, we now return to the start of the loop: @@ -10062,6 +10081,11 @@ ! proceed to the matrix solution assembly_is_done = .true. + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Hold alpha at', alpha_accel + write(iulog,*) ' Continue to matrix solver' + endif + else ! The residual is larger than the previous value. @@ -10078,11 +10102,6 @@ uvel_old = uvel vvel_old = vvel - if (verbose_picard .and. main_task) then - write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel - write(iulog,*) 'Continue to matrix solver' - endif - ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 L2_norm_alpha_sav = L2_norm_large @@ -10090,6 +10109,11 @@ ! proceed to the matrix solution assembly_is_done = .true. + if (verbose_picard .and. main_task) then + write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel + write(iulog,*) 'Continue to matrix solver' + endif + endif ! L2_norm of residual has reduced end subroutine evaluate_accelerated_picard_3d From e5d65807c2063d4cc9cbfaea1e1b438d90e1656d Mon Sep 17 00:00:00 2001 From: William Lipscomb Date: Mon, 2 Feb 2026 17:04:40 -0700 Subject: [PATCH 20/21] Added a modified Schoof basal sliding law This commit adds a new sliding law similar to the Schoof law but with a simpler algebraic form: 1/taub = 1/taub_p + 1/taub_c, where taub_p and taub_c denote the basal shear stress for a power law (taub ~ Cp * ub^(1/m)), a Coulomb law (taub ~ Cc * N). This is similar to the equation for the harmonic mean, which is weighted toward smaller values while ignoring large outliers. When taub_p is very large, taub asymptotes to taub_c, and when taub_c is very large, taub asymptotes to taub_p.
In this way, the new sliding law resembles the Schoof law. Unlike the Schoof law, the new law supports simultaneous inversion for both Cp and Cc. Where the ice is too thin (thick), both Cp and Cc are driven to higher (lower) values. For Antarctic tests, I chose max and min values of Cc and Cp such that taub_p will largely determine the net taub in the interior where N is very large, and taub_c is likely to determine taub in coastal regions where N is small. In these tests, the pattern of Cp is very similar to the patterns obtained with the Schoof law. But for a given value of Cp_min, the modified Schoof law has smaller thick biases (e.g., for the Siple coast), because it has the freedom to lower Cc as well as Cp where the bed needs to be slippery. Also, the modified Schoof law seems to have better convergence properties than the standard Schoof law, significantly reducing the cost of a typical AIS run. I suspect this is true because the Schoof law struggles with a few grid cells where taub is insensitive to velocity, whereas the modified law always has some velocity dependence. I also ran some Greenland tests where the modified Schoof law doesn't converge as well. I'll look at this later. To run the new law, the user should set which_ho_babc = 12 in the config file. I moved the old option 12 (which is not often used) to option 14, and removed the previous option 14 (which wasn't parallelized and hasn't been used in years). To invert for both Cp and Cc, simply set which_ho_coulomb_c = which_ho_powerlaw_c = 1. Note: Others have used this law, although I'm not aware that it's widely used. For example, Barnes and Gudmundsson (2022) tested this law in WAIS simulations; see their Eq. (6). They call it "regularised Coulomb sliding", citing the MISMIP+ paper by Cornford et al. (2020), but I didn't find this particular formulation in Cornford et al.
--- libglide/glide_setup.F90 | 50 +++++++++++----- libglide/glide_types.F90 | 22 +++---- libglissade/glissade_basal_traction.F90 | 77 ++++++++++++++++--------- libglissade/glissade_inversion.F90 | 49 +++++++++------- 4 files changed, 123 insertions(+), 75 deletions(-) diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90 index 28f73706..e25fcc3e 100644 --- a/libglide/glide_setup.F90 +++ b/libglide/glide_setup.F90 @@ -1129,9 +1129,9 @@ subroutine print_options(model) 'power law ', & 'Coulomb friction law w/ effec press ', & 'Schoof friction law ', & + 'modified Schoof friction law ', & 'min of Coulomb stress and power-law stress (Tsai)', & 'power law using effective pressure ', & - 'simple pattern of beta ', & 'till yield stress (Picard) ' /) character(len=*), dimension(0:1), parameter :: ho_whichbeta_limit = (/ & @@ -1829,8 +1829,8 @@ subroutine print_options(model) if (model%options%use_c_space_factor) then if (model%options%which_ho_babc == HO_BABC_COULOMB_FRICTION .or. & - model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_SCHOOF .or. & - model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_TSAI) then + model%options%which_ho_babc == HO_BABC_SCHOOF .or. & + model%options%which_ho_babc == HO_BABC_TSAI) then write(message,*) 'Multiplying beta by C_space_factor' call write_log(message) else @@ -1879,18 +1879,19 @@ subroutine print_options(model) ! Inversion options - ! Note: Inversion for Cp is currently supported for the Schoof sliding law, Tsai law, and basic power law + ! Note: Inversion for Cp is supported for the basic power law plus the Schoof and Tsai laws if (model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION .or. & model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION_BASIN) then - if (model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_SCHOOF .or. & - model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_TSAI .or. 
& - model%options%which_ho_babc == HO_BABC_POWERLAW) then + if (model%options%which_ho_babc == HO_BABC_POWERLAW .or. & + model%options%which_ho_babc == HO_BABC_SCHOOF .or. & + model%options%which_ho_babc == HO_BABC_MODIFIED_SCHOOF .or. & + model%options%which_ho_babc == HO_BABC_TSAI) then ! inversion for Cp is supported else call write_log('Error, Cp inversion is not supported for this basal BC option') write(message,*) 'Cp inversion is supported for these options: ', & - HO_BABC_COULOMB_POWERLAW_SCHOOF, HO_BABC_COULOMB_POWERLAW_TSAI, HO_BABC_POWERLAW + HO_BABC_POWERLAW, HO_BABC_SCHOOF, HO_BABC_MODIFIED_SCHOOF, HO_BABC_TSAI call write_log(message, GM_FATAL) endif endif @@ -1900,12 +1901,13 @@ subroutine print_options(model) model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION_BASIN) then if (model%options%which_ho_babc == HO_BABC_ZOET_IVERSON .or. & - model%options%which_ho_babc == HO_BABC_PSEUDO_PLASTIC) then + model%options%which_ho_babc == HO_BABC_PSEUDO_PLASTIC .or. & + model%options%which_ho_babc == HO_BABC_MODIFIED_SCHOOF) then ! inversion for Cc is supported else call write_log('Error, Cc inversion is not supported for this basal BC option') write(message,*) 'Cc inversion is supported for these options: ', & - HO_BABC_ZOET_IVERSON, HO_BABC_PSEUDO_PLASTIC + HO_BABC_ZOET_IVERSON, HO_BABC_PSEUDO_PLASTIC, HO_BABC_MODIFIED_SCHOOF call write_log(message, GM_FATAL) endif endif @@ -2774,7 +2776,7 @@ subroutine print_parameters(model) call write_log(message) write(message,*) 'bed bump wavelength for Coulomb friction law : ', model%basal_physics%coulomb_bump_wavelength call write_log(message) - elseif (model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_SCHOOF) then + elseif (model%options%which_ho_babc == HO_BABC_SCHOOF) then ! Note: The Schoof law typically uses a spatially variable powerlaw_c. ! If so, the value written here is just the initial value. 
write(message,*) 'Cc for Schoof Coulomb law : ', model%basal_physics%coulomb_c_const @@ -2787,7 +2789,24 @@ subroutine print_parameters(model) call write_log(message) write(message,*) 'm exponent for Schoof power law : ', model%basal_physics%powerlaw_m call write_log(message) - elseif (model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_TSAI) then + elseif (model%options%which_ho_babc == HO_BABC_MODIFIED_SCHOOF) then + ! Note: This law supports inversion for both Cc and Cp. + ! When inverting, the values here are just the initial values. + write(message,*) 'Cc for modified Schoof law : ', model%basal_physics%coulomb_c_const + call write_log(message) + write(message,*) 'Max Cc : ', model%basal_physics%coulomb_c_max + call write_log(message) + write(message,*) 'Min Cc : ', model%basal_physics%coulomb_c_min + call write_log(message) + write(message,*) 'Cp for modified Schoof law, Pa (m/yr)^(-1/3) : ', model%basal_physics%powerlaw_c_const + call write_log(message) + write(message,*) 'Max Cp : ', model%basal_physics%powerlaw_c_max + call write_log(message) + write(message,*) 'Min Cp : ', model%basal_physics%powerlaw_c_min + call write_log(message) + write(message,*) 'm exponent for power law : ', model%basal_physics%powerlaw_m + call write_log(message) + elseif (model%options%which_ho_babc == HO_BABC_TSAI) then ! Note: The Tsai law typically uses a spatially variable powerlaw_c. ! If so, the value written here is just the initial value. 
write(message,*) 'Cc for Tsai Coulomb law : ', model%basal_physics%coulomb_c_const @@ -2889,6 +2908,9 @@ subroutine print_parameters(model) write(message,*) 'coulomb_c min : ', & model%basal_physics%coulomb_c_min call write_log(message) + write(message,*) 'coulomb_c const : ', & + model%basal_physics%coulomb_c_const + call write_log(message) write(message,*) 'thickness scale (m) for C_c inversion : ', & model%inversion%babc_thck_scale call write_log(message) @@ -4027,12 +4049,12 @@ subroutine define_glide_restart_variables(model, model_id) ! basal sliding option select case (options%which_ho_babc) !WHL - Removed effecpress as a restart variable; it is recomputed with each velocity solve. -!! case (HO_BABC_POWERLAW, HO_BABC_COULOMB_FRICTION, HO_BABC_COULOMB_POWERLAW_SCHOOF) +!! case (HO_BABC_POWERLAW, HO_BABC_COULOMB_FRICTION, HO_BABC_SCHOOF) !! ! These friction laws need effective pressure !! call glide_add_to_restart_variable_list('effecpress', model_id) !! case(HO_BABC_COULOMB_POWERLAW_TSAI) !! call glide_add_to_restart_variable_list('effecpress', model_id) - case (HO_BABC_COULOMB_FRICTION, HO_BABC_COULOMB_POWERLAW_SCHOOF, HO_BABC_COULOMB_POWERLAW_TSAI) + case (HO_BABC_COULOMB_FRICTION, HO_BABC_SCHOOF, HO_BABC_TSAI) ! Note: These options compute beta internally, so it does not need to be in the restart file. if (options%use_c_space_factor) then ! c_space_factor needs to be in the restart file diff --git a/libglide/glide_types.F90 b/libglide/glide_types.F90 index 0731616b..e52641f1 100644 --- a/libglide/glide_types.F90 +++ b/libglide/glide_types.F90 @@ -249,7 +249,7 @@ module glide_types integer, parameter :: HO_THERMAL_AFTER_TRANSPORT = 1 integer, parameter :: HO_THERMAL_SPLIT_TIMESTEP = 2 - !TODO - Deprecate the last two options? Rarely if ever used. + !TODO - Deprecate some little-used options? 
integer, parameter :: HO_BABC_BETA_CONSTANT = 0 integer, parameter :: HO_BABC_BETA_BPMP = 1 integer, parameter :: HO_BABC_PSEUDO_PLASTIC = 2 @@ -261,10 +261,10 @@ module glide_types integer, parameter :: HO_BABC_ISHOMC = 8 integer, parameter :: HO_BABC_POWERLAW = 9 integer, parameter :: HO_BABC_COULOMB_FRICTION = 10 - integer, parameter :: HO_BABC_COULOMB_POWERLAW_SCHOOF = 11 - integer, parameter :: HO_BABC_COULOMB_POWERLAW_TSAI = 12 - integer, parameter :: HO_BABC_POWERLAW_EFFECPRESS = 13 - integer, parameter :: HO_BABC_SIMPLE = 14 + integer, parameter :: HO_BABC_SCHOOF = 11 + integer, parameter :: HO_BABC_MODIFIED_SCHOOF = 12 + integer, parameter :: HO_BABC_TSAI = 13 + integer, parameter :: HO_BABC_POWERLAW_EFFECPRESS = 14 integer, parameter :: HO_BABC_YIELD_PICARD = 15 integer, parameter :: HO_BETA_LIMIT_ABSOLUTE = 0 @@ -851,10 +851,10 @@ module glide_types !> \item[8] beta field as prescribed for ISMIP-HOM test C (serial only) !> \item[9] power law !> \item[10] Coulomb friction law using effective pressure, with flwa from lowest ice layer - !> \item[11] Coulomb friction law using effective pressure, with constant basal flwa - !> \item[12] basal stress is the minimum of Coulomb and power-law values, as in Tsai et al. (2015) - !> \item[13] power law using effective pressure - !> \item[14] simple hard-coded pattern (useful for debugging) + !> \item[11] Schoof law that blends powerlaw and Coulomb behavior + !> \item[12] modified version of the Schoof law + !> \item[13] basal stress is the minimum of Coulomb and power-law values, as in Tsai et al. (2015) + !> \item[14] power law using effective pressure !> \item[15] treat beta value as a till yield stress (in Pa) using Picard iteration !> \end{description} @@ -2204,7 +2204,7 @@ module glide_types !> Note: Defined on velocity grid, whereas temp and bpmp are on ice grid - ! Note: c_space_factor supported for which_ho_babc = HO_BABC_COULOMB_FRICTION, *COULOMB_POWERLAW_SCHOOF AND *COULOMB_POWERLAW_TSAI + ! 
Note: c_space_factor supported for which_ho_babc = HO_BABC_COULOMB_FRICTION, *SCHOOF AND *TSAI real(dp), dimension(:,:), pointer :: c_space_factor => null() !> spatial factor for basal shear stress (no dimension) real(dp), dimension(:,:), pointer :: c_space_factor_stag => null() !> spatial factor for basal shear stress on staggered grid @@ -2246,7 +2246,7 @@ module glide_types coulomb_c_hi => null(), & !> coulomb_c value at high bed elevation, topg >= bed_hi coulomb_c_lo => null() !> coulomb_c value at low bed elevation, topg <= bed_lo - ! parameters for power law, taub_b = C * u_b^(1/m); used for HO_BABC_COULOMB_POWERLAW_TSAI/SCHOOF + ! parameters for power law, taub_b = C * u_b^(1/m); used for HO_BABC_SCHOOF AND *_TSAI ! The default values are from Asay-Davis et al. (2016). ! The value of powerlaw_c suggested by Tsai et al. (2015) is 7.624d6 Pa m^(-1/3) s^(1/3). ! This value can be converted to CISM units by dividing by scyr^(1/3), to obtain 2.413d4 Pa m^(-1/3) yr^(1/3). diff --git a/libglissade/glissade_basal_traction.F90 b/libglissade/glissade_basal_traction.F90 index 3ffcca40..91ec33a1 100644 --- a/libglissade/glissade_basal_traction.F90 +++ b/libglissade/glissade_basal_traction.F90 @@ -157,9 +157,10 @@ subroutine glissade_calcbeta (& big_lambda, & ! bedrock characteristics flwa_basal_stag ! basal flwa interpolated to the staggered grid (Pa^{-n} yr^{-1}) - ! variables for Tsai et al. parameterization - real(dp) :: taub_powerlaw ! basal shear stress given by a power law as in Tsai et al. (2015) - real(dp) :: taub_coulomb ! basal shear stress given by Coulomb friction as in Tsai et al. (2015) + ! variables for mixed power/Coulomb laws + real(dp) :: taub ! basal shear stress + real(dp) :: taub_powerlaw ! basal shear stress given by a power law + real(dp) :: taub_coulomb ! basal shear stress given by Coulomb friction ! variables for pseudo-plastic law real(dp) :: q ! exponent for pseudo-plastic law (unitless) @@ -185,6 +186,7 @@ subroutine glissade_calcbeta (& ! 
Enforce a minimum speed to prevent beta from become very large when velocity is small. speed(:,:) = dsqrt(thisvel(:,:)**2 + othervel(:,:)**2 + smallnum**2) + !TODO - Should this be done only for powerlaw sliding, and not for ZI or PP? ! If beta_powerlaw_umax is set to a nonzero value, then limit the speed to this value. ! Note: The actual ice speed can be greater than umax. This is just a way of shutting off the feedback ! between beta and ice speed (beta down as speed up) when the ice speed is large. @@ -504,9 +506,9 @@ subroutine glissade_calcbeta (& beta = 1.0d8 end where - case(HO_BABC_COULOMB_POWERLAW_SCHOOF) + case(HO_BABC_SCHOOF) - ! Use the basal friction formulation of Schoof (2005), modified following Asay-Davis et al. (2016). + ! Use the basal friction formulation of Schoof (2005), formulated following Asay-Davis et al. (2016). ! This formulation uses a constant value of basal flwa, which allows several Coulomb parameters ! (lambda_max, m_max and flwa_basal) to be combined into a single parameter powerlaw_c, ! as in the Tsai power law below. @@ -576,16 +578,48 @@ subroutine glissade_calcbeta (& ! write(iulog,*) ew, ns, speed(ew,ns), basal_physics%effecpress_stag(ew,ns), beta(ew,ns), beta(ew,ns)*speed(ew,ns) ! enddo - case(HO_BABC_COULOMB_POWERLAW_TSAI) + case(HO_BABC_MODIFIED_SCHOOF) - ! Basal stress representation based on Tsai et al. (2015) - ! The basal stress is the minimum of two values: - ! (1) power law: tau_b = powerlaw_c * |u_b|^(1/powerlaw_m) - ! (2) Coulomb friction: tau_b = coulomb_c * N - ! N = effective pressure = rhoi*g*(H - H_f) - ! H_f = flotation thickness = (rhow/rhoi)*(eus-topg) - ! This value of N is obtained by setting p_ocean_penetration = 1.0 in the config file. - ! The other parameters (powerlaw_c, powerlaw_m and coulomb_c) can also be set in the config file. + ! Modified version of the Schoof law, with a simpler algebraic form + ! The basal stress is given by + ! 1/tau_b = 1/tau_p + 1/tau_c + !
where tau_p = powerlaw_c * |u_b|^(1/powerlaw_m) + ! tau_c = coulomb_c * N + ! N = effective pressure + ! Note: taub = 1/2 of the harmonic mean of tau_p and tau_c + + do ns = 1, nsn-1 + do ew = 1, ewn-1 + + taub_powerlaw = basal_physics%powerlaw_c(ew,ns) * speed(ew,ns)**(1.d0/basal_physics%powerlaw_m) + taub_coulomb = basal_physics%coulomb_c(ew,ns) * basal_physics%effecpress_stag(ew,ns) + + if (taub_coulomb > 0.0d0 .and. taub_powerlaw > 0.0d0) then + taub = 1.0d0 / (1.0d0/taub_powerlaw + 1.0d0/taub_coulomb) + elseif (taub_powerlaw > 0.0d0) then + taub = taub_powerlaw + elseif (taub_coulomb > 0.0d0) then + taub = taub_coulomb + endif + beta(ew,ns) = taub / speed(ew,ns) + + if (verbose_beta .and. present(rtest) .and. present(itest) .and. present(jtest)) then + if (this_rank == rtest .and. ew == itest .and. ns == jtest) then + write(iulog,*) ' ' + write(iulog,'(a38,3i4,4f12.3)') 'rank, i, j, tau_p, tau_c, tau_b, beta:', & + this_rank, ew, ns, taub_powerlaw, taub_coulomb, taub, beta(ew,ns) + endif + endif + enddo ! ew + enddo ! ns + + case(HO_BABC_TSAI) + + ! Basal stress representation based on Tsai et al. (2015) + ! The basal stress is the minimum of two values: + ! (1) power law: tau_p = powerlaw_c * |u_b|^(1/powerlaw_m) + ! (2) Coulomb friction: tau_c = coulomb_c * N + ! N = effective pressure do ns = 1, nsn-1 do ew = 1, ewn-1 @@ -607,18 +641,6 @@ subroutine glissade_calcbeta (& beta(:,:) = beta(:,:) * basal_physics%c_space_factor_stag(:,:) endif - case(HO_BABC_SIMPLE) ! simple pattern; also useful for debugging and test cases - ! (here, a strip of weak bed surrounded by stronger bed to simulate an ice stream) - - beta(:,:) = 1.d4 ! Pa yr/m - - !TODO - Change this loop to work in parallel (set beta on the global grid and scatter to local) - do ns = 5, nsn-5 - do ew = 1, ewn-1 - beta(ew,ns) = 100.d0 ! Pa yr/m - end do - end do - case default ! do nothing @@ -686,11 +708,10 @@ subroutine glissade_calcbeta (& !TODO - Move this halo update to a higher level? 
call staggered_parallel_halo(beta, parallel) - !WHL - debug if (verbose_beta .and. present(rtest) .and. present(itest) .and. present(jtest)) then if (this_rank == rtest) then ew = itest; ns = jtest - write(iulog,*) 'End of calcbeta, r, i, j, speed, f_ground, beta:', & + write(iulog,'(a48,3i4,3f12.5)') 'End of calcbeta, r, i, j, speed, f_ground, beta:', & rtest, ew, ns, speed(ew,ns), f_ground(ew,ns), beta(ew,ns) endif endif diff --git a/libglissade/glissade_inversion.F90 b/libglissade/glissade_inversion.F90 index d12f25ca..54c688b3 100644 --- a/libglissade/glissade_inversion.F90 +++ b/libglissade/glissade_inversion.F90 @@ -89,9 +89,6 @@ subroutine glissade_inversion_init(model) f_flotation, & ! flotation function (m) thck_obs ! observed ice thickness, derived from usrf_obs and topg - real(dp), dimension(model%general%ewn, model%general%nsn) :: & - coulomb_c_icegrid ! initial coulomb_c at cell centers based on masks - real(dp) :: h_obs, h_flotation, h_buff ! thck_obs, flotation thickness, and thck_flotation_buffer scaled to m real(dp) :: dh ! h_obs - h_flotation real(dp) :: dh_decimal ! decimal part remaining after subtracting the truncation of dh @@ -226,7 +223,6 @@ subroutine glissade_inversion_init(model) endif endif ! inversion for Cp, Cc or deltaT_ocn - !---------------------------------------------------------------------- ! If inverting for E, then make sure there is a target surface speed, velo_sfc_obs. !---------------------------------------------------------------------- @@ -254,6 +250,9 @@ subroutine glissade_inversion_init(model) !---------------------------------------------------------------------- ! computations specific to powerlaw_c (Cp) and coulomb_c (Cc) inversion + ! Note: Most sliding laws have inversion for Cp or Cc, but not both. + ! The modified Schoof law, however, supports inversion for both. + ! (This could be extended to the Schoof and Tsai laws.)
!---------------------------------------------------------------------- if (model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION .or. & @@ -268,7 +267,11 @@ subroutine glissade_inversion_init(model) call point_diag(model%basal_physics%powerlaw_c, 'init_inversion for powerlaw_c', itest, jtest, rtest, 7, 7) endif - elseif (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then + endif ! invert for powerlaw_c + + !TODO - Add distinct logic for powerlaw_c_inversion_basin? + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then if (parallel_is_zero(model%basal_physics%coulomb_c)) then ! initialize coulomb_c (for which we will invert) @@ -280,7 +283,9 @@ subroutine glissade_inversion_init(model) 'init_inversion for coulomb_c', itest, jtest, rtest, 7, 7) endif - elseif (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION_BASIN) then + endif ! invert for coulomb_c + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION_BASIN) then !TODO - Should this calculation be done in glissade_initialise? if (parallel_is_zero(model%basal_physics%coulomb_c_lo)) then @@ -311,10 +316,11 @@ subroutine glissade_inversion_init(model) 'init_inversion for basin-scale coulomb_c', itest, jtest, rtest, 7, 7) endif - endif + endif ! invert for coulomb_c_basin !---------------------------------------------------------------------- ! computations specific to flow_enhancement_factor inversion + ! TODO: Remove this inversion option? !---------------------------------------------------------------------- if (model%options%which_ho_flow_enhancement_factor == HO_FLOW_ENHANCEMENT_FACTOR_INVERSION) then @@ -557,7 +563,7 @@ subroutine glissade_inversion_solve(model) call staggered_parallel_halo(stag_thck, parallel) call staggered_parallel_halo(stag_dthck_dt, parallel) - ! Invert for powerlaw_c or coulomb_c + ! Invert for powerlaw_c and/or coulomb_c ! The logic is the same for each; only the max and min values and the in/out field are different. 
if ( model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION) then @@ -593,7 +599,9 @@ subroutine glissade_inversion_solve(model) call point_diag(model%basal_physics%powerlaw_c, 'New powerlaw_c', itest, jtest, rtest, 7, 7) endif - elseif ( model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then + endif ! invert for powerlaw_c + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then if (verbose_inversion .and. this_rank == rtest) then write(iulog,*) ' ' @@ -624,23 +632,18 @@ subroutine glissade_inversion_solve(model) if (verbose_inversion) then call point_diag(model%basal_physics%effecpress_stag, 'effecpress_stag', itest, jtest, rtest, 7, 7, '(f10.1)') - call point_diag(rhoi*grav*stag_thck, 'overburden', itest, jtest, rtest, 7, 7, '(f10.1)') - call point_diag((model%geometry%thck - thck_obs), 'thck - thck_obs (m)', itest, jtest, rtest, 7, 7) call point_diag(model%basal_physics%coulomb_c, 'New coulomb_c', itest, jtest, rtest, 7, 7, '(f10.5)') endif ! verbose_inversion - endif ! invert for powerlaw_c or coulomb_c + endif ! invert for coulomb_c - else ! do not invert for powerlaw_c or coulomb_c; just print optional diagnostics + elseif (verbose_inversion) then ! not inverting, but print some diagnostic values - if (verbose_inversion) then - call point_diag(model%geometry%f_ground, 'f_ground at vertices', itest, jtest, rtest, 7, 7, '(f10.4)') - call point_diag(model%basal_physics%powerlaw_c, 'powerlaw_c', itest, jtest, rtest, 7, 7, '(f10.2)') - call point_diag(model%basal_physics%coulomb_c, 'coulomb_c', itest, jtest, rtest, 7, 7, '(f10.4)') - endif - - endif ! invert for powerlaw_c or coulomb_c + call point_diag(model%geometry%f_ground, 'f_ground at vertices', itest, jtest, rtest, 7, 7, '(f10.4)') + call point_diag(model%basal_physics%powerlaw_c, 'powerlaw_c', itest, jtest, rtest, 7, 7, '(f10.2)') + call point_diag(model%basal_physics%coulomb_c, 'coulomb_c', itest, jtest, rtest, 7, 7, '(f10.4)') + endif ! 
If inverting for powerlaw_c or coulomb_c at the basin scale, then update it here @@ -1261,6 +1264,8 @@ subroutine invert_basal_friction(& if (f_ground(i,j) > 0.0d0) then ! ice is at least partly grounded ! Compute tendency terms based on the thickness target + !TODO: Try putting max(babc_thck_scale, stag_dthck_obs) in the denominator + ! Alex Robinson says this might improve convergence term_thck = -stag_dthck(i,j) / (babc_thck_scale*babc_timescale) term_dHdt = -stag_dthck_dt(i,j) * 2.0d0 / babc_thck_scale @@ -1330,8 +1335,8 @@ subroutine invert_basal_friction(& call point_diag(stag_dthck, 'stag_thck - stag_thck_obs', itest, jtest, rtest, 7, 7) call point_diag(stag_dthck_dt*scyr, 'stag_dthck_dt (m/yr)', itest, jtest, rtest, 7, 7) call point_diag(f_ground, 'f_ground', itest, jtest, rtest, 7, 7) - call point_diag(del2_logc, 'del2(logC)', itest, jtest, rtest, 7, 7, '(e12.3)') - call point_diag(logC, 'logC', itest, jtest, rtest, 7, 7) +!! call point_diag(del2_logc, 'del2(logC)', itest, jtest, rtest, 7, 7, '(e12.3)') +!! call point_diag(logC, 'logC', itest, jtest, rtest, 7, 7) call point_diag(dlogc, 'dlogC', itest, jtest, rtest, 7, 7, '(e12.3)') endif From 2edf195ea52b485a847fde65e556686c99d2b601 Mon Sep 17 00:00:00 2001 From: Katetc Date: Thu, 19 Feb 2026 17:34:25 -0700 Subject: [PATCH 21/21] Changes needed to compile with the GNU compiler --- libglimmer/glimmer_utils.F90 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libglimmer/glimmer_utils.F90 b/libglimmer/glimmer_utils.F90 index 00a3859e..45038da6 100644 --- a/libglimmer/glimmer_utils.F90 +++ b/libglimmer/glimmer_utils.F90 @@ -621,13 +621,13 @@ subroutine double_to_binary(& binary_number = transfer(x, binary_number) ! Get the sign bit (bit 64) - sign_bit = ishft(binary_number, -63) .and. 1 + sign_bit = IAND(ishft(binary_number, -63), INT(1, 8)) ! Get the exponent bits (bits 63–53) - exponent_bits = ishft(binary_number, -52) .and. 
Z'7FF' + exponent_bits = IAND(ishft(binary_number, -52), INT(Z'7FF', 8)) ! Extract mantissa (fraction) bits (bits 52–1) - mantissa_bits = binary_number .and. Z'FFFFFFFFFFFFF' + mantissa_bits = IAND(binary_number, INT(Z'FFFFFFFFFFFFF', 8)) if (present(binary_full)) binary_full = binary_number if (present(binary_sign)) binary_sign = sign_bit @@ -655,7 +655,7 @@ subroutine double_to_binary(& binary_str = concat(bin) if (verbose_binary) then write(iulog,*) 'Full 64-bit binary:' - write(iulog,*), ' ', binary_str + write(iulog,*) ' ', binary_str endif end subroutine double_to_binary