multiply_module.f90
! -*- mode: F90; mode: font-lock -*-
! ------------------------------------------------------------------------------
! $Id$
! ------------------------------------------------------------------------------
! Module multiply_module
! ------------------------------------------------------------------------------
! Code area 2: matrices
! ------------------------------------------------------------------------------
!!****h* Conquest/multiply_module *
!! NAME
!! multiply_module
!! PURPOSE
!! Contains subroutines related to matrix manipulation:
!! multiplication (plus local transpose, prefetch, kernel etc);
!! transposition; adding and copying different range matrices.
!! USES
!! GenComms, basic_types, comms_module, datatypes, global_module, matrix_comms_module,
!! matrix_module, mpi, trans_module
!! AUTHOR
!! D.R.Bowler
!! CREATION DATE
!! 25/11/99
!! MODIFICATION HISTORY
!! 7/12/99 by D.R.Bowler
!! Sizes of the _rem variables in mat_mult changed from mx_part*mx_bnab
!! to mx_part*mx_babs (subtle difference which was causing array overflow)
!! Also changed parameters passed for these sizes to end_part_comms etc
!! 27/01/00 by D.R.Bowler
!! Now uses new comms: all indices in the matrix type are stored by
!! partition, and point to different areas of an array; that array is
!! all that is sent when we transfer the indices
!! 20/06/2001 dave
!! Added ROBODoc headers, RCS Id and Log tags and cq_abort throughout
!! 24/04/2002 dave
!! Corrected ROBODoc headers and added RCS static Id tag, and USES field
!! 17/06/2002 dave
!! Tiny change to headers
!! 20/06/2002 drb
!! Added numbers to cq_abort calls in loc_trans to make them more meaningful
!! 20/06/2002 dave
!! Changed dimension of nreqs (away from 500 to actual maximum) and made atrans dynamically allocated rather
!! than automatically allocated.
!! 10:59, 13/02/2006 drb
!! Removed prune, graft routines (no longer used) and small tidying in preparation for variable NSF
!! 2008/02/06 08:26 dave
!! Changed for output to file not stdout
!! 2008/05/22 ast
!! Added timers
!! 2012/08/20 L.Tong
!! - Moved multiplication kernels to separate modules
!! multiply_kernel_VERSION.f90
!! 2014/09/15 18:30 lat
!! fixed call start/stop_timer to timer_module (not timer_stdclocks_module !)
!! SOURCE
!!
module multiply_module
use global_module, only: io_lun
use timer_module, only: start_timer, stop_timer
use timer_stdclocks_module, only: tmr_std_allocation
implicit none
!!***
contains
!!****f* multiply_module/mat_mult *
!!
!! NAME
!! mat_mult - sparse matrix multiplication
!! USAGE
!!
!! PURPOSE
!! Multiply together a and b and produce c.
!! There are two types of multiplication - "extension" where
!! Rc is close to Ra + Rb, and "reduction" where Rc is much
!! smaller. In practice, the latter case is actually done as
!! A = C.B.
!!
!! See the notes "A comprehensive scheme for matrix multiplication".
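!!
!! A minimal calling sketch (illustrative only; the matrix_mult
!! descriptor "mult_desc" and the array lengths are assumed to have
!! been set up elsewhere in Conquest before this routine is called):
!!
!!   call mat_mult(myid, a, lena, b, lenb, c, lenc, mult_desc)
!!
!! For mult_type==1 (extension) the product C = A.B is accumulated in
!! full; for mult_type==2 (reduction) the kernel works back into A,
!! consistent with the A = C.B form described above.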
!! INPUTS
!!
!!
!! USES
!!
!! AUTHOR
!! D.R.Bowler
!! CREATION DATE
!! 25/11/99
!! MODIFICATION HISTORY
!! 20/06/2001 dave
!! Added ROBODoc header and cq_abort and tidied
!! 20/06/2002 dave
!! Changed dimension of nreqs to be mx_nponn*mx_neighbour_nodes
!! (the actual maximum that will be required) and converted
!! atrans to being dynamically allocated rather than automatic.
!! 10:44, 06/03/2003 drb
!! Changed size of nreqs (temporary fix)
!! 09:19, 11/05/2005 dave
!! Removed unnecessary MPI_Wait (second one on nreqs)
!! 2008/05/22 ast
!! Added timers
!! 2012/08/20 L.Tong
!! - Removed conditionals for choosing GEMM or normal kernels.
!! The choice will be done during compilation time.
!! 2012/09/05 L.Tong
!! - Added timer for matrix multiplication
!! 2018/10/04 17:26 dave
!! Added counter for partitions received from different processes (nodes)
!! to use for tags to be compliant with MPI standard
!! SOURCE
!!
subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug)
! Module usage
use datatypes
use numbers
use global_module
use matrix_module
use basic_types
use matrix_comms_module
use comms_module
use GenComms, only: my_barrier, cq_abort, mtime
use mpi
use multiply_kernel
use timer_stdclocks_module, only: tmr_std_matmult
implicit none
! Mult type - contains indices, small groups etc
type(matrix_mult)::a_b_c
! Matrices
integer :: lena, lenb, lenc
real(double) :: a(lena)
real(double) :: b(lenb)
real(double) :: c(lenc)
! Node id - should be in a module ?
integer :: myid
integer, OPTIONAL :: debug
! Local variables
! This will be dynamically allocated/deallocated by the system
real(double), allocatable, dimension(:) :: atrans
integer :: lab_const
integer :: invdir,ierr,kpart,ind_part,ncover_yz,n_which,ipart,nnode
integer :: icall,n_cont,kpart_next,ind_partN,k_off
integer :: stat,ilen2,lenb_rem
! Remote variables to be allocated
integer(integ),allocatable :: ibpart_rem(:)
real(double),allocatable :: b_rem(:)
! Remote variables which will point to part_array
integer(integ),pointer :: nbnab_rem(:)
integer(integ),pointer :: ibseq_rem(:)
integer(integ),pointer :: ibind_rem(:)
integer(integ),pointer :: ib_nd_acc_rem(:)
integer(integ),pointer :: npxyz_rem(:)
integer(integ),pointer :: ibndimj_rem(:)
! Arrays for remote variables to point to
integer, target :: part_array(3*a_b_c%parts%mx_mem_grp+ &
5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs)
integer, dimension(:), allocatable :: nreqs
integer :: offset,sends,i,j
integer, dimension(MPI_STATUS_SIZE) :: mpi_stat
integer, allocatable, dimension(:) :: recv_part
real(double) :: t0,t1
logical :: new_partition
call start_timer(tmr_std_matmult)
call start_timer(tmr_std_allocation)
if(iprint_mat>3.AND.myid==0) t0 = mtime()
! Allocate memory for the elements
allocate(ibpart_rem(a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs),STAT=stat)
if(stat/=0) call cq_abort('mat_mult: error allocating ibpart_rem')
!allocate(atrans(a_b_c%amat(1)%length),STAT=stat)
allocate(atrans(lena),STAT=stat)
if(stat/=0) call cq_abort('mat_mult: error allocating atrans')
allocate(recv_part(0:a_b_c%comms%inode),STAT=stat)
if(stat/=0) call cq_abort('mat_mult: error allocating recv_part')
recv_part = zero
call stop_timer(tmr_std_allocation)
!write(io_lun,*) 'Sizes: ',a_b_c%comms%mx_dim3*a_b_c%comms%mx_dim2*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,&
! a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs,a_b_c%comms%mx_dim3*a_b_c%comms%mx_dim1* &
! a_b_c%prim%mx_iprim*a_b_c%amat(1)%mx_nab
sends = 0
do i=1,a_b_c%comms%inode
if(a_b_c%comms%np_send(i)>0) then
do j=1,a_b_c%comms%np_send(i)
sends = sends+1
end do
end if
end do
call start_timer(tmr_std_allocation)
allocate(nreqs(sends*2),STAT=stat)
if(stat/=0) call cq_abort("mat_mult: Error allocating nreqs",sends,stat)
call stop_timer(tmr_std_allocation)
sends = 0
! For type 1, form local transpose of my data (a_halo) and zero C-matrix
invdir=0
!write(io_lun,*) 'Local trans'
if(a_b_c%mult_type.eq.1) then
call loc_trans( a_b_c%ltrans, a_b_c%ahalo,a,lena,atrans,lena,invdir)
c = zero
else if(a_b_c%mult_type.eq.2) then
atrans = zero
end if
! Start the send procedure
!write(io_lun,*) 'Sending'
call Mquest_start_send(a_b_c,b,nreqs,myid,a_b_c%prim%mx_ngonn,sends)
!write(io_lun,*) 'Returned ',a_b_c%ahalo%np_in_halo,myid
ncover_yz=a_b_c%gcs%ncovery*a_b_c%gcs%ncoverz
!$omp parallel default(shared)
main_loop: do kpart = 1,a_b_c%ahalo%np_in_halo
!$omp master
icall=1
ind_part = a_b_c%ahalo%lab_hcell(kpart)
new_partition = .true.
! Check if this is a periodic image of the previous partition
if(kpart>1) then
if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-1)) then
new_partition = .false.
end if
end if
if(new_partition) then
! Get the data
ipart = a_b_c%parts%i_cc2seq(ind_part)
nnode = a_b_c%comms%neigh_node_list(kpart)
recv_part(nnode) = recv_part(nnode)+1
if(allocated(b_rem)) deallocate(b_rem)
if(a_b_c%parts%i_cc2node(ind_part)==myid+1) then
lenb_rem = a_b_c%bmat(ipart)%part_nd_nabs
else
lenb_rem = a_b_c%comms%ilen3rec(ipart,nnode)
end if
allocate(b_rem(lenb_rem))
part_array = 0
b_rem = zero
call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,&
n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,&
mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2)
! Now point the _rem variables at the appropriate parts of
! the array where we will receive the data
offset = 0
nbnab_rem => part_array(offset+1:offset+n_cont)
offset = offset+n_cont
ibind_rem => part_array(offset+1:offset+n_cont)
offset = offset+n_cont
ib_nd_acc_rem => part_array(offset+1:offset+n_cont)
offset = offset+n_cont
ibseq_rem => part_array(offset+1:offset+ilen2)
offset = offset+ilen2
npxyz_rem => part_array(offset+1:offset+3*ilen2)
offset = offset+3*ilen2
ibndimj_rem => part_array(offset+1:offset+ilen2)
if(offset+ilen2>3*a_b_c%parts%mx_mem_grp+ &
5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then
call cq_abort('mat_mult: error pointing to part_array ',kpart)
end if
! Create ibpart_rem
call end_part_comms(myid,n_cont,nbnab_rem,ibind_rem,npxyz_rem,&
ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz)
end if
k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs
! OMP master does not include an implicit barrier. We want master
! to be finished with comms before calling the multiply kernels
! hence the explicit barrier
!$omp end master
!$omp barrier
if(a_b_c%mult_type.eq.1) then ! C is full mult
call m_kern_max( k_off,kpart,ib_nd_acc_rem, ibind_rem,nbnab_rem,&
ibpart_rem,ibseq_rem,ibndimj_rem,&
atrans,b_rem,c,a_b_c%ahalo,a_b_c%chalo,a_b_c%ltrans,&
a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, &
a_b_c%prim%mx_iprim, lena, lenb_rem, lenc)
else if(a_b_c%mult_type.eq.2) then ! A is partial mult
call m_kern_min( k_off,kpart,ib_nd_acc_rem, ibind_rem,nbnab_rem,&
ibpart_rem,ibseq_rem,ibndimj_rem,&
atrans,b_rem,c,a_b_c%ahalo,a_b_c%chalo,a_b_c%ltrans,&
a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, &
a_b_c%prim%mx_iprim, lena, lenb_rem, lenc)
end if
!$omp barrier
end do main_loop
!$omp end parallel
call start_timer(tmr_std_allocation)
if(allocated(b_rem)) deallocate(b_rem)
call stop_timer(tmr_std_allocation)
! --------------------------------------------------
! End of the main loop over partitions in the A-halo
! --------------------------------------------------
! sends is only incremented in the MPI version of Mquest_start_send, so this works
!write(io_lun,*) 'Done loop ',myid,sends
if(sends>0) then
do i=1,sends
call MPI_Wait(nreqs(i),mpi_stat,ierr)
if(ierr/=0) call cq_abort("Error waiting for send to finish",i)
!write(io_lun,*) 'Send done ',i,myid
end do
end if
call my_barrier
call start_timer(tmr_std_allocation)
deallocate(nreqs,STAT=stat)
if(stat/=0) call cq_abort('mat_mult: error deallocating nreqs')
call stop_timer(tmr_std_allocation)
call my_barrier
! --- for type 2, make backward local transpose of A-matrix -----------
if(a_b_c%mult_type.eq.2) then
invdir=1
call loc_trans( a_b_c%ltrans, a_b_c%ahalo,a,lena,atrans,lena,invdir)
end if
call my_barrier
call start_timer(tmr_std_allocation)
deallocate(atrans,STAT=stat)
if(stat/=0) call cq_abort('mat_mult: error deallocating atrans')
call stop_timer(tmr_std_allocation)
call my_barrier
call start_timer(tmr_std_allocation)
deallocate(ibpart_rem,STAT=stat)
if(stat/=0) call cq_abort('mat_mult: error deallocating ibpart_rem')
deallocate(recv_part,STAT=stat)
if(stat/=0) call cq_abort('mat_mult: error deallocating recv_part')
call stop_timer(tmr_std_allocation)
call my_barrier
!deallocate(b_rem,STAT=stat)
!if(stat/=0) call cq_abort('mat_mult: error deallocating b_rem')
!call my_barrier
if(iprint_mat>3.AND.myid==0) then
t1 = mtime()
write(io_lun,fmt='(10x,a,f16.6)') 'mult time: ',t1-t0
end if
call stop_timer(tmr_std_matmult)
return
end subroutine mat_mult
!!***
!!****f* multiply_module/mat_trans *
!!
!! NAME
!! mat_trans - global transpose of a matrix
!! USAGE
!!
!! PURPOSE
!! Takes the global transpose of a matrix. Many of the preparatory
!! and subsidiary routines for this are in trans_module.
!! INPUTS
!!
!!
!! USES
!!
!! AUTHOR
!! D.R.Bowler
!! CREATION DATE
!! 25/11/99
!! MODIFICATION HISTORY
!! 20/06/2001 dave
!! Added ROBODoc header and cq_abort
!! SOURCE
!!
subroutine mat_trans(nnd,at_rem,at,ahalo,a,atrans,ltrans,mx_iprim)
use datatypes
use matrix_module
use comms_module
!MPI_Wait T. Miyazaki 2003/11/18
!use mpi
use GenComms, ONLY: cq_abort
implicit none
integer(integ) :: nnd,mx_iprim,dr
type(trans_remote) :: at_rem
type(matrix_trans) :: at
type(matrix_halo) :: ahalo
real(double) :: a(:),atrans(:)
real(double) :: ltrans(:)
!MPI_Wait T. Miyazaki 2003/11/18
!integer :: nreqs(1:1000), isend, ii, ierr=0
!integer, dimension(MPI_STATUS_SIZE) :: mpi_stat
! Form the local transpose of A
atrans = 0.0_double
ltrans = 0.0_double
!call loc_trans(at,ahalo,a,ltrans,0)
! Send out the data
!call send_trans_data(mx_iprim,at_rem,ltrans,nnd, isend, nreqs)
! Receive it and build AT
!call mat_tran(mx_iprim,at%mx_nab,nnd,at_rem,atrans,ltrans)
!MPI_Wait T. Miyazaki 2003/11/18
! do ii= 1, isend
! call MPI_Wait(nreqs(ii),mpi_stat,ierr)
! if(ierr/=0) call cq_abort("Error in mat_trans: waiting for send to finish",ii)
! end do
return
end subroutine mat_trans
!!***
!!****f* multiply_module/loc_trans *
!!
!! NAME
!! loc_trans - local (to processor) transpose
!! USAGE
!!
!! PURPOSE
!! Performs a local transpose, so that A_ik (indexed as primary set
!! atoms, halo atoms) becomes A_ki (halo atoms, primary set atoms).
!! This is used extensively by the matrix multiplication.
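!!
!! Illustrative index sketch (drawn directly from the loops in the
!! body below): for one nd1 x nd2 block starting at istart in a and at
!! tpos in atrans, the forward (invdir==0) direction performs a block
!! transpose element by element,
!!
!!   atrans((n1-1)*nd2 + n2 - 1 + tpos) = a((n2-1)*nd1 + n1 - 1 + istart)
!!
!! so entry (n1,n2) of the (primary, halo) block becomes entry (n2,n1)
!! of the (halo, primary) block; invdir==1 simply inverts the assignment.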
!! INPUTS
!!
!!
!! USES
!!
!! AUTHOR
!! D.R.Bowler
!! CREATION DATE
!! 25/11/99
!! MODIFICATION HISTORY
!! 20/06/2001 dave
!! Added ROBODoc header and cq_abort
!! 20/06/2002 drb
!! Added numbers to cq_abort calls so that they are more meaningful
!! SOURCE
!!
subroutine loc_trans(at,ahalo,a,lena,atrans,lenat,invdir)
use datatypes
use GenComms, ONLY: cq_abort
use maxima_module, ONLY: maxatomsproc
use matrix_module, ONLY: matrix_trans, matrix_halo
use basic_types, ONLY: primary_set
use primary_module, ONLY: bundle
implicit none
type(matrix_trans) :: at
type(matrix_halo) :: ahalo
integer :: invdir,irc,ierr,ni,nn
integer :: i,istart,n1,n2,tpos,nd1,nd2
! ---------------------------------------------------------------------
integer :: lena,lenat
real(double) :: a(lena)
real(double) :: atrans(lenat)
! --- invdir=0: normal -> retro; invdir=1: retro -> normal ------------
if(invdir.eq.0) then
do ni=1,ahalo%ni_in_halo
!write(io_lun,*) 'Atom no: ',ni
if(at%n_hnab(ni)<=0) then
call cq_abort('loc_trans: no. of neighbours < 1')
end if
if(at%i_beg(ni)+at%n_hnab(ni)-1>maxatomsproc*at%mx_nab) then
call cq_abort('loc_trans: i_prim arg. out of range',at%i_beg(ni)+at%n_hnab(ni)-1,maxatomsproc*at%mx_nab)
end if
tpos = at%i_nd_beg(ni)
nd2 = ahalo%ndimj(ni)
do nn=1,at%n_hnab(ni)
i=at%i_prim(at%i_beg(ni)+nn-1)
istart=ahalo%i_h2d((i-1)*ahalo%ni_in_halo+ni)
!if(istart>maxatomsproc*at%mx_nab) then
! call cq_abort('loc_trans: a arg. out of range',istart,maxatomsproc*at%mx_nab)
!end if
nd1 = ahalo%ndimi(i)
if(istart+nd1*nd2-1>lena) call cq_abort('loc_trans: a arg. out of range',istart+nd1*nd2-1,lena)
if(tpos+nd1*nd2-1>lenat) call cq_abort('loc_trans: at arg. out of range',tpos+nd1*nd2-1,lenat)
if(istart<=0) call cq_abort('loc_trans: a arg. out of range',istart)
if(tpos<=0) call cq_abort('loc_trans: at arg. out of range',tpos)
do n1=1,nd1
do n2=1,nd2
atrans((n1-1)*nd2+n2-1+tpos)=a((n2-1)*nd1+n1-1+istart)
end do
end do
tpos = tpos+nd2*nd1
end do
end do
elseif(invdir.eq.1) then
do ni=1,ahalo%ni_in_halo
if(at%n_hnab(ni)<=0) then
call cq_abort('loc_trans: no. of neighbours < 1')
end if
if(at%i_beg(ni)+at%n_hnab(ni)-1>maxatomsproc*at%mx_nab) then
call cq_abort('error loc_trans: i_prim arg. out of range',at%i_beg(ni)+at%n_hnab(ni)-1,maxatomsproc*at%mx_nab)
end if
tpos = at%i_nd_beg(ni)
nd2 = ahalo%ndimj(ni)
do nn=1,at%n_hnab(ni)
i=at%i_prim(at%i_beg(ni)+nn-1)
nd1 = ahalo%ndimi(i)
istart=ahalo%i_h2d((i-1)*ahalo%ni_in_halo+ni)
!if(istart>maxatomsproc*at%mx_nab) then
! call cq_abort('loc_trans: a arg. out of range',istart,maxatomsproc*at%mx_nab)
!end if
if(istart+nd1*nd2-1>lena) call cq_abort('loc_trans: a arg. out of range',istart+nd1*nd2-1,lena)
if(tpos+nd1*nd2-1>lenat) call cq_abort('loc_trans: at arg. out of range',tpos+nd1*nd2-1,lenat)
if(istart<=0) call cq_abort('loc_trans: a arg. out of range',istart)
if(tpos<=0) call cq_abort('loc_trans: at arg. out of range',tpos)
do n1=1,nd1
do n2=1,nd2
a((n2-1)*nd1+n1-1+istart)=atrans((n1-1)*nd2+n2-1+tpos)
end do
end do
tpos = tpos+nd2*nd1
end do
end do
else
call cq_abort('loc_trans: invdir out of range ',invdir)
end if
return
end subroutine loc_trans
!!***
!!****f* multiply_module/prefetch *
!!
!! NAME
!! prefetch
!! USAGE
!!
!! PURPOSE
!! Chooses whether to fetch locally or remotely. If we're hiding
!! comms behind calculations, this is called before the matrix multiplication.
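!!
!! In outline (a comment-only paraphrase of the body below; no new
!! interfaces are implied):
!!
!!   inode = parts%i_cc2node(ind_part)
!!   if(inode == myid+1) call Mquest_get_local(...) ! local: copy from b, icall set to 0
!!   if(icall == 1)      call Mquest_get(...)       ! remote: receive the posted MPI send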
!! INPUTS
!!
!!
!! USES
!!
!! AUTHOR
!! D.R.Bowler
!! CREATION DATE
!! 25/11/99
!! MODIFICATION HISTORY
!! 20/06/2001 dave
!! Added ROBODoc header
!! 2018/10/04 17:31 dave
!! Adding tag for MPI compliance
!! SOURCE
!!
subroutine prefetch(this_part,ahalo,a_b_c,bmat,icall,&
n_cont,bind_rem,bind,b_rem,lenb_rem,b,myid,ilen2,mx_mpp, &
parts,prim,gcs,tag)
! Module usage
use datatypes
use matrix_module
use basic_types
use matrix_comms_module
use comms_module
implicit none
! Passed variables
! First, the small group variables
type(group_set) :: parts
type(primary_set) :: prim
type(cover_set) :: gcs
integer :: mx_mpp
integer :: this_part,icall,n_cont,myid,ilen2
type(matrix), dimension(:) :: bmat
type(matrix_halo) :: ahalo
type(comms_data) :: a_b_c
integer(integ), dimension(:) :: bind_rem,bind
integer :: lenb_rem, tag
real(double), dimension(lenb_rem) :: b_rem
real(double) :: b(:)
! Local variables
integer :: ncover_yz,ind_part,iskip,ind_last
integer :: inode,ipart,nnode
ind_part = ahalo%lab_hcell(this_part)
n_cont=parts%nm_group(ind_part)
ipart = parts%i_cc2seq(ind_part)
inode = parts%i_cc2node(ind_part)
nnode = a_b_c%neigh_node_list(this_part)
if(inode.eq.myid+1) then ! If this is local, then copy
icall = 0
ncover_yz=gcs%ncovery*gcs%ncoverz
ilen2 = bmat(ipart)%part_nabs
call Mquest_get_local(ipart,&
bind_rem,b_rem,lenb_rem,bind,bmat,&
ind_part,b,myid)
end if
if(icall.eq.1) then ! Else fetch the data
ilen2 = a_b_c%ilen2rec(ipart,nnode)
call Mquest_get( prim%mx_ngonn, &
a_b_c%ilen2rec(ipart,nnode),&
a_b_c%ilen3rec(ipart,nnode),&
n_cont,inode,ipart,myid,&
bind_rem,b_rem,lenb_rem,bind,b,&
a_b_c%istart(ipart,nnode), &
bmat(1)%mx_abs,parts%mx_mem_grp,tag)
end if
return
end subroutine prefetch
!!***
!!****f* multiply_module/check_mkm *
!!
!! NAME
!! check_mkm
!! USAGE
!!
!! PURPOSE
!! Checks dimensions used by m_kern_max/min
!! INPUTS
!!
!!
!! USES
!!
!! AUTHOR
!! D.R.Bowler
!! CREATION DATE
!! 25/11/99
!! MODIFICATION HISTORY
!! 20/06/2001 dave
!! Added ROBODoc header
!! SOURCE
!!
subroutine check_mkm(kpart,nah_part,kseq,kbeg,ibind_rem,&
ibpart_rem,nbnab_rem,ibseq_rem,jchbeg,k_off,icall,&
mx_apart,mx_pcover,mx_ahalo,mx_part,mx_babs,mx_icover)
use datatypes
implicit none
integer :: mx_apart,mx_pcover,mx_icover
integer :: mx_ahalo,mx_part,mx_babs
integer :: nbindbeg,kk,jpart,j,jseq,k_off
integer :: np_in_ahalo,kpart,k,k_in_part,icall
integer :: kbeg(mx_apart),nah_part(mx_apart)
integer :: kseq(mx_ahalo)
integer :: jchbeg(mx_pcover)
integer(integ) :: ibind_rem(mx_part)
integer(integ) :: ibpart_rem(mx_part*mx_babs),nbnab_rem(mx_part)
integer(integ) :: ibseq_rem(mx_part*mx_babs)
do k=1,nah_part(kpart)
k_in_part=kseq(kbeg(kpart)+k-1)
nbindbeg=ibind_rem(k_in_part)
do j=1,nbnab_rem(k_in_part)
jpart=ibpart_rem(nbindbeg+j-1)+k_off
if(jpart.gt.mx_pcover) then
write(io_lun,911) jpart,mx_pcover
911 format(//'error mat_mult: jchbeg arg. out of range',i12,i6)
icall = 0
return
end if
jseq=ibseq_rem(nbindbeg+j-1)
if(jchbeg(jpart)+jseq-1.gt.mx_icover) then
write(io_lun,912) jchbeg(jpart)+jseq-1,mx_icover
912 format(//'error mat_mult: j_halo arg. out of range',i10,i6)
icall = 0
return
end if
end do
end do
if(icall==0) write(io_lun,*) 'ERROR!!!!!!!!!!!!!!!!!!!!!!!!!'
return
end subroutine check_mkm
!!***
!!****f* multiply_module/mat_tran *
!!
!! NAME
!! mat_tran
!! USAGE
!!
!! PURPOSE
!! Fetches local transpose from remote nodes
!! and puts in place on my node. The nodes from which data are fetched
!! are the halo-nodes of my node. I loop over my halo-nodes,
!! and from each one I fetch all the local-transpose data in one go.
!! I then pass through all these data and use the index
!! table itran_addr() to send it to the right address. The index
!! itran_addr() is passed to mat_tran in the argument list, and
!! should have been created by sbrt ind_tran, which is called
!! once only each time atoms are moved.
!! INPUTS
!! itran_addr(nt): address on my node to which to start putting
!! global-transpose data for a given atom pair number nn fetched
!! from halo-node nr; in practice, nt=i_pair_indaddr(nr)+nn-1
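!!
!! Schematic of the main loop (a comment-only sketch of the body
!! below; no new interfaces are implied):
!!
!!   do nr=1,data%n_rem_node                 ! loop over my halo nodes
!!      call fetch_trans_data(a_store,...)   ! one fetch per halo node
!!      do nn=1,data%n_pair(nr)              ! loop over atom pairs
!!         ! copy pairs(nr)%submat(nn) values into atrans, starting at
!!         ! data%itran_addr(ic+nn-1)
!!      end do
!!   end do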
!! USES
!!
!! AUTHOR
!! D.R.Bowler
!! CREATION DATE
!! 25/11/99
!! MODIFICATION HISTORY
!! 20/06/2001 dave
!! Added ROBODoc header and cq_abort
!! 2006/09/07 11:49 dave
!! Bug fix for variable function numbers: length now found
!! explicitly from pairs%submat
!! SOURCE
!!
subroutine mat_tran(mx_iprim,mynode,data,atrans,ltrans,pairs,pairsize)
use datatypes
use matrix_module
use trans_module
use comms_module
use GenComms, ONLY: cq_abort
implicit none
! Passed variables
integer(integ) :: mx_iprim
integer(integ) :: mynode,pairsize
real(double) :: ltrans(:)
real(double) :: atrans(:)
type(trans_remote) :: data
type(pair_data) :: pairs(:)
! Local variables
real(double), dimension(:), allocatable :: a_store
integer(integ) :: nr,nnd_rem,tag,n,ic,nn,n1,n2,stat,ierr
integer :: posn, count,lenat,lensub
lenat = size(atrans)
do nr=1,data%n_rem_node ! Loop over halo nodes
lensub = 0
do nn=1,data%n_pair(nr)
lensub = lensub + pairs(nr)%submat(nn)
end do
allocate(a_store(lensub),STAT=stat)
if(stat/=0) call cq_abort("Error allocating a_store in mat_tran ",pairs(nr)%length)
call fetch_trans_data(a_store,ltrans,data,lensub,nr,mynode,mx_iprim)
ic=data%i_pair_addr(nr)
posn = 0
if(data%n_pair(nr)>data%mx_pair) call cq_abort('Pair error: ',data%n_pair(nr),data%mx_pair)
do nn=1,data%n_pair(nr) ! Put it in place
count = pairs(nr)%submat(nn)
if(data%itran_addr(ic+nn-1)+count-1>lenat) then
write(*,*) nr,ic,nn,count,data%itran_addr(ic+nn-1),lenat
call cq_abort('mat_tran: itran_addr wrong ', data%itran_addr(ic+nn-1)+count-1,lenat)
end if
atrans(data%itran_addr(ic+nn-1):data%itran_addr(ic+nn-1)+count-1)=a_store(posn+1:posn+count)
posn = posn+count
end do
deallocate(a_store)
end do
return
end subroutine mat_tran
!!***
!!****f* multiply_module/matrix_add *
!!
!! NAME
!! matrix_add - adds different range matrices
!! USAGE
!!
!! PURPOSE
!! Adds two different range matrices and returns the
!! result in the first, so A = alpha*A + beta*B
!!
!! Changes the order of arguments passed to addscan so
!! that the shorter range matrix is always first - but
!! in such a way that the result is put into A (so if
!! A is first, dr = 0, and if A is second, dr = 1)
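!!
!! Hedged usage sketch (the matrix arrays and index structures are
!! assumed to have been built elsewhere; only the call shape is shown):
!!
!!   ! A := 2.0*A + 0.5*B over the primary set prim on this processor
!!   call matrix_add(2.0_double, a, lena, amat, 0.5_double, b, lenb, bmat, prim, myid)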
!! INPUTS
!!
!!
!! USES
!! addscan
!! AUTHOR
!! D.R.Bowler
!! CREATION DATE
!! 25/11/99
!! MODIFICATION HISTORY
!! 20/06/2001 dave
!! Added ROBODoc header
!! SOURCE
!!
subroutine matrix_add(alpha,a,lena,amat,beta,b,lenb,bmat,prim,myid)
use datatypes
use matrix_module,ONLY: matrix
use basic_types, ONLY: primary_set
implicit none
! Passed variables
type(matrix) :: amat(:),bmat(:)
type(primary_set) :: prim
integer :: lena, lenb
real(double) :: a(lena),b(lenb)
real(double) :: alpha,beta
integer :: myid
! Local variables
integer :: nn,j,nnd
nnd = myid+1
do nn=1,prim%groups_on_node
if(prim%nm_nodgroup(nn)>0) then
do j=1,prim%nm_nodgroup(nn)
if(amat(nn)%n_nab(j)<=bmat(nn)%n_nab(j)) then
call addscan(prim,nn,j,amat,bmat,a,lena,b,lenb,alpha,beta,myid,0)
else
call addscan(prim,nn,j,bmat,amat,b,lenb,a,lena,beta,alpha,myid,1)
end if
end do ! j=1,nc_nodpart
end if ! if(nc_nodpart(nn)>0)
end do ! nn=1,np_on_node
end subroutine matrix_add
!!***
!!****f* multiply_module/addscan *
!!
!! NAME
!! addscan
!! USAGE
!!
!! PURPOSE
!! Scans through the neighbours of A and B and either adds b to a (dr==0)
!! or a to b (dr==1). Assumes that A is shorter than B.
!!
!! N.B. This only works for the jth atom in the nnth partition of this
!! processor's primary set - it's called many times by matrix_add
!!
!! The algorithm is as follows. Start at the beginning of both matrices'
!! lists, and increment B list until it's greater than or equal to A list.
!! If equal, check ALL indices, and if they agree, add and continue on.
!! If B list is greater than A, then increment A and continue on.
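!!
!! Schematic of the scan (illustrative pseudocode in comment form; the
!! real loop is in the body below):
!!
!!   do nb = 1, amat(nn)%n_nab(j)            ! neighbours of atom j in A
!!      advance the B pointer while B's partition index < A's
!!      if B's (partition, sequence) pair matches A's then
!!         add the nd1*nd2 block: into A if dr==0, into B if dr==1
!!      else
!!         cycle                             ! move on to the next A neighbour
!!      end if
!!   end do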
!! INPUTS
!!
!!
!! USES
!!
!! AUTHOR
!! D.R.Bowler
!! CREATION DATE
!! 25/11/99
!! MODIFICATION HISTORY
!! 20/06/2001 dave
!! Added ROBODoc header
!! SOURCE
!!
subroutine addscan(prim,nn,j,amat,bmat,a,lena,b,lenb,alpha,beta,myid,dr)
use datatypes
use matrix_module
use basic_types
use GenComms, ONLY: cq_abort
implicit none
! Passed variables
type(matrix) :: amat(:),bmat(:)
type(primary_set) :: prim
integer :: lena, lenb
real(double) :: a(lena),b(lenb)
real(double) :: alpha,beta
integer :: nn,j,myid,aposn,bposn,dr
! Local variables
integer :: marker,ni,nb,nnd,apos_arr,nd1,nd2, bsize
nnd = myid+1
marker = bmat(nn)%i_acc(j) ! Indexes b
aposn=amat(nn)%nd_offset+amat(nn)%i_nd_acc(j) ! Posn in a
bposn=bmat(nn)%nd_offset+bmat(nn)%i_nd_acc(j) ! Posn in b
ni = prim%nm_nodbeg(nn)+j-1 ! Primary atom
nd1 = amat(nn)%ndimi(j)
if(amat(nn)%n_nab(j)>0) then ! Scan over neighbours of j
do nb=1,amat(nn)%n_nab(j)
apos_arr = amat(nn)%i_acc(j)+nb-1
nd2=amat(nn)%ndimj(apos_arr)
bsize = nd1*nd2
! Increment B list until not greater than A
do while(bmat(nn)%i_part(marker)<amat(nn)%i_part(apos_arr))
bposn = bposn + nd1*bmat(nn)%ndimj(marker)
marker = marker+1
end do
! Check against A
if(bmat(nn)%i_part(marker)>amat(nn)%i_part(apos_arr)) then
cycle ! We've gone past present neighbour - increment A
else if(bmat(nn)%i_part(marker)==amat(nn)%i_part(apos_arr)) then
! Check through seq and part nos
do while(bmat(nn)%i_seq(marker)<amat(nn)%i_seq(apos_arr).AND.&
bmat(nn)%i_part(marker)==amat(nn)%i_part(apos_arr))
bposn = bposn + nd1*bmat(nn)%ndimj(marker)
marker = marker+1
end do
! If either partition no or seq no is too big, increment A
if(bmat(nn)%i_part(marker)>amat(nn)%i_part(apos_arr)) cycle
if(bmat(nn)%i_seq(marker)>amat(nn)%i_seq(apos_arr)) then
cycle
else if(bmat(nn)%i_part(marker)==amat(nn)%i_part(apos_arr) &
.AND.bmat(nn)%i_seq(marker)==amat(nn)%i_seq(apos_arr)) &
then
! We have found a match - decide which matrix to store result in
if(dr==0) then
a(aposn:aposn+bsize-1)= &
alpha*a(aposn:aposn+bsize-1)+beta*b(bposn:bposn+bsize-1)
else if(dr==1) then
b(bposn:bposn+bsize-1)= &
beta*b(bposn:bposn+bsize-1)+alpha*a(aposn:aposn+bsize-1)
end if
marker=marker+1
bposn = bposn + bsize
if(bposn-1>lenb) call cq_abort("Length error for B in addscan: ",bposn-1,lenb)
end if
end if
aposn = aposn+bsize
if(aposn-1>lena) call cq_abort("Length error for A in addscan: ",aposn-1,lena)
end do ! nb=1,amat(nn)%n_nab(j)
end if ! if(amat(nn)%n_nab(j)>0)
end subroutine addscan
!!***
end module multiply_module