FrontISTR  5.9.0
Large-scale structural analysis program with finit element method
hecmw_precond_SSOR_33.F90
Go to the documentation of this file.
1 !-------------------------------------------------------------------------------
2 ! Copyright (c) 2019 FrontISTR Commons
3 ! This software is released under the MIT License, see LICENSE.txt
4 !-------------------------------------------------------------------------------
5 
6 !C
7 !C***
8 !C*** module hecmw_precond_SSOR_33
9 !C***
10 !C
12  use hecmw_util
17 #ifndef _OPENACC
18  !$ use omp_lib
19 #endif
20 
21  private
22 
26 
27  integer(kind=kint) :: N
28  real(kind=kreal), pointer :: d(:) => null()
29  real(kind=kreal), pointer :: al(:) => null()
30  real(kind=kreal), pointer :: au(:) => null()
31  integer(kind=kint), pointer :: indexL(:) => null()
32  integer(kind=kint), pointer :: indexU(:) => null()
33  integer(kind=kint), pointer :: itemL(:) => null()
34  integer(kind=kint), pointer :: itemU(:) => null()
35  real(kind=kreal), pointer :: alu(:) => null()
36 
37  integer(kind=kint) :: NColor
38  integer(kind=kint), pointer :: COLORindex(:) => null()
39  integer(kind=kint), pointer :: perm(:) => null()
40  integer(kind=kint), pointer :: iperm(:) => null()
41 
42  logical, save :: isFirst = .true.
43 
44  logical, save :: INITIALIZED = .false.
45 
46  ! for tuning
47  integer(kind=kint), parameter :: numOfBlockPerThread = 100
48  integer(kind=kint), save :: numOfThread = 1, numofblock
49  integer(kind=kint), save, allocatable :: icToBlockIndex(:)
50  integer(kind=kint), save, allocatable :: blockIndexToColorIndex(:)
51  integer(kind=kint), save :: sectorCacheSize0, sectorCacheSize1
52 
53  integer(kind=kint), parameter :: DEBUG = 0
54 
55 contains
56 
57  subroutine hecmw_precond_ssor_33_setup(hecMAT)
58  implicit none
59  type(hecmwst_matrix), intent(inout) :: hecmat
60  integer(kind=kint ) :: npl, npu
61  integer(kind=kint ) :: ncolor_in
62  real (kind=kreal) :: sigma_diag
63  real (kind=kreal) :: alutmp(3,3), pw(3)
64  integer(kind=kint ) :: ii, i, j, k
65  integer(kind=kint ) :: nthreads = 1
66  integer(kind=kint ), allocatable :: perm_tmp(:)
67  real (kind=kreal) :: t0
68 
69  if (debug >= 1) then
70  t0 = hecmw_wtime()
71  write(*,*) 'DEBUG: SSOR setup start', hecmw_wtime()-t0
72  endif
73 
74  if (initialized) then
75  if (hecmat%Iarray(98) == 1) then ! need symbolic and numerical setup
76  call hecmw_precond_ssor_33_clear(hecmat)
77  else if (hecmat%Iarray(97) == 1) then ! need numerical setup only
78  call hecmw_precond_ssor_33_clear(hecmat) ! TEMPORARY
79  else
80  return
81  endif
82  endif
83 
84 #ifndef _OPENACC
85  !$ nthreads = omp_get_max_threads()
86 #endif
87 
88  n = hecmat%N
89  ! N = hecMAT%NP
90  ncolor_in = hecmw_mat_get_ncolor_in(hecmat)
91  sigma_diag = hecmw_mat_get_sigma_diag(hecmat)
92 
93 #ifdef _OPENACC
94  allocate(colorindex(0:n), perm_tmp(n), perm(n), iperm(n))
95  call hecmw_matrix_ordering_rcm(n, hecmat%indexL, hecmat%itemL, &
96  hecmat%indexU, hecmat%itemU, perm_tmp, iperm)
97  if (debug >= 1) write(*,*) 'DEBUG: RCM ordering done', hecmw_wtime()-t0
98  call hecmw_matrix_ordering_mc(n, hecmat%indexL, hecmat%itemL, &
99  hecmat%indexU, hecmat%itemU, perm_tmp, &
100  ncolor_in, ncolor, colorindex, perm, iperm)
101  if (debug >= 1) write(*,*) 'DEBUG: MC ordering done', hecmw_wtime()-t0
102  deallocate(perm_tmp)
103 
104  !call write_debug_info
105 #else
106  if (nthreads == 1) then
107  ncolor = 1
108  allocate(colorindex(0:1), perm(n), iperm(n))
109  colorindex(0) = 0
110  colorindex(1) = n
111  do i=1,n
112  perm(i) = i
113  iperm(i) = i
114  end do
115  else
116  allocate(colorindex(0:n), perm_tmp(n), perm(n), iperm(n))
117  call hecmw_matrix_ordering_rcm(n, hecmat%indexL, hecmat%itemL, &
118  hecmat%indexU, hecmat%itemU, perm_tmp, iperm)
119  if (debug >= 1) write(*,*) 'DEBUG: RCM ordering done', hecmw_wtime()-t0
120  call hecmw_matrix_ordering_mc(n, hecmat%indexL, hecmat%itemL, &
121  hecmat%indexU, hecmat%itemU, perm_tmp, &
122  ncolor_in, ncolor, colorindex, perm, iperm)
123  if (debug >= 1) write(*,*) 'DEBUG: MC ordering done', hecmw_wtime()-t0
124  deallocate(perm_tmp)
125 
126  !call write_debug_info
127  endif
128 #endif
129 
130  npl = 0
131  do i=1,n
132  do j=hecmat%indexU(i-1)+1,hecmat%indexU(i)
133  if( hecmat%itemU(j) > n ) exit
134  npl = npl + 1
135  enddo
136  enddo
137  npl = max(hecmat%indexL(n),npl)
138  npu = hecmat%indexU(n)
139  allocate(indexl(0:n), indexu(0:n), iteml(npl), itemu(npu))
140  call hecmw_matrix_reorder_profile(n, perm, iperm, &
141  hecmat%indexL, hecmat%indexU, hecmat%itemL, hecmat%itemU, &
142  indexl, indexu, iteml, itemu)
143  if (debug >= 1) write(*,*) 'DEBUG: reordering profile done', hecmw_wtime()-t0
144 
145  !call check_ordering
146 
147  allocate(d(9*n), al(9*npl), au(9*npu))
148  call hecmw_matrix_reorder_values(n, 3, perm, iperm, &
149  hecmat%indexL, hecmat%indexU, hecmat%itemL, hecmat%itemU, &
150  hecmat%AL, hecmat%AU, hecmat%D, &
151  indexl, indexu, iteml, itemu, al, au, d)
152  if (debug >= 1) write(*,*) 'DEBUG: reordering values done', hecmw_wtime()-t0
153 
154  call hecmw_matrix_reorder_renum_item(n, perm, indexl, iteml)
155  call hecmw_matrix_reorder_renum_item(n, perm, indexu, itemu)
156 
157  allocate(alu(9*n))
158  alu = 0.d0
159 
160  do ii= 1, 9*n
161  alu(ii) = d(ii)
162  enddo
163 
164 #ifdef _OPENACC
165  !$acc kernels
166  !$acc loop independent private(ALUtmp,PW)
167 #else
168  !$omp parallel default(none),private(ii,ALUtmp,k,i,j,PW),shared(N,ALU,SIGMA_DIAG)
169  !$omp do
170 #endif
171  do ii= 1, n
172  alutmp(1,1)= alu(9*ii-8) * sigma_diag
173  alutmp(1,2)= alu(9*ii-7)
174  alutmp(1,3)= alu(9*ii-6)
175  alutmp(2,1)= alu(9*ii-5)
176  alutmp(2,2)= alu(9*ii-4) * sigma_diag
177  alutmp(2,3)= alu(9*ii-3)
178  alutmp(3,1)= alu(9*ii-2)
179  alutmp(3,2)= alu(9*ii-1)
180  alutmp(3,3)= alu(9*ii ) * sigma_diag
181  do k= 1, 3
182  alutmp(k,k)= 1.d0/alutmp(k,k)
183  do i= k+1, 3
184  alutmp(i,k)= alutmp(i,k) * alutmp(k,k)
185  do j= k+1, 3
186  pw(j)= alutmp(i,j) - alutmp(i,k)*alutmp(k,j)
187  enddo
188  do j= k+1, 3
189  alutmp(i,j)= pw(j)
190  enddo
191  enddo
192  enddo
193  alu(9*ii-8)= alutmp(1,1)
194  alu(9*ii-7)= alutmp(1,2)
195  alu(9*ii-6)= alutmp(1,3)
196  alu(9*ii-5)= alutmp(2,1)
197  alu(9*ii-4)= alutmp(2,2)
198  alu(9*ii-3)= alutmp(2,3)
199  alu(9*ii-2)= alutmp(3,1)
200  alu(9*ii-1)= alutmp(3,2)
201  alu(9*ii )= alutmp(3,3)
202  enddo
203 #ifdef _OPENACC
204  !$acc end kernels
205 #else
206  !$omp end do
207  !$omp end parallel
208 #endif
209 
210  isfirst = .true.
211 
212  initialized = .true.
213  hecmat%Iarray(98) = 0 ! symbolic setup done
214  hecmat%Iarray(97) = 0 ! numerical setup done
215 
216  if (debug >= 1) write(*,*) 'DEBUG: SSOR setup done', hecmw_wtime()-t0
217 
218  end subroutine hecmw_precond_ssor_33_setup
219 
220  subroutine setup_tuning_parameters
221  use hecmw_tuning_fx
222  implicit none
223  integer(kind=kint) :: blockindex, elementcount, numofelement, ii
224  real(kind=kreal) :: numofelementperblock
225  integer(kind=kint) :: my_rank
226  integer(kind=kint) :: ic, i
227  if (debug >= 1) write(*,*) 'DEBUG: setting up tuning parameters for SSOR'
228 #ifndef _OPENACC
229  !$ numOfThread = omp_get_max_threads()
230 #endif
231 
232  numofblock = numofthread * numofblockperthread
233  if (allocated(ictoblockindex)) deallocate(ictoblockindex)
234  if (allocated(blockindextocolorindex)) deallocate(blockindextocolorindex)
235  allocate (ictoblockindex(0:ncolor), &
236  blockindextocolorindex(0:numofblock + ncolor))
237  numofelement = n + indexl(n) + indexu(n)
238  numofelementperblock = dble(numofelement) / numofblock
239  blockindex = 0
240  ictoblockindex = -1
241  ictoblockindex(0) = 0
242  blockindextocolorindex = -1
243  blockindextocolorindex(0) = 0
244  my_rank = hecmw_comm_get_rank()
245  ! write(9000+my_rank,*) &
246  ! '# numOfElementPerBlock =', numOfElementPerBlock
247  ! write(9000+my_rank,*) &
248  ! '# ic, blockIndex, colorIndex, elementCount'
249  do ic = 1, ncolor
250  elementcount = 0
251  ii = 1
252  do i = colorindex(ic-1)+1, colorindex(ic)
253  elementcount = elementcount + 1
254  elementcount = elementcount + (indexl(i) - indexl(i-1))
255  elementcount = elementcount + (indexu(i) - indexu(i-1))
256  if (elementcount > ii * numofelementperblock &
257  .or. i == colorindex(ic)) then
258  ii = ii + 1
259  blockindex = blockindex + 1
260  blockindextocolorindex(blockindex) = i
261  ! write(9000+my_rank,*) ic, blockIndex, &
262  ! blockIndexToColorIndex(blockIndex), elementCount
263  endif
264  enddo
265  ictoblockindex(ic) = blockindex
266  enddo
267  numofblock = blockindex
268 
270  sectorcachesize0, sectorcachesize1 )
271  end subroutine setup_tuning_parameters
272 
274  implicit none
275  real(kind=kreal), intent(inout) :: zp(:)
276  integer(kind=kint) :: ic, i, iold, j, isl, iel, isu, ieu, k
277  real(kind=kreal) :: sw1, sw2, sw3, x1, x2, x3
278 
279  ! added for tuning >>>
280  integer(kind=kint) :: blockindex
281 
282 #ifndef _OPENACC
283  if (isfirst) then
284  call setup_tuning_parameters
285  isfirst = .false.
286  endif
287 #endif
288  ! <<< added for tuning
289 
290 #ifndef _OPENACC
291  !call start_collection("loopInPrecond33")
292 
293  !OCL CACHE_SECTOR_SIZE(sectorCacheSize0,sectorCacheSize1)
294  !OCL CACHE_SUBSECTOR_ASSIGN(ZP)
295 
296  !$omp parallel default(none) &
297  !$omp&shared(NColor,indexL,itemL,indexU,itemU,AL,AU,D,ALU,perm,&
298  !$omp& ZP,icToBlockIndex,blockIndexToColorIndex) &
299  !$omp&private(SW1,SW2,SW3,X1,X2,X3,ic,i,iold,isL,ieL,isU,ieU,j,k,blockIndex)
300 #endif
301 
302  !C-- FORWARD
303  do ic=1,ncolor
304 #ifdef _OPENACC
305  !$acc kernels
306  !$acc loop independent
307  do i = colorindex(ic-1)+1, colorindex(ic)
308 #else
309  !$omp do schedule (static, 1)
310  do blockindex = ictoblockindex(ic-1)+1, ictoblockindex(ic)
311  do i = blockindextocolorindex(blockindex-1)+1, &
312  blockindextocolorindex(blockindex)
313 #endif
314  ! do i = startPos(threadNum, ic), endPos(threadNum, ic)
315  iold = perm(i)
316  sw1= zp(3*iold-2)
317  sw2= zp(3*iold-1)
318  sw3= zp(3*iold )
319  isl= indexl(i-1)+1
320  iel= indexl(i)
321  do j= isl, iel
322  !k= perm(itemL(j))
323  k= iteml(j)
324  x1= zp(3*k-2)
325  x2= zp(3*k-1)
326  x3= zp(3*k )
327  sw1= sw1 - al(9*j-8)*x1 - al(9*j-7)*x2 - al(9*j-6)*x3
328  sw2= sw2 - al(9*j-5)*x1 - al(9*j-4)*x2 - al(9*j-3)*x3
329  sw3= sw3 - al(9*j-2)*x1 - al(9*j-1)*x2 - al(9*j )*x3
330  enddo ! j
331 
332  x1= sw1
333  x2= sw2
334  x3= sw3
335  x2= x2 - alu(9*i-5)*x1
336  x3= x3 - alu(9*i-2)*x1 - alu(9*i-1)*x2
337  x3= alu(9*i )* x3
338  x2= alu(9*i-4)*( x2 - alu(9*i-3)*x3 )
339  x1= alu(9*i-8)*( x1 - alu(9*i-6)*x3 - alu(9*i-7)*x2)
340  zp(3*iold-2)= x1
341  zp(3*iold-1)= x2
342  zp(3*iold )= x3
343 #ifdef _OPENACC
344  enddo
345  !$acc end kernels
346 #else
347  enddo ! i
348  enddo ! blockIndex
349  !$omp end do
350 #endif
351  enddo ! ic
352 
353  !C-- BACKWARD
354  do ic=ncolor, 1, -1
355 #ifdef _OPENACC
356  !$acc kernels
357  !$acc loop independent
358  do i = colorindex(ic-1)+1, colorindex(ic)
359 #else
360  !$omp do schedule (static, 1)
361  do blockindex = ictoblockindex(ic), ictoblockindex(ic-1)+1, -1
362  do i = blockindextocolorindex(blockindex), &
363  blockindextocolorindex(blockindex-1)+1, -1
364 #endif
365  ! do blockIndex = icToBlockIndex(ic-1)+1, icToBlockIndex(ic)
366  ! do i = blockIndexToColorIndex(blockIndex-1)+1, &
367  ! blockIndexToColorIndex(blockIndex)
368  ! do i = endPos(threadNum, ic), startPos(threadNum, ic), -1
369  sw1= 0.d0
370  sw2= 0.d0
371  sw3= 0.d0
372  isu= indexu(i-1) + 1
373  ieu= indexu(i)
374  do j= ieu, isu, -1
375  !k= perm(itemU(j))
376  k= itemu(j)
377  x1= zp(3*k-2)
378  x2= zp(3*k-1)
379  x3= zp(3*k )
380  sw1= sw1 + au(9*j-8)*x1 + au(9*j-7)*x2 + au(9*j-6)*x3
381  sw2= sw2 + au(9*j-5)*x1 + au(9*j-4)*x2 + au(9*j-3)*x3
382  sw3= sw3 + au(9*j-2)*x1 + au(9*j-1)*x2 + au(9*j )*x3
383  enddo ! j
384 
385  x1= sw1
386  x2= sw2
387  x3= sw3
388  x2= x2 - alu(9*i-5)*x1
389  x3= x3 - alu(9*i-2)*x1 - alu(9*i-1)*x2
390  x3= alu(9*i )* x3
391  x2= alu(9*i-4)*( x2 - alu(9*i-3)*x3 )
392  x1= alu(9*i-8)*( x1 - alu(9*i-6)*x3 - alu(9*i-7)*x2)
393  iold = perm(i)
394  zp(3*iold-2)= zp(3*iold-2) - x1
395  zp(3*iold-1)= zp(3*iold-1) - x2
396  zp(3*iold )= zp(3*iold ) - x3
397 #ifdef _OPENACC
398  enddo
399  !$acc end kernels
400 #else
401  enddo ! i
402  enddo ! blockIndex
403  !$omp end do
404 #endif
405  enddo ! ic
406 #ifndef _OPENACC
407  !$omp end parallel
408 
409  !OCL END_CACHE_SUBSECTOR
410  !OCL END_CACHE_SECTOR_SIZE
411 
412  !call stop_collection("loopInPrecond33")
413 #endif
414 
415  end subroutine hecmw_precond_ssor_33_apply
416 
417  subroutine hecmw_precond_ssor_33_clear(hecMAT)
418  implicit none
419  type(hecmwst_matrix), intent(inout) :: hecmat
420  integer(kind=kint ) :: nthreads = 1
421 #ifndef _OPENACC
422  !$ nthreads = omp_get_max_threads()
423 #endif
424  if (associated(colorindex)) deallocate(colorindex)
425  if (associated(perm)) deallocate(perm)
426  if (associated(iperm)) deallocate(iperm)
427  if (associated(alu)) deallocate(alu)
428  if (nthreads >= 1) then
429  if (associated(d)) deallocate(d)
430  if (associated(al)) deallocate(al)
431  if (associated(au)) deallocate(au)
432  if (associated(indexl)) deallocate(indexl)
433  if (associated(indexu)) deallocate(indexu)
434  if (associated(iteml)) deallocate(iteml)
435  if (associated(itemu)) deallocate(itemu)
436  end if
437  nullify(colorindex)
438  nullify(perm)
439  nullify(iperm)
440  nullify(alu)
441  nullify(d)
442  nullify(al)
443  nullify(au)
444  nullify(indexl)
445  nullify(indexu)
446  nullify(iteml)
447  nullify(itemu)
448  initialized = .false.
449  end subroutine hecmw_precond_ssor_33_clear
450 
451  subroutine write_debug_info
452  implicit none
453  integer(kind=kint) :: my_rank, ic, in
454  my_rank = hecmw_comm_get_rank()
455  !--------------------> debug: shizawa
456  if (my_rank.eq.0) then
457  write(*,*) 'DEBUG: Output fort.19000+myrank and fort.29000+myrank for coloring information'
458  endif
459  write(19000+my_rank,'(a)') '#NCOLORTot'
460  write(19000+my_rank,*) ncolor
461  write(19000+my_rank,'(a)') '#ic COLORindex(ic-1)+1 COLORindex(ic)'
462  do ic=1,ncolor
463  write(19000+my_rank,*) ic, colorindex(ic-1)+1,colorindex(ic)
464  enddo ! ic
465  write(29000+my_rank,'(a)') '#n_node'
466  write(29000+my_rank,*) n
467  write(29000+my_rank,'(a)') '#in OLDtoNEW(in) NEWtoOLD(in)'
468  do in=1,n
469  write(29000+my_rank,*) in, iperm(in), perm(in)
470  if (perm(iperm(in)) .ne. in) then
471  write(29000+my_rank,*) '** WARNING **: NEWtoOLD and OLDtoNEW: ',in
472  endif
473  enddo
474  end subroutine write_debug_info
475 
476  subroutine check_ordering
477  implicit none
478  integer(kind=kint) :: ic, i, j, k
479  integer(kind=kint), allocatable :: iicolor(:)
480  ! check color dependence of neighbouring nodes
481  if (ncolor.gt.1) then
482  allocate(iicolor(n))
483  do ic=1,ncolor
484  do i= colorindex(ic-1)+1, colorindex(ic)
485  iicolor(i) = ic
486  enddo ! i
487  enddo ! ic
488  ! FORWARD: L-part
489  do ic=1,ncolor
490  do i= colorindex(ic-1)+1, colorindex(ic)
491  do j= indexl(i-1)+1, indexl(i)
492  k= iteml(j)
493  if (iicolor(i).eq.iicolor(k)) then
494  write(*,*) .eq.'** ERROR **: L-part: iicolor(i)iicolor(k)',i,k,iicolor(i)
495  endif
496  enddo ! j
497  enddo ! i
498  enddo ! ic
499  ! BACKWARD: U-part
500  do ic=ncolor, 1, -1
501  do i= colorindex(ic), colorindex(ic-1)+1, -1
502  do j= indexu(i-1)+1, indexu(i)
503  k= itemu(j)
504  if (iicolor(i).eq.iicolor(k)) then
505  write(*,*) .eq.'** ERROR **: U-part: iicolor(i)iicolor(k)',i,k,iicolor(i)
506  endif
507  enddo ! j
508  enddo ! i
509  enddo ! ic
510  deallocate(iicolor)
511  endif ! if (NColor.gt.1)
512  !--------------------< debug: shizawa
513  end subroutine check_ordering
514 
515 end module hecmw_precond_ssor_33
real(kind=kreal) function, public hecmw_mat_get_sigma_diag(hecMAT)
integer(kind=kint) function, public hecmw_mat_get_ncolor_in(hecMAT)
subroutine, public hecmw_matrix_reorder_values(N, NDOF, perm, iperm, indexL, indexU, itemL, itemU, AL, AU, D, indexLp, indexUp, itemLp, itemUp, ALp, AUp, Dp)
subroutine, public hecmw_matrix_reorder_renum_item(N, perm, indexXp, itemXp)
subroutine, public hecmw_matrix_reorder_profile(N, perm, iperm, indexL, indexU, itemL, itemU, indexLp, indexUp, itemLp, itemUp)
subroutine, public hecmw_precond_ssor_33_apply(ZP)
subroutine, public hecmw_precond_ssor_33_setup(hecMAT)
subroutine, public hecmw_precond_ssor_33_clear(hecMAT)
subroutine, public hecmw_tuning_fx_calc_sector_cache(N, NDOF, sectorCacheSize0, sectorCacheSize1)
I/O and Utility.
Definition: hecmw_util_f.F90:7
integer(kind=4), parameter kreal
integer(kind=kint) function hecmw_comm_get_rank()
real(kind=kreal) function hecmw_wtime()
subroutine, public hecmw_matrix_ordering_rcm(N, indexL, itemL, indexU, itemU, perm, iperm)
subroutine, public hecmw_matrix_ordering_mc(N, indexL, itemL, indexU, itemU, perm_cur, ncolor_in, ncolor_out, COLORindex, perm, iperm)