FrontISTR  5.7.0
Large-scale structural analysis program with finit element method
hecmw_precond_SSOR_33.f90
Go to the documentation of this file.
1 !-------------------------------------------------------------------------------
2 ! Copyright (c) 2019 FrontISTR Commons
3 ! This software is released under the MIT License, see LICENSE.txt
4 !-------------------------------------------------------------------------------
5 
6 !C
7 !C***
8 !C*** module hecmw_precond_SSOR_33
9 !C***
10 !C
12  use hecmw_util
17  !$ use omp_lib
18 
19  private
20 
24 
25  integer(kind=kint) :: N
26  real(kind=kreal), pointer :: d(:) => null()
27  real(kind=kreal), pointer :: al(:) => null()
28  real(kind=kreal), pointer :: au(:) => null()
29  integer(kind=kint), pointer :: indexL(:) => null()
30  integer(kind=kint), pointer :: indexU(:) => null()
31  integer(kind=kint), pointer :: itemL(:) => null()
32  integer(kind=kint), pointer :: itemU(:) => null()
33  real(kind=kreal), pointer :: alu(:) => null()
34 
35  integer(kind=kint) :: NColor
36  integer(kind=kint), pointer :: COLORindex(:) => null()
37  integer(kind=kint), pointer :: perm(:) => null()
38  integer(kind=kint), pointer :: iperm(:) => null()
39 
40  logical, save :: isFirst = .true.
41 
42  logical, save :: INITIALIZED = .false.
43 
44  ! for tuning
45  integer(kind=kint), parameter :: numOfBlockPerThread = 100
46  integer(kind=kint), save :: numOfThread = 1, numofblock
47  integer(kind=kint), save, allocatable :: icToBlockIndex(:)
48  integer(kind=kint), save, allocatable :: blockIndexToColorIndex(:)
49  integer(kind=kint), save :: sectorCacheSize0, sectorCacheSize1
50 
51  integer(kind=kint), parameter :: DEBUG = 0
52 
53 contains
54 
55  subroutine hecmw_precond_ssor_33_setup(hecMAT)
56  implicit none
57  type(hecmwst_matrix), intent(inout) :: hecmat
58  integer(kind=kint ) :: npl, npu, npcl, npcu
59  real (kind=kreal), allocatable :: cd(:)
60  integer(kind=kint ) :: ncolor_in
61  real (kind=kreal) :: sigma_diag
62  real (kind=kreal) :: alutmp(3,3), pw(3)
63  integer(kind=kint ) :: ii, i, j, k
64  integer(kind=kint ) :: nthreads = 1
65  integer(kind=kint ), allocatable :: perm_tmp(:)
66  real (kind=kreal) :: t0
67 
68  if (debug >= 1) then
69  t0 = hecmw_wtime()
70  write(*,*) 'DEBUG: SSOR setup start', hecmw_wtime()-t0
71  endif
72 
73  if (initialized) then
74  if (hecmat%Iarray(98) == 1) then ! need symbolic and numerical setup
75  call hecmw_precond_ssor_33_clear(hecmat)
76  else if (hecmat%Iarray(97) == 1) then ! need numerical setup only
77  call hecmw_precond_ssor_33_clear(hecmat) ! TEMPORARY
78  else
79  return
80  endif
81  endif
82 
83  !$ nthreads = omp_get_max_threads()
84 
85  n = hecmat%N
86  ! N = hecMAT%NP
87  ncolor_in = hecmw_mat_get_ncolor_in(hecmat)
88  sigma_diag = hecmw_mat_get_sigma_diag(hecmat)
89 
90  if (nthreads == 1) then
91  ncolor = 1
92  allocate(colorindex(0:1), perm(n), iperm(n))
93  colorindex(0) = 0
94  colorindex(1) = n
95  do i=1,n
96  perm(i) = i
97  iperm(i) = i
98  end do
99  else
100  allocate(colorindex(0:n), perm_tmp(n), perm(n), iperm(n))
101  call hecmw_matrix_ordering_rcm(n, hecmat%indexL, hecmat%itemL, &
102  hecmat%indexU, hecmat%itemU, perm_tmp, iperm)
103  if (debug >= 1) write(*,*) 'DEBUG: RCM ordering done', hecmw_wtime()-t0
104  call hecmw_matrix_ordering_mc(n, hecmat%indexL, hecmat%itemL, &
105  hecmat%indexU, hecmat%itemU, perm_tmp, &
106  ncolor_in, ncolor, colorindex, perm, iperm)
107  if (debug >= 1) write(*,*) 'DEBUG: MC ordering done', hecmw_wtime()-t0
108  deallocate(perm_tmp)
109 
110  !call write_debug_info
111  endif
112 
113  npl = 0
114  do i=1,n
115  do j=hecmat%indexU(i-1)+1,hecmat%indexU(i)
116  if( hecmat%itemU(j) > n ) exit
117  npl = npl + 1
118  enddo
119  enddo
120  npl = max(hecmat%indexL(n),npl)
121  npu = hecmat%indexU(n)
122  allocate(indexl(0:n), indexu(0:n), iteml(npl), itemu(npu))
123  call hecmw_matrix_reorder_profile(n, perm, iperm, &
124  hecmat%indexL, hecmat%indexU, hecmat%itemL, hecmat%itemU, &
125  indexl, indexu, iteml, itemu)
126  if (debug >= 1) write(*,*) 'DEBUG: reordering profile done', hecmw_wtime()-t0
127 
128  !call check_ordering
129 
130  allocate(d(9*n), al(9*npl), au(9*npu))
131  call hecmw_matrix_reorder_values(n, 3, perm, iperm, &
132  hecmat%indexL, hecmat%indexU, hecmat%itemL, hecmat%itemU, &
133  hecmat%AL, hecmat%AU, hecmat%D, &
134  indexl, indexu, iteml, itemu, al, au, d)
135  if (debug >= 1) write(*,*) 'DEBUG: reordering values done', hecmw_wtime()-t0
136 
137  call hecmw_matrix_reorder_renum_item(n, perm, indexl, iteml)
138  call hecmw_matrix_reorder_renum_item(n, perm, indexu, itemu)
139 
140  allocate(alu(9*n))
141  alu = 0.d0
142 
143  do ii= 1, 9*n
144  alu(ii) = d(ii)
145  enddo
146 
147  !$omp parallel default(none),private(ii,ALUtmp,k,i,j,PW),shared(N,ALU,SIGMA_DIAG)
148  !$omp do
149  do ii= 1, n
150  alutmp(1,1)= alu(9*ii-8) * sigma_diag
151  alutmp(1,2)= alu(9*ii-7)
152  alutmp(1,3)= alu(9*ii-6)
153  alutmp(2,1)= alu(9*ii-5)
154  alutmp(2,2)= alu(9*ii-4) * sigma_diag
155  alutmp(2,3)= alu(9*ii-3)
156  alutmp(3,1)= alu(9*ii-2)
157  alutmp(3,2)= alu(9*ii-1)
158  alutmp(3,3)= alu(9*ii ) * sigma_diag
159  do k= 1, 3
160  alutmp(k,k)= 1.d0/alutmp(k,k)
161  do i= k+1, 3
162  alutmp(i,k)= alutmp(i,k) * alutmp(k,k)
163  do j= k+1, 3
164  pw(j)= alutmp(i,j) - alutmp(i,k)*alutmp(k,j)
165  enddo
166  do j= k+1, 3
167  alutmp(i,j)= pw(j)
168  enddo
169  enddo
170  enddo
171  alu(9*ii-8)= alutmp(1,1)
172  alu(9*ii-7)= alutmp(1,2)
173  alu(9*ii-6)= alutmp(1,3)
174  alu(9*ii-5)= alutmp(2,1)
175  alu(9*ii-4)= alutmp(2,2)
176  alu(9*ii-3)= alutmp(2,3)
177  alu(9*ii-2)= alutmp(3,1)
178  alu(9*ii-1)= alutmp(3,2)
179  alu(9*ii )= alutmp(3,3)
180  enddo
181  !$omp end do
182  !$omp end parallel
183 
184  isfirst = .true.
185 
186  initialized = .true.
187  hecmat%Iarray(98) = 0 ! symbolic setup done
188  hecmat%Iarray(97) = 0 ! numerical setup done
189 
190  if (debug >= 1) write(*,*) 'DEBUG: SSOR setup done', hecmw_wtime()-t0
191 
192  end subroutine hecmw_precond_ssor_33_setup
193 
194  subroutine setup_tuning_parameters
195  use hecmw_tuning_fx
196  implicit none
197  integer(kind=kint) :: blockindex, elementcount, numofelement, ii
198  real(kind=kreal) :: numofelementperblock
199  integer(kind=kint) :: my_rank
200  integer(kind=kint) :: ic, i
201  if (debug >= 1) write(*,*) 'DEBUG: setting up tuning parameters for SSOR'
202  !$ numOfThread = omp_get_max_threads()
203  numofblock = numofthread * numofblockperthread
204  if (allocated(ictoblockindex)) deallocate(ictoblockindex)
205  if (allocated(blockindextocolorindex)) deallocate(blockindextocolorindex)
206  allocate (ictoblockindex(0:ncolor), &
207  blockindextocolorindex(0:numofblock + ncolor))
208  numofelement = n + indexl(n) + indexu(n)
209  numofelementperblock = dble(numofelement) / numofblock
210  blockindex = 0
211  ictoblockindex = -1
212  ictoblockindex(0) = 0
213  blockindextocolorindex = -1
214  blockindextocolorindex(0) = 0
215  my_rank = hecmw_comm_get_rank()
216  ! write(9000+my_rank,*) &
217  ! '# numOfElementPerBlock =', numOfElementPerBlock
218  ! write(9000+my_rank,*) &
219  ! '# ic, blockIndex, colorIndex, elementCount'
220  do ic = 1, ncolor
221  elementcount = 0
222  ii = 1
223  do i = colorindex(ic-1)+1, colorindex(ic)
224  elementcount = elementcount + 1
225  elementcount = elementcount + (indexl(i) - indexl(i-1))
226  elementcount = elementcount + (indexu(i) - indexu(i-1))
227  if (elementcount > ii * numofelementperblock &
228  .or. i == colorindex(ic)) then
229  ii = ii + 1
230  blockindex = blockindex + 1
231  blockindextocolorindex(blockindex) = i
232  ! write(9000+my_rank,*) ic, blockIndex, &
233  ! blockIndexToColorIndex(blockIndex), elementCount
234  endif
235  enddo
236  ictoblockindex(ic) = blockindex
237  enddo
238  numofblock = blockindex
239 
241  sectorcachesize0, sectorcachesize1 )
242  end subroutine setup_tuning_parameters
243 
244  subroutine hecmw_precond_ssor_33_apply(ZP)
245  implicit none
246  real(kind=kreal), intent(inout) :: zp(:)
247  integer(kind=kint) :: ic, i, iold, j, isl, iel, isu, ieu, k
248  real(kind=kreal) :: sw1, sw2, sw3, x1, x2, x3
249 
250  ! added for turning >>>
251  integer(kind=kint) :: blockindex
252 
253  if (isfirst) then
254  call setup_tuning_parameters
255  isfirst = .false.
256  endif
257  ! <<< added for turning
258 
259  !call start_collection("loopInPrecond33")
260 
261  !OCL CACHE_SECTOR_SIZE(sectorCacheSize0,sectorCacheSize1)
262  !OCL CACHE_SUBSECTOR_ASSIGN(ZP)
263 
264  !$omp parallel default(none) &
265  !$omp&shared(NColor,indexL,itemL,indexU,itemU,AL,AU,D,ALU,perm,&
266  !$omp& ZP,icToBlockIndex,blockIndexToColorIndex) &
267  !$omp&private(SW1,SW2,SW3,X1,X2,X3,ic,i,iold,isL,ieL,isU,ieU,j,k,blockIndex)
268 
269  !C-- FORWARD
270  do ic=1,ncolor
271  !$omp do schedule (static, 1)
272  do blockindex = ictoblockindex(ic-1)+1, ictoblockindex(ic)
273  do i = blockindextocolorindex(blockindex-1)+1, &
274  blockindextocolorindex(blockindex)
275  ! do i = startPos(threadNum, ic), endPos(threadNum, ic)
276  iold = perm(i)
277  sw1= zp(3*iold-2)
278  sw2= zp(3*iold-1)
279  sw3= zp(3*iold )
280  isl= indexl(i-1)+1
281  iel= indexl(i)
282  do j= isl, iel
283  !k= perm(itemL(j))
284  k= iteml(j)
285  x1= zp(3*k-2)
286  x2= zp(3*k-1)
287  x3= zp(3*k )
288  sw1= sw1 - al(9*j-8)*x1 - al(9*j-7)*x2 - al(9*j-6)*x3
289  sw2= sw2 - al(9*j-5)*x1 - al(9*j-4)*x2 - al(9*j-3)*x3
290  sw3= sw3 - al(9*j-2)*x1 - al(9*j-1)*x2 - al(9*j )*x3
291  enddo ! j
292 
293  x1= sw1
294  x2= sw2
295  x3= sw3
296  x2= x2 - alu(9*i-5)*x1
297  x3= x3 - alu(9*i-2)*x1 - alu(9*i-1)*x2
298  x3= alu(9*i )* x3
299  x2= alu(9*i-4)*( x2 - alu(9*i-3)*x3 )
300  x1= alu(9*i-8)*( x1 - alu(9*i-6)*x3 - alu(9*i-7)*x2)
301  zp(3*iold-2)= x1
302  zp(3*iold-1)= x2
303  zp(3*iold )= x3
304  enddo ! i
305  enddo ! blockIndex
306  !$omp end do
307  enddo ! ic
308 
309  !C-- BACKWARD
310  do ic=ncolor, 1, -1
311  !$omp do schedule (static, 1)
312  do blockindex = ictoblockindex(ic), ictoblockindex(ic-1)+1, -1
313  do i = blockindextocolorindex(blockindex), &
314  blockindextocolorindex(blockindex-1)+1, -1
315  ! do blockIndex = icToBlockIndex(ic-1)+1, icToBlockIndex(ic)
316  ! do i = blockIndexToColorIndex(blockIndex-1)+1, &
317  ! blockIndexToColorIndex(blockIndex)
318  ! do i = endPos(threadNum, ic), startPos(threadNum, ic), -1
319  sw1= 0.d0
320  sw2= 0.d0
321  sw3= 0.d0
322  isu= indexu(i-1) + 1
323  ieu= indexu(i)
324  do j= ieu, isu, -1
325  !k= perm(itemU(j))
326  k= itemu(j)
327  x1= zp(3*k-2)
328  x2= zp(3*k-1)
329  x3= zp(3*k )
330  sw1= sw1 + au(9*j-8)*x1 + au(9*j-7)*x2 + au(9*j-6)*x3
331  sw2= sw2 + au(9*j-5)*x1 + au(9*j-4)*x2 + au(9*j-3)*x3
332  sw3= sw3 + au(9*j-2)*x1 + au(9*j-1)*x2 + au(9*j )*x3
333  enddo ! j
334 
335  x1= sw1
336  x2= sw2
337  x3= sw3
338  x2= x2 - alu(9*i-5)*x1
339  x3= x3 - alu(9*i-2)*x1 - alu(9*i-1)*x2
340  x3= alu(9*i )* x3
341  x2= alu(9*i-4)*( x2 - alu(9*i-3)*x3 )
342  x1= alu(9*i-8)*( x1 - alu(9*i-6)*x3 - alu(9*i-7)*x2)
343  iold = perm(i)
344  zp(3*iold-2)= zp(3*iold-2) - x1
345  zp(3*iold-1)= zp(3*iold-1) - x2
346  zp(3*iold )= zp(3*iold ) - x3
347  enddo ! i
348  enddo ! blockIndex
349  !$omp end do
350  enddo ! ic
351  !$omp end parallel
352 
353  !OCL END_CACHE_SUBSECTOR
354  !OCL END_CACHE_SECTOR_SIZE
355 
356  !call stop_collection("loopInPrecond33")
357 
358  end subroutine hecmw_precond_ssor_33_apply
359 
360  subroutine hecmw_precond_ssor_33_clear(hecMAT)
361  implicit none
362  type(hecmwst_matrix), intent(inout) :: hecmat
363  integer(kind=kint ) :: nthreads = 1
364  !$ nthreads = omp_get_max_threads()
365  if (associated(colorindex)) deallocate(colorindex)
366  if (associated(perm)) deallocate(perm)
367  if (associated(iperm)) deallocate(iperm)
368  if (associated(alu)) deallocate(alu)
369  if (nthreads >= 1) then
370  if (associated(d)) deallocate(d)
371  if (associated(al)) deallocate(al)
372  if (associated(au)) deallocate(au)
373  if (associated(indexl)) deallocate(indexl)
374  if (associated(indexu)) deallocate(indexu)
375  if (associated(iteml)) deallocate(iteml)
376  if (associated(itemu)) deallocate(itemu)
377  end if
378  nullify(colorindex)
379  nullify(perm)
380  nullify(iperm)
381  nullify(alu)
382  nullify(d)
383  nullify(al)
384  nullify(au)
385  nullify(indexl)
386  nullify(indexu)
387  nullify(iteml)
388  nullify(itemu)
389  initialized = .false.
390  end subroutine hecmw_precond_ssor_33_clear
391 
392  subroutine write_debug_info
393  implicit none
394  integer(kind=kint) :: my_rank, ic, in
395  my_rank = hecmw_comm_get_rank()
396  !--------------------> debug: shizawa
397  if (my_rank.eq.0) then
398  write(*,*) 'DEBUG: Output fort.19000+myrank and fort.29000+myrank for coloring information'
399  endif
400  write(19000+my_rank,'(a)') '#NCOLORTot'
401  write(19000+my_rank,*) ncolor
402  write(19000+my_rank,'(a)') '#ic COLORindex(ic-1)+1 COLORindex(ic)'
403  do ic=1,ncolor
404  write(19000+my_rank,*) ic, colorindex(ic-1)+1,colorindex(ic)
405  enddo ! ic
406  write(29000+my_rank,'(a)') '#n_node'
407  write(29000+my_rank,*) n
408  write(29000+my_rank,'(a)') '#in OLDtoNEW(in) NEWtoOLD(in)'
409  do in=1,n
410  write(29000+my_rank,*) in, iperm(in), perm(in)
411  if (perm(iperm(in)) .ne. in) then
412  write(29000+my_rank,*) '** WARNING **: NEWtoOLD and OLDtoNEW: ',in
413  endif
414  enddo
415  end subroutine write_debug_info
416 
417  subroutine check_ordering
418  implicit none
419  integer(kind=kint) :: ic, i, j, k
420  integer(kind=kint), allocatable :: iicolor(:)
421  ! check color dependence of neighbouring nodes
422  if (ncolor.gt.1) then
423  allocate(iicolor(n))
424  do ic=1,ncolor
425  do i= colorindex(ic-1)+1, colorindex(ic)
426  iicolor(i) = ic
427  enddo ! i
428  enddo ! ic
429  ! FORWARD: L-part
430  do ic=1,ncolor
431  do i= colorindex(ic-1)+1, colorindex(ic)
432  do j= indexl(i-1)+1, indexl(i)
433  k= iteml(j)
434  if (iicolor(i).eq.iicolor(k)) then
435  write(*,*) .eq.'** ERROR **: L-part: iicolor(i)iicolor(k)',i,k,iicolor(i)
436  endif
437  enddo ! j
438  enddo ! i
439  enddo ! ic
440  ! BACKWARD: U-part
441  do ic=ncolor, 1, -1
442  do i= colorindex(ic), colorindex(ic-1)+1, -1
443  do j= indexu(i-1)+1, indexu(i)
444  k= itemu(j)
445  if (iicolor(i).eq.iicolor(k)) then
446  write(*,*) .eq.'** ERROR **: U-part: iicolor(i)iicolor(k)',i,k,iicolor(i)
447  endif
448  enddo ! j
449  enddo ! i
450  enddo ! ic
451  deallocate(iicolor)
452  endif ! if (NColor.gt.1)
453  !--------------------< debug: shizawa
454  end subroutine check_ordering
455 
456 end module hecmw_precond_ssor_33
hecmw_util::hecmw_wtime
real(kind=kreal) function hecmw_wtime()
Definition: hecmw_util_f.F90:549
m_hecmw_matrix_ordering_cm
Definition: hecmw_matrix_ordering_CM.f90:6
hecmw_matrix_reorder::hecmw_matrix_reorder_values
subroutine, public hecmw_matrix_reorder_values(N, NDOF, perm, iperm, indexL, indexU, itemL, itemU, AL, AU, D, indexLp, indexUp, itemLp, itemUp, ALp, AUp, Dp)
Definition: hecmw_matrix_reorder.f90:71
hecmw_precond_ssor_33::hecmw_precond_ssor_33_apply
subroutine, public hecmw_precond_ssor_33_apply(ZP)
Definition: hecmw_precond_SSOR_33.f90:245
hecmw_precond_ssor_33::hecmw_precond_ssor_33_setup
subroutine, public hecmw_precond_ssor_33_setup(hecMAT)
Definition: hecmw_precond_SSOR_33.f90:56
hecmw_matrix_reorder::hecmw_matrix_reorder_profile
subroutine, public hecmw_matrix_reorder_profile(N, perm, iperm, indexL, indexU, itemL, itemU, indexLp, indexUp, itemLp, itemUp)
Definition: hecmw_matrix_reorder.f90:22
hecmw_matrix_reorder::hecmw_matrix_reorder_renum_item
subroutine, public hecmw_matrix_reorder_renum_item(N, perm, indexXp, itemXp)
Definition: hecmw_matrix_reorder.f90:144
hecmw_precond_ssor_33
Definition: hecmw_precond_SSOR_33.f90:11
hecmw_matrix_misc::hecmw_mat_get_ncolor_in
integer(kind=kint) function, public hecmw_mat_get_ncolor_in(hecMAT)
Definition: hecmw_matrix_misc.f90:531
hecmw_tuning_fx
Definition: hecmw_tuning_fx.f90:6
hecmw_tuning_fx::hecmw_tuning_fx_calc_sector_cache
subroutine, public hecmw_tuning_fx_calc_sector_cache(N, NDOF, sectorCacheSize0, sectorCacheSize1)
Definition: hecmw_tuning_fx.f90:25
hecmw_util
I/O and Utility.
Definition: hecmw_util_f.F90:7
m_hecmw_matrix_ordering_mc::hecmw_matrix_ordering_mc
subroutine, public hecmw_matrix_ordering_mc(N, indexL, itemL, indexU, itemU, perm_cur, ncolor_in, ncolor_out, COLORindex, perm, iperm)
Definition: hecmw_matrix_ordering_MC.f90:17
hecmw_util::kreal
integer(kind=4), parameter kreal
Definition: hecmw_util_f.F90:16
hecmw_matrix_misc::hecmw_mat_get_sigma_diag
real(kind=kreal) function, public hecmw_mat_get_sigma_diag(hecMAT)
Definition: hecmw_matrix_misc.f90:694
m_hecmw_matrix_ordering_cm::hecmw_matrix_ordering_rcm
subroutine, public hecmw_matrix_ordering_rcm(N, indexL, itemL, indexU, itemU, perm, iperm)
Definition: hecmw_matrix_ordering_CM.f90:67
m_hecmw_matrix_ordering_mc
Definition: hecmw_matrix_ordering_MC.f90:6
hecmw_matrix_misc
Definition: hecmw_matrix_misc.f90:6
hecmw_matrix_reorder
Definition: hecmw_matrix_reorder.f90:6
hecmw_precond_ssor_33::hecmw_precond_ssor_33_clear
subroutine, public hecmw_precond_ssor_33_clear(hecMAT)
Definition: hecmw_precond_SSOR_33.f90:361
hecmw_util::hecmw_comm_get_rank
integer(kind=kint) function hecmw_comm_get_rank()
Definition: hecmw_util_f.F90:582
hecmw_util::hecmwst_matrix
Definition: hecmw_util_f.F90:444