FrontISTR  5.9.0
Large-scale structural analysis program with finit element method
hecmw_precond_SSOR_nn.F90
Go to the documentation of this file.
1 !-------------------------------------------------------------------------------
2 ! Copyright (c) 2019 FrontISTR Commons
3 ! This software is released under the MIT License, see LICENSE.txt
4 !-------------------------------------------------------------------------------
5 
6 !C
7 !C***
8 !C*** module hecmw_precond_SSOR_nn
9 !C***
10 !C
12  use hecmw_util
17 #ifndef _OPENACC
18  !$ use omp_lib
19 #endif
20 
21  private
22 
26 
27  integer(kind=kint) :: N
28  real(kind=kreal), pointer :: d(:) => null()
29  real(kind=kreal), pointer :: al(:) => null()
30  real(kind=kreal), pointer :: au(:) => null()
31  integer(kind=kint), pointer :: indexL(:) => null()
32  integer(kind=kint), pointer :: indexU(:) => null()
33  integer(kind=kint), pointer :: itemL(:) => null()
34  integer(kind=kint), pointer :: itemU(:) => null()
35  real(kind=kreal), pointer :: alu(:) => null()
36 
37  integer(kind=kint) :: NColor
38  integer(kind=kint), pointer :: COLORindex(:) => null()
39  integer(kind=kint), pointer :: perm(:) => null()
40  integer(kind=kint), pointer :: iperm(:) => null()
41 
42  logical, save :: isFirst = .true.
43 
44  logical, save :: INITIALIZED = .false.
45 
46 contains
47 
48  subroutine hecmw_precond_ssor_nn_setup(hecMAT)
49  implicit none
50  type(hecmwst_matrix), intent(inout) :: hecmat
51  integer(kind=kint ) :: npl, npu
52  integer(kind=kint ) :: ncolor_in
53  real (kind=kreal) :: sigma_diag
54  real (kind=kreal) :: alutmp(hecmat%NDOF,hecmat%NDOF), pw(hecmat%NDOF)
55  integer(kind=kint ) :: ii, i, j, k, ndof, ndof2
56  integer(kind=kint ) :: nthreads = 1
57  integer(kind=kint ), allocatable :: perm_tmp(:)
58  !real (kind=kreal) :: t0
59 
60  !t0 = hecmw_Wtime()
61  !write(*,*) 'DEBUG: SSOR setup start', hecmw_Wtime()-t0
62 
63  if (initialized) then
64  if (hecmat%Iarray(98) == 1) then ! need symbolic and numerical setup
65  call hecmw_precond_ssor_nn_clear(hecmat)
66  else if (hecmat%Iarray(97) == 1) then ! need numerical setup only
67  call hecmw_precond_ssor_nn_clear(hecmat) ! TEMPORARY
68  else
69  return
70  endif
71  endif
72 
73 #ifndef _OPENACC
74  !$ nthreads = omp_get_max_threads()
75 #endif
76 
77  n = hecmat%N
78  ndof=hecmat%NDOF
79  ndof2=ndof*ndof
80 
81  ncolor_in = hecmw_mat_get_ncolor_in(hecmat)
82  sigma_diag = hecmw_mat_get_sigma_diag(hecmat)
83 
84 #ifdef _OPENACC
85  allocate(colorindex(0:n), perm_tmp(n), perm(n), iperm(n))
86  call hecmw_matrix_ordering_rcm(n, hecmat%indexL, hecmat%itemL, &
87  hecmat%indexU, hecmat%itemU, perm_tmp, iperm)
88  !write(*,*) 'DEBUG: RCM ordering done', hecmw_Wtime()-t0
89  call hecmw_matrix_ordering_mc(n, hecmat%indexL, hecmat%itemL, &
90  hecmat%indexU, hecmat%itemU, perm_tmp, &
91  ncolor_in, ncolor, colorindex, perm, iperm)
92  !write(*,*) 'DEBUG: MC ordering done', hecmw_Wtime()-t0
93  deallocate(perm_tmp)
94 
95  !call write_debug_info
96 #else
97  if (nthreads == 1) then
98  ncolor = 1
99  allocate(colorindex(0:1), perm(n), iperm(n))
100  colorindex(0) = 0
101  colorindex(1) = n
102  do i=1,n
103  perm(i) = i
104  iperm(i) = i
105  end do
106  else
107  allocate(colorindex(0:n), perm_tmp(n), perm(n), iperm(n))
108  call hecmw_matrix_ordering_rcm(n, hecmat%indexL, hecmat%itemL, &
109  hecmat%indexU, hecmat%itemU, perm_tmp, iperm)
110  !write(*,*) 'DEBUG: RCM ordering done', hecmw_Wtime()-t0
111  call hecmw_matrix_ordering_mc(n, hecmat%indexL, hecmat%itemL, &
112  hecmat%indexU, hecmat%itemU, perm_tmp, &
113  ncolor_in, ncolor, colorindex, perm, iperm)
114  !write(*,*) 'DEBUG: MC ordering done', hecmw_Wtime()-t0
115  deallocate(perm_tmp)
116 
117  !call write_debug_info
118  endif
119 #endif
120 
121  npl = hecmat%indexL(n)
122  npu = hecmat%indexU(n)
123  allocate(indexl(0:n), indexu(0:n), iteml(npl), itemu(npu))
124  call hecmw_matrix_reorder_profile(n, perm, iperm, &
125  hecmat%indexL, hecmat%indexU, hecmat%itemL, hecmat%itemU, &
126  indexl, indexu, iteml, itemu)
127  !write(*,*) 'DEBUG: reordering profile done', hecmw_Wtime()-t0
128 
129  !call check_ordering
130 
131  allocate(d(ndof2*n), al(ndof2*npl), au(ndof2*npu))
132  call hecmw_matrix_reorder_values(n, ndof, perm, iperm, &
133  hecmat%indexL, hecmat%indexU, hecmat%itemL, hecmat%itemU, &
134  hecmat%AL, hecmat%AU, hecmat%D, &
135  indexl, indexu, iteml, itemu, al, au, d)
136  !write(*,*) 'DEBUG: reordering values done', hecmw_Wtime()-t0
137 
138  call hecmw_matrix_reorder_renum_item(n, perm, indexl, iteml)
139  call hecmw_matrix_reorder_renum_item(n, perm, indexu, itemu)
140 
141  allocate(alu(ndof2*n))
142  alu = 0.d0
143 
144  do ii= 1, ndof2*n
145  alu(ii) = d(ii)
146  enddo
147 
148 #ifdef _OPENACC
149  !$acc kernels
150  !$acc loop independent private(ALUtmp,PW)
151 #else
152  !$omp parallel default(none),private(ii,ALUtmp,k,i,j,PW),shared(N,NDOF,NDOF2,ALU,SIGMA_DIAG)
153  !$omp do
154 #endif
155  do ii= 1, n
156  do i = 1, ndof
157  do j = 1, ndof
158  alutmp(i,j) = alu(ndof2*(ii-1)+(i-1)*ndof+j)
159  if (i==j) alutmp(i,j)=alutmp(i,j)*sigma_diag
160  end do
161  end do
162  do k= 1, ndof
163  alutmp(k,k)= 1.d0/alutmp(k,k)
164  do i= k+1, ndof
165  alutmp(i,k)= alutmp(i,k) * alutmp(k,k)
166  do j= k+1, ndof
167  pw(j)= alutmp(i,j) - alutmp(i,k)*alutmp(k,j)
168  enddo
169  do j= k+1, ndof
170  alutmp(i,j)= pw(j)
171  enddo
172  enddo
173  enddo
174  do i = 1, ndof
175  do j = 1, ndof
176  alu(ndof2*(ii-1)+(i-1)*ndof+j)= alutmp(i,j)
177  end do
178  end do
179  enddo
180 #ifdef _OPENACC
181  !$acc end kernels
182 #else
183  !$omp end do
184  !$omp end parallel
185 #endif
186 
187  isfirst = .true.
188 
189  initialized = .true.
190  hecmat%Iarray(98) = 0 ! symbolic setup done
191  hecmat%Iarray(97) = 0 ! numerical setup done
192 
193  !write(*,*) 'DEBUG: SSOR setup done', hecmw_Wtime()-t0
194 
195  end subroutine hecmw_precond_ssor_nn_setup
196 
197  subroutine hecmw_precond_ssor_nn_apply(ZP, NDOF)
198  use hecmw_tuning_fx
199  implicit none
200  real(kind=kreal), intent(inout) :: zp(:)
201  integer(kind=kint) :: ic, i, iold, j, isl, iel, isu, ieu, k, ndof, ndof2, idof,jdof
202  real(kind=kreal) :: sw(ndof), x(ndof)
203 
204  ! added for tuning >>>
205  integer(kind=kint), parameter :: numofblockperthread = 100
206  integer(kind=kint), save :: numofthread = 1, numofblock
207  integer(kind=kint), save, allocatable :: ictoblockindex(:)
208  integer(kind=kint), save, allocatable :: blockindextocolorindex(:)
209  integer(kind=kint), save :: sectorcachesize0, sectorcachesize1
210  integer(kind=kint) :: blockindex, elementcount, numofelement, ii
211  real(kind=kreal) :: numofelementperblock
212  integer(kind=kint) :: my_rank
213 
214  ndof2=ndof*ndof
215 #ifndef _OPENACC
216  if (isfirst) then
217  !$ numOfThread = omp_get_max_threads()
218  numofblock = numofthread * numofblockperthread
219  if (allocated(ictoblockindex)) deallocate(ictoblockindex)
220  if (allocated(blockindextocolorindex)) deallocate(blockindextocolorindex)
221  allocate (ictoblockindex(0:ncolor), &
222  blockindextocolorindex(0:numofblock + ncolor))
223  numofelement = n + indexl(n) + indexu(n)
224  numofelementperblock = dble(numofelement) / numofblock
225  blockindex = 0
226  ictoblockindex = -1
227  ictoblockindex(0) = 0
228  blockindextocolorindex = -1
229  blockindextocolorindex(0) = 0
230  my_rank = hecmw_comm_get_rank()
231  ! write(9000+my_rank,*) &
232  ! '# numOfElementPerBlock =', numOfElementPerBlock
233  ! write(9000+my_rank,*) &
234  ! '# ic, blockIndex, colorIndex, elementCount'
235  do ic = 1, ncolor
236  elementcount = 0
237  ii = 1
238  do i = colorindex(ic-1)+1, colorindex(ic)
239  elementcount = elementcount + 1
240  elementcount = elementcount + (indexl(i) - indexl(i-1))
241  elementcount = elementcount + (indexu(i) - indexu(i-1))
242  if (elementcount > ii * numofelementperblock &
243  .or. i == colorindex(ic)) then
244  ii = ii + 1
245  blockindex = blockindex + 1
246  blockindextocolorindex(blockindex) = i
247  ! write(9000+my_rank,*) ic, blockIndex, &
248  ! blockIndexToColorIndex(blockIndex), elementCount
249  endif
250  enddo
251  ictoblockindex(ic) = blockindex
252  enddo
253  numofblock = blockindex
254 
255  call hecmw_tuning_fx_calc_sector_cache( n, ndof, &
256  sectorcachesize0, sectorcachesize1 )
257 
258  isfirst = .false.
259  endif
260 #endif
261  ! <<< added for tuning
262 
263 #ifndef _OPENACC
264  !call start_collection("loopInPrecond33")
265 
266  !OCL CACHE_SECTOR_SIZE(sectorCacheSize0,sectorCacheSize1)
267  !OCL CACHE_SUBSECTOR_ASSIGN(ZP)
268 
269  !$omp parallel default(none) &
270  !$omp&shared(NColor,indexL,itemL,indexU,itemU,AL,AU,D,ALU,perm,&
271  !$omp& ZP,icToBlockIndex,blockIndexToColorIndex,NDOF,NDOF2) &
272  !$omp&private(SW,X,ic,i,iold,isL,ieL,isU,ieU,j,k,blockIndex,idof,jdof)
273 #endif
274 
275  !C-- FORWARD
276  do ic=1,ncolor
277 #ifdef _OPENACC
278  !$acc kernels
279  !$acc loop independent private(X,SW)
280  do i = colorindex(ic-1)+1, colorindex(ic)
281 #else
282  !$omp do schedule (static, 1)
283  do blockindex = ictoblockindex(ic-1)+1, ictoblockindex(ic)
284  do i = blockindextocolorindex(blockindex-1)+1, &
285  blockindextocolorindex(blockindex)
286 #endif
287  iold = perm(i)
288  do idof = 1, ndof
289  sw(idof) = zp(ndof*(iold-1)+idof)
290  end do
291  isl= indexl(i-1)+1
292  iel= indexl(i)
293  do j= isl, iel
294  k= iteml(j)
295  do idof = 1, ndof
296  x(idof) = zp(ndof*(k-1)+idof)
297  end do
298  do idof = 1, ndof
299  do jdof = 1, ndof
300  sw(idof) = sw(idof) - al(ndof2*(j-1)+ndof*(idof-1)+jdof)*x(jdof)
301  end do
302  end do
303  enddo ! j
304 
305  x = sw
306  do idof = 2,ndof
307  do jdof = 1, idof-1
308  x(idof) = x(idof) - alu(ndof2*(i-1)+ndof*(idof-1)+jdof )*x(jdof)
309  end do
310  end do
311  do idof = ndof, 1, -1
312  do jdof = ndof, idof+1, -1
313  x(idof) = x(idof) - alu(ndof2*(i-1)+ndof*(idof-1)+jdof)*x(jdof)
314  end do
315  x(idof) = alu(ndof2*(i-1)+(ndof+1)*(idof-1)+1)*x(idof)
316  end do
317  zp(ndof*(iold-1)+1:ndof*(iold-1)+ndof) = x(1:ndof)
318 
319 #ifdef _OPENACC
320  enddo
321  !$acc end kernels
322 #else
323  enddo ! i
324  enddo ! blockIndex
325  !$omp end do
326 #endif
327  enddo ! ic
328 
329  !C-- BACKWARD
330  do ic=ncolor, 1, -1
331 #ifdef _OPENACC
332  !$acc kernels
333  !$acc loop independent private(X,SW)
334  do i = colorindex(ic-1)+1, colorindex(ic)
335 #else
336  !$omp do schedule (static, 1)
337  do blockindex = ictoblockindex(ic), ictoblockindex(ic-1)+1, -1
338  do i = blockindextocolorindex(blockindex), &
339  blockindextocolorindex(blockindex-1)+1, -1
340 #endif
341  ! do blockIndex = icToBlockIndex(ic-1)+1, icToBlockIndex(ic)
342  ! do i = blockIndexToColorIndex(blockIndex-1)+1, &
343  ! blockIndexToColorIndex(blockIndex)
344  ! do i = endPos(threadNum, ic), startPos(threadNum, ic), -1
345  sw= 0.d0
346  isu= indexu(i-1) + 1
347  ieu= indexu(i)
348  do j= ieu, isu, -1
349  k= itemu(j)
350  do idof = 1, ndof
351  x(idof) = zp(ndof*(k-1)+idof)
352  end do
353  do idof = 1, ndof
354  do jdof = 1, ndof
355  sw(idof) = sw(idof) + au(ndof2*(j-1)+ndof*(idof-1)+jdof)*x(jdof)
356  end do
357  end do
358  enddo ! j
359 
360  x = sw
361  do idof = 2, ndof
362  do k = 1,idof-1
363  x(idof) = x(idof) - alu(ndof2*(i-1)+ndof*(idof-1)+k)*x(k)
364  end do
365  end do
366  do idof = ndof, 1, -1
367  do k = ndof, idof+1, -1
368  x(idof) = x(idof) - alu(ndof2*(i-1)+ndof*(idof-1)+k)*x(k)
369  end do
370  x(idof) = alu(ndof2*(i-1)+(ndof+1)*(idof-1)+1)*x(idof)
371  end do
372  iold = perm(i)
373  do idof = 1, ndof
374  zp(ndof*(iold-1)+idof) = zp(ndof*(iold-1)+idof) - x(idof)
375  end do
376 #ifdef _OPENACC
377  enddo
378  !$acc end kernels
379 #else
380  enddo ! i
381  enddo ! blockIndex
382  !$omp end do
383 #endif
384  enddo ! ic
385 #ifndef _OPENACC
386  !$omp end parallel
387 
388  !OCL END_CACHE_SUBSECTOR
389  !OCL END_CACHE_SECTOR_SIZE
390 
391  !call stop_collection("loopInPrecond33")
392 #endif
393 
394  end subroutine hecmw_precond_ssor_nn_apply
395 
396  subroutine hecmw_precond_ssor_nn_clear(hecMAT)
397  implicit none
398  type(hecmwst_matrix), intent(inout) :: hecmat
399  integer(kind=kint ) :: nthreads = 1
400 #ifndef _OPENACC
401  !$ nthreads = omp_get_max_threads()
402 #endif
403  if (associated(colorindex)) deallocate(colorindex)
404  if (associated(perm)) deallocate(perm)
405  if (associated(iperm)) deallocate(iperm)
406  if (associated(alu)) deallocate(alu)
407  if (nthreads >= 1) then
408  if (associated(d)) deallocate(d)
409  if (associated(al)) deallocate(al)
410  if (associated(au)) deallocate(au)
411  if (associated(indexl)) deallocate(indexl)
412  if (associated(indexu)) deallocate(indexu)
413  if (associated(iteml)) deallocate(iteml)
414  if (associated(itemu)) deallocate(itemu)
415  end if
416  nullify(colorindex)
417  nullify(perm)
418  nullify(iperm)
419  nullify(alu)
420  nullify(d)
421  nullify(al)
422  nullify(au)
423  nullify(indexl)
424  nullify(indexu)
425  nullify(iteml)
426  nullify(itemu)
427  initialized = .false.
428  end subroutine hecmw_precond_ssor_nn_clear
429 
430  subroutine write_debug_info
431  implicit none
432  integer(kind=kint) :: my_rank, ic, in
433  my_rank = hecmw_comm_get_rank()
434  !--------------------> debug: shizawa
435  if (my_rank.eq.0) then
436  write(*,*) 'DEBUG: Output fort.19000+myrank and fort.29000+myrank for coloring information'
437  endif
438  write(19000+my_rank,'(a)') '#NCOLORTot'
439  write(19000+my_rank,*) ncolor
440  write(19000+my_rank,'(a)') '#ic COLORindex(ic-1)+1 COLORindex(ic)'
441  do ic=1,ncolor
442  write(19000+my_rank,*) ic, colorindex(ic-1)+1,colorindex(ic)
443  enddo ! ic
444  write(29000+my_rank,'(a)') '#n_node'
445  write(29000+my_rank,*) n
446  write(29000+my_rank,'(a)') '#in OLDtoNEW(in) NEWtoOLD(in)'
447  do in=1,n
448  write(29000+my_rank,*) in, iperm(in), perm(in)
449  if (perm(iperm(in)) .ne. in) then
450  write(29000+my_rank,*) '** WARNING **: NEWtoOLD and OLDtoNEW: ',in
451  endif
452  enddo
453  end subroutine write_debug_info
454 
455  subroutine check_ordering
456  implicit none
457  integer(kind=kint) :: ic, i, j, k
458  integer(kind=kint), allocatable :: iicolor(:)
459  ! check color dependence of neighbouring nodes
460  if (ncolor.gt.1) then
461  allocate(iicolor(n))
462  do ic=1,ncolor
463  do i= colorindex(ic-1)+1, colorindex(ic)
464  iicolor(i) = ic
465  enddo ! i
466  enddo ! ic
467  ! FORWARD: L-part
468  do ic=1,ncolor
469  do i= colorindex(ic-1)+1, colorindex(ic)
470  do j= indexl(i-1)+1, indexl(i)
471  k= iteml(j)
472  if (iicolor(i).eq.iicolor(k)) then
473  write(*,*) .eq.'** ERROR **: L-part: iicolor(i)iicolor(k)',i,k,iicolor(i)
474  endif
475  enddo ! j
476  enddo ! i
477  enddo ! ic
478  ! BACKWARD: U-part
479  do ic=ncolor, 1, -1
480  do i= colorindex(ic), colorindex(ic-1)+1, -1
481  do j= indexu(i-1)+1, indexu(i)
482  k= itemu(j)
483  if (iicolor(i).eq.iicolor(k)) then
484  write(*,*) .eq.'** ERROR **: U-part: iicolor(i)iicolor(k)',i,k,iicolor(i)
485  endif
486  enddo ! j
487  enddo ! i
488  enddo ! ic
489  deallocate(iicolor)
490  endif ! if (NColor.gt.1)
491  !--------------------< debug: shizawa
492  end subroutine check_ordering
493 
494 end module hecmw_precond_ssor_nn
real(kind=kreal) function, public hecmw_mat_get_sigma_diag(hecMAT)
integer(kind=kint) function, public hecmw_mat_get_ncolor_in(hecMAT)
subroutine, public hecmw_matrix_reorder_values(N, NDOF, perm, iperm, indexL, indexU, itemL, itemU, AL, AU, D, indexLp, indexUp, itemLp, itemUp, ALp, AUp, Dp)
subroutine, public hecmw_matrix_reorder_renum_item(N, perm, indexXp, itemXp)
subroutine, public hecmw_matrix_reorder_profile(N, perm, iperm, indexL, indexU, itemL, itemU, indexLp, indexUp, itemLp, itemUp)
subroutine, public hecmw_precond_ssor_nn_setup(hecMAT)
subroutine, public hecmw_precond_ssor_nn_clear(hecMAT)
subroutine, public hecmw_precond_ssor_nn_apply(ZP, NDOF)
subroutine, public hecmw_tuning_fx_calc_sector_cache(N, NDOF, sectorCacheSize0, sectorCacheSize1)
I/O and Utility.
Definition: hecmw_util_f.F90:7
integer(kind=4), parameter kreal
integer(kind=kint) function hecmw_comm_get_rank()
subroutine, public hecmw_matrix_ordering_rcm(N, indexL, itemL, indexU, itemU, perm, iperm)
subroutine, public hecmw_matrix_ordering_mc(N, indexL, itemL, indexU, itemU, perm_cur, ncolor_in, ncolor_out, COLORindex, perm, iperm)