25 integer(kind=kint) :: N
26 real(kind=
kreal),
pointer :: d(:) => null()
27 real(kind=
kreal),
pointer :: al(:) => null()
28 real(kind=
kreal),
pointer :: au(:) => null()
29 integer(kind=kint),
pointer :: indexL(:) => null()
30 integer(kind=kint),
pointer :: indexU(:) => null()
31 integer(kind=kint),
pointer :: itemL(:) => null()
32 integer(kind=kint),
pointer :: itemU(:) => null()
33 real(kind=
kreal),
pointer :: alu(:) => null()
35 integer(kind=kint) :: NColor
36 integer(kind=kint),
pointer :: COLORindex(:) => null()
37 integer(kind=kint),
pointer :: perm(:) => null()
38 integer(kind=kint),
pointer :: iperm(:) => null()
40 logical,
save :: isFirst = .true.
42 logical,
save :: INITIALIZED = .false.
45 integer(kind=kint),
parameter :: numOfBlockPerThread = 100
46 integer(kind=kint),
save :: numOfThread = 1, numofblock
47 integer(kind=kint),
save,
allocatable :: icToBlockIndex(:)
48 integer(kind=kint),
save,
allocatable :: blockIndexToColorIndex(:)
49 integer(kind=kint),
save :: sectorCacheSize0, sectorCacheSize1
51 integer(kind=kint),
parameter :: DEBUG = 0
58 integer(kind=kint ) :: npl, npu, npcl, npcu
59 real (kind=
kreal),
allocatable :: cd(:)
60 integer(kind=kint ) :: ncolor_in
61 real (kind=
kreal) :: sigma_diag
62 real (kind=
kreal) :: alutmp(3,3), pw(3)
63 integer(kind=kint ) :: ii, i, j, k
64 integer(kind=kint ) :: nthreads = 1
65 integer(kind=kint ),
allocatable :: perm_tmp(:)
66 real (kind=
kreal) :: t0
70 write(*,*)
'DEBUG: SSOR setup start',
hecmw_wtime()-t0
74 if (hecmat%Iarray(98) == 1)
then
76 else if (hecmat%Iarray(97) == 1)
then
90 if (nthreads == 1)
then
92 allocate(colorindex(0:1), perm(n), iperm(n))
100 allocate(colorindex(0:n), perm_tmp(n), perm(n), iperm(n))
102 hecmat%indexU, hecmat%itemU, perm_tmp, iperm)
103 if (debug >= 1)
write(*,*)
'DEBUG: RCM ordering done',
hecmw_wtime()-t0
105 hecmat%indexU, hecmat%itemU, perm_tmp, &
106 ncolor_in, ncolor, colorindex, perm, iperm)
107 if (debug >= 1)
write(*,*)
'DEBUG: MC ordering done',
hecmw_wtime()-t0
115 do j=hecmat%indexU(i-1)+1,hecmat%indexU(i)
116 if( hecmat%itemU(j) > n )
exit
120 npl = max(hecmat%indexL(n),npl)
121 npu = hecmat%indexU(n)
122 allocate(indexl(0:n), indexu(0:n), iteml(npl), itemu(npu))
124 hecmat%indexL, hecmat%indexU, hecmat%itemL, hecmat%itemU, &
125 indexl, indexu, iteml, itemu)
126 if (debug >= 1)
write(*,*)
'DEBUG: reordering profile done',
hecmw_wtime()-t0
130 allocate(d(9*n), al(9*npl), au(9*npu))
132 hecmat%indexL, hecmat%indexU, hecmat%itemL, hecmat%itemU, &
133 hecmat%AL, hecmat%AU, hecmat%D, &
134 indexl, indexu, iteml, itemu, al, au, d)
135 if (debug >= 1)
write(*,*)
'DEBUG: reordering values done',
hecmw_wtime()-t0
150 alutmp(1,1)= alu(9*ii-8) * sigma_diag
151 alutmp(1,2)= alu(9*ii-7)
152 alutmp(1,3)= alu(9*ii-6)
153 alutmp(2,1)= alu(9*ii-5)
154 alutmp(2,2)= alu(9*ii-4) * sigma_diag
155 alutmp(2,3)= alu(9*ii-3)
156 alutmp(3,1)= alu(9*ii-2)
157 alutmp(3,2)= alu(9*ii-1)
158 alutmp(3,3)= alu(9*ii ) * sigma_diag
160 alutmp(k,k)= 1.d0/alutmp(k,k)
162 alutmp(i,k)= alutmp(i,k) * alutmp(k,k)
164 pw(j)= alutmp(i,j) - alutmp(i,k)*alutmp(k,j)
171 alu(9*ii-8)= alutmp(1,1)
172 alu(9*ii-7)= alutmp(1,2)
173 alu(9*ii-6)= alutmp(1,3)
174 alu(9*ii-5)= alutmp(2,1)
175 alu(9*ii-4)= alutmp(2,2)
176 alu(9*ii-3)= alutmp(2,3)
177 alu(9*ii-2)= alutmp(3,1)
178 alu(9*ii-1)= alutmp(3,2)
179 alu(9*ii )= alutmp(3,3)
187 hecmat%Iarray(98) = 0
188 hecmat%Iarray(97) = 0
190 if (debug >= 1)
write(*,*)
'DEBUG: SSOR setup done',
hecmw_wtime()-t0
194 subroutine setup_tuning_parameters
197 integer(kind=kint) :: blockindex, elementcount, numofelement, ii
198 real(kind=
kreal) :: numofelementperblock
199 integer(kind=kint) :: my_rank
200 integer(kind=kint) :: ic, i
201 if (debug >= 1)
write(*,*)
'DEBUG: setting up tuning parameters for SSOR'
203 numofblock = numofthread * numofblockperthread
204 if (
allocated(ictoblockindex))
deallocate(ictoblockindex)
205 if (
allocated(blockindextocolorindex))
deallocate(blockindextocolorindex)
206 allocate (ictoblockindex(0:ncolor), &
207 blockindextocolorindex(0:numofblock + ncolor))
208 numofelement = n + indexl(n) + indexu(n)
209 numofelementperblock = dble(numofelement) / numofblock
212 ictoblockindex(0) = 0
213 blockindextocolorindex = -1
214 blockindextocolorindex(0) = 0
223 do i = colorindex(ic-1)+1, colorindex(ic)
224 elementcount = elementcount + 1
225 elementcount = elementcount + (indexl(i) - indexl(i-1))
226 elementcount = elementcount + (indexu(i) - indexu(i-1))
227 if (elementcount > ii * numofelementperblock &
228 .or. i == colorindex(ic))
then
230 blockindex = blockindex + 1
231 blockindextocolorindex(blockindex) = i
236 ictoblockindex(ic) = blockindex
238 numofblock = blockindex
241 sectorcachesize0, sectorcachesize1 )
242 end subroutine setup_tuning_parameters
246 real(kind=
kreal),
intent(inout) :: zp(:)
247 integer(kind=kint) :: ic, i, iold, j, isl, iel, isu, ieu, k
248 real(kind=
kreal) :: sw1, sw2, sw3, x1, x2, x3
251 integer(kind=kint) :: blockindex
254 call setup_tuning_parameters
272 do blockindex = ictoblockindex(ic-1)+1, ictoblockindex(ic)
273 do i = blockindextocolorindex(blockindex-1)+1, &
274 blockindextocolorindex(blockindex)
288 sw1= sw1 - al(9*j-8)*x1 - al(9*j-7)*x2 - al(9*j-6)*x3
289 sw2= sw2 - al(9*j-5)*x1 - al(9*j-4)*x2 - al(9*j-3)*x3
290 sw3= sw3 - al(9*j-2)*x1 - al(9*j-1)*x2 - al(9*j )*x3
296 x2= x2 - alu(9*i-5)*x1
297 x3= x3 - alu(9*i-2)*x1 - alu(9*i-1)*x2
299 x2= alu(9*i-4)*( x2 - alu(9*i-3)*x3 )
300 x1= alu(9*i-8)*( x1 - alu(9*i-6)*x3 - alu(9*i-7)*x2)
312 do blockindex = ictoblockindex(ic), ictoblockindex(ic-1)+1, -1
313 do i = blockindextocolorindex(blockindex), &
314 blockindextocolorindex(blockindex-1)+1, -1
330 sw1= sw1 + au(9*j-8)*x1 + au(9*j-7)*x2 + au(9*j-6)*x3
331 sw2= sw2 + au(9*j-5)*x1 + au(9*j-4)*x2 + au(9*j-3)*x3
332 sw3= sw3 + au(9*j-2)*x1 + au(9*j-1)*x2 + au(9*j )*x3
338 x2= x2 - alu(9*i-5)*x1
339 x3= x3 - alu(9*i-2)*x1 - alu(9*i-1)*x2
341 x2= alu(9*i-4)*( x2 - alu(9*i-3)*x3 )
342 x1= alu(9*i-8)*( x1 - alu(9*i-6)*x3 - alu(9*i-7)*x2)
344 zp(3*iold-2)= zp(3*iold-2) - x1
345 zp(3*iold-1)= zp(3*iold-1) - x2
346 zp(3*iold )= zp(3*iold ) - x3
363 integer(kind=kint ) :: nthreads = 1
365 if (
associated(colorindex))
deallocate(colorindex)
366 if (
associated(perm))
deallocate(perm)
367 if (
associated(iperm))
deallocate(iperm)
368 if (
associated(alu))
deallocate(alu)
369 if (nthreads >= 1)
then
370 if (
associated(d))
deallocate(d)
371 if (
associated(al))
deallocate(al)
372 if (
associated(au))
deallocate(au)
373 if (
associated(indexl))
deallocate(indexl)
374 if (
associated(indexu))
deallocate(indexu)
375 if (
associated(iteml))
deallocate(iteml)
376 if (
associated(itemu))
deallocate(itemu)
389 initialized = .false.
392 subroutine write_debug_info
394 integer(kind=kint) :: my_rank, ic, in
397 if (my_rank.eq.0)
then
398 write(*,*)
'DEBUG: Output fort.19000+myrank and fort.29000+myrank for coloring information'
400 write(19000+my_rank,
'(a)')
'#NCOLORTot'
401 write(19000+my_rank,*) ncolor
402 write(19000+my_rank,
'(a)')
'#ic COLORindex(ic-1)+1 COLORindex(ic)'
404 write(19000+my_rank,*) ic, colorindex(ic-1)+1,colorindex(ic)
406 write(29000+my_rank,
'(a)')
'#n_node'
407 write(29000+my_rank,*) n
408 write(29000+my_rank,
'(a)')
'#in OLDtoNEW(in) NEWtoOLD(in)'
410 write(29000+my_rank,*) in, iperm(in), perm(in)
411 if (perm(iperm(in)) .ne. in)
then
412 write(29000+my_rank,*)
'** WARNING **: NEWtoOLD and OLDtoNEW: ',in
415 end subroutine write_debug_info
417 subroutine check_ordering
419 integer(kind=kint) :: ic, i, j, k
420 integer(kind=kint),
allocatable :: iicolor(:)
422 if (ncolor.gt.1)
then
425 do i= colorindex(ic-1)+1, colorindex(ic)
431 do i= colorindex(ic-1)+1, colorindex(ic)
432 do j= indexl(i-1)+1, indexl(i)
434 if (iicolor(i).eq.iicolor(k))
then
435 write(*,*) .eq.
'** ERROR **: L-part: iicolor(i)iicolor(k)',i,k,iicolor(i)
442 do i= colorindex(ic), colorindex(ic-1)+1, -1
443 do j= indexu(i-1)+1, indexu(i)
445 if (iicolor(i).eq.iicolor(k))
then
446 write(*,*) .eq.
'** ERROR **: U-part: iicolor(i)iicolor(k)',i,k,iicolor(i)
454 end subroutine check_ordering