📄 forw.f90
字号:
subroutine forw(icplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,nproc,iproc,option,zr,zf)! Adopt standard convention that isign=-1 for forward transform! CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I3,I2)=! S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3)! in parallel using MPI/OpenMP and BLAS library calls.! SHOULD describe nd1eff! SHOULD put icplex and nd1eff in OMP declarations! SHOULD describe the change of value of nd2prod! INPUT:! ZR: input array! ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))! ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))! i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat! OUTPUT:! ZF: output array (note the switch of i2 and i3)! real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)! imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)! i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat! nproc: number of processors used as returned by MPI_COMM_SIZE! iproc: [0:nproc-1] number of processor as returned by MPI_COMM_RANK! n1,n2,n3: logical dimension of the transform. As transform lengths ! most products of the prime factors 2,3,5 are allowed.! The detailed table with allowed transform lengths can ! be found in subroutine CTRIG! nd1,nd2,nd3: Dimension of ZR and ZF! nd2proc=((nd2-1)/nproc)+1 maximal number of 2nd dim slices! nd3proc=((nd3-1)/nproc)+1 maximal number of 3rd dim slices!! PERFORMANCE CONSIDERATIONS:! The maximum number of processors that can reasonably be used is max(n2,n3)!! It is very important to find the optimal ! value of NCACHE. NCACHE determines the size of the work array ZW, that! has to fit into cache. It has therefore to be chosen to equal roughly ! half the size of the physical cache in units of real*8 numbers.! The optimal value of ncache can easily be determined by numerical ! experimentation. A too large value of ncache leads to a dramatic ! and sudden decrease of performance, a too small value to a to a ! slow and less dramatic decrease of performance. If NCACHE is set ! to a value so small, that not even a single one dimensional transform ! can be done in the workarray zw, the program stops with an error message.!! RESTRICTIONS on USAGE! Copyright (C) 2002-2005 Stefan Goedecker, CEA Grenoble! This file is distributed under the terms of the! GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt . implicit real*8 (a-h,o-z)# if defined MPI_FFT include 'mpif.h'# endif! real space input integer, parameter :: ddp = kind(1.d0) REAL(DDP), DIMENSION(2,nd1eff,nd2,nd3proc,ndat) :: zr! Fourier space output REAL(DDP), DIMENSION(2,nd1,nd3,nd2proc,ndat) :: zf! work arrays for transpositions REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:) :: zt! work arrays for MPI REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi1 REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi2! cache work array REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:) :: zw! FFT work arrays REAL(DDP), ALLOCATABLE, DIMENSION(:,:) :: trig1,trig2,trig3 INTEGER, ALLOCATABLE, DIMENSION(:) :: after1,now1,before1, & after2,now2,before2,after3,now3,before3 integer :: option!$ interface!$ integer ( kind=4 ) function omp_get_num_threads ( )!$ end function omp_get_num_threads!$ end interface!$ interface!$ integer ( kind=4 ) function omp_get_thread_num ( )!$ end function omp_get_thread_num!$ end interface!DEBUG! write(6,*)' forw : enter '!ENDDEBUG if (option==1) call timab(532,1,tsec)! find cache size that gives optimal performance on machine ncache=4*1024 if (ncache/(4*max(n1,n2,n3)).lt.1) then write(6,*) & ' ncache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of each size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif! check input if (nd1.lt.n1) stop 'ERROR:nd1' if (nd2.lt.n2) stop 'ERROR:nd2' if (nd3.lt.n3) stop 'ERROR:nd3' lock=0!$omp parallel default(private) &!$omp shared(ndat,n1,n2,n3,nd1,nd2,nd3,nd2proc,nd3proc,iproc,nproc,ncache,zr,zf,lock,icplex) iam=0 npr=1!$ iam=omp_get_thread_num()!$ npr=omp_get_num_threads()! Effective n1 and n2 (complex-to-complex or real-to-complex) n1eff=n1 ; n2eff=n2 ; n1zt=n1 if(icplex==1)then n1eff=(n1+1)/2 ; n2eff=n2/2+1 ; n1zt=2*(n1/2+1) endif lzt=n2eff if (mod(n2eff,2).eq.0) lzt=lzt+1 if (mod(n2eff,4).eq.0) lzt=lzt+1 nnd3=nd3proc*nproc ! maximal number of big box 3rd dim slices for all procs!$omp critical allocate(trig1(2,2048),after1(7),now1(7),before1(7), & trig2(2,2048),after2(7),now2(7),before2(7), & trig3(2,2048),after3(7),now3(7),before3(7), & zw(2,ncache/4,2),zt(2,lzt,n1zt), & zmpi2(2,n1,nd2proc,nnd3)) if (nproc.gt.1) allocate(zmpi1(2,n1,nd2proc,nnd3))!$omp end critical call ctrig(n2,trig2,after2,before2,now2,-1,ic2) call ctrig(n1,trig1,after1,before1,now1,-1,ic1) call ctrig(n3,trig3,after3,before3,now3,-1,ic3)!$omp do do 12345,idat=1,ndat do 1212,j3=1,nd3proc if (iproc*(nd3proc)+j3.le.n3) then Jp2st=1 J2st=1! transform along y axis! input: i1,i2,j3,(jp3) lot=ncache/(4*n2) do 2000,j=1,n1eff,lot ma=j mb=min(j+(lot-1),n1eff) n1dfft=mb-ma+1 i=1 call fftstp(nd1eff,n1dfft,nd2,lot,n2,zr(1,j,1,j3,idat),zw(1,1,1), & trig2,after2(i),now2(i),before2(i),-1) inzee=1 do i=2,ic2 call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & trig2,after2(i),now2(i),before2(i),-1) inzee=3-inzee enddo! input: i1,I2,j3,(jp3) if(icplex==2)then call unswitch(n1dfft,n2,lot,n1zt,lzt,zw(1,1,inzee),zt(1,1,j)) else call unswitchreal(n1dfft,n2,n2eff,lot,n1zt,lzt,zw(1,1,inzee),zt(1,1,2*j-1)) endif! output: I2,i1,j3,(jp3)2000 continue! transform along x axis! input: I2,i1,j3,(jp3) lot=ncache/(4*n1) do 1000,j=1,n2eff,lot ma=j mb=min(j+(lot-1),n2eff) n1dfft=mb-ma+1 i=1 call fftstp(lzt,n1dfft,n1zt,lot,n1,zt(1,j,1),zw(1,1,1), & trig1,after1(i),now1(i),before1(i),-1) inzee=1 do i=2,ic1 call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & trig1,after1(i),now1(i),before1(i),-1) inzee=3-inzee enddo! output: I2,I1,j3,(jp3)! input: J2,Jp2,I1,j3,(jp3)! write(6,*) 'J2st,Jp2st',J2st,Jp2st if (nproc.eq.1) then call unmpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc,option,zw(1,1,inzee),zmpi2) else call unmpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc,option,zw(1,1,inzee),zmpi1) endif! output: I1,J2,j3,Jp2,(jp3)1000 continue endif 1212 continue!DEBUG! write(6,*)' after j3 loop'!ENDDEBUG! Interprocessor data transposition! intput: I1,J2,j3,Jp2,(jp3) if (nproc.gt.1) then11 continue!$omp flush(lock) if (mod(lock,npr).ne.iam) goto 11# if defined MPI_FFT call timab(534,1,tsec) call MPI_ALLTOALL(zmpi1,2*n1*nd2proc*nd3proc, & MPI_double_precision, & zmpi2,2*n1*nd2proc*nd3proc, & MPI_double_precision,MPI_COMM_WORLD,ierr) call timab(534,2,tsec)# endif lock=lock+1 !$omp flush(lock) endif! output: I1,J2,j3,jp3,(Jp2)! transform along z axis! input: I1,J2,i3,(Jp2) lot=ncache/(4*n3) do 3333,j2=1,nd2proc if (iproc*(nd2proc)+j2.le.n2eff) then do 3000,i1=1,n1,lot ma=i1 mb=min(i1+(lot-1),n1) n1dfft=mb-ma+1! input: I1,J2,i3,(Jp2) call unscramble(i1,j2,lot,n1dfft,n1,n3,nd2proc,nd3,zmpi2,zw(1,1,1))! output: I1,i3,J2,(Jp2) inzee=1 do i=1,ic3 call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & trig3,after3(i),now3(i),before3(i),-1) inzee=3-inzee enddo call unfill(nd1,nd3,lot,n1dfft,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat))! output: I1,I3,J2,(Jp2)3000 continue endif3333 continue12345 continue!$omp enddo deallocate(trig1,after1,now1,before1, & trig2,after2,now2,before2, & trig3,after3,now3,before3, & zmpi2,zw,zt) if (nproc.gt.1) deallocate(zmpi1)!$omp end parallel!DEBUG! write(6,*)' forw : exit '!ENDDEBUG if (option==1) call timab(532,2,tsec) return end
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -