📄 forw_wf.f90
字号:
subroutine forw_wf(icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,&& max1,max2,max3,m1,m2,m3,md1,md2proc,md3,nproc,iproc,zr,zf)! Does multiple 3-dim backward FFTs from real into Fourier space! Adopt standard convention that isign=-1 for forward transform! CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I3,I2)=! S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3)! in parallel using MPI/OpenMP. ! INPUT:! ZR: input array! ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))! ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))! i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat! n1,n2,n3: logical dimension of the transform. As transform lengths! most products of the prime factors 2,3,5 are allowed.! The detailed table with allowed transform lengths can! be found in subroutine CTRIG! nd1,nd2,nd3: Dimension of ZR! nd3proc=((nd3-1)/nproc)+1 maximal number of big box 3rd dim slices for one proc! OUTPUT:! ZF: output array (note the switch of i2 and i3)! real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)! imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)! max1 is positive or zero ; m1 >=max1+1! i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1! then, if m1 > max1+1, one has min1=max1-m1+1 and! i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1! i2 and i3 have a similar definition of range! idat=1,ndat! md1,md2,md3: Dimension of ZF! md2proc=((md2-1)/nproc)+1 maximal number of small box 2nd dim slices for one proc! nproc: number of processors used as returned by MPI_COMM_SIZE! iproc: [0:nproc-1] number of processor as returned by MPI_COMM_RANK!! PERFORMANCE CONSIDERATIONS:! The maximum number of processors that can reasonably be used is max(n2/2,n3/2)!! It is very important to find the optimal ! value of NCACHE. NCACHE determines the size of the work array ZW, that! has to fit into cache. It has therefore to be chosen to equal roughly ! half the size of the physical cache in units of real*8 numbers.! The optimal value of ncache can easily be determined by numerical ! experimentation. A too large value of ncache leads to a dramatic ! and sudden decrease of performance, a too small value to a to a ! slow and less dramatic decrease of performance. If NCACHE is set ! to a value so small, that not even a single one dimensional transform ! can be done in the workarray zw, the program stops with an error message.!! RESTRICTIONS on USAGE! Copyright (C) 2002-2005 Stefan Goedecker, CEA Grenoble! This file is distributed under the terms of the! GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt . implicit real*8 (a-h,o-z)# if defined MPI_FFT include 'mpif.h'# endif! real space input integer, parameter :: ddp = kind(1.d0) REAL(DDP), DIMENSION(2,nd1,nd2,nd3proc,ndat) :: zr! Fourier space output REAL(DDP), DIMENSION(2,md1,md3,md2proc,ndat) :: zf! work arrays for transpositions REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:) :: zt! work arrays for MPI REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi1 REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi2! cache work array REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:) :: zw! FFT work arrays REAL(DDP), ALLOCATABLE, DIMENSION(:,:) :: trig1,trig2,trig3 INTEGER, ALLOCATABLE, DIMENSION(:) :: after1,now1,before1, & after2,now2,before2,after3,now3,before3!$ interface!$ integer ( kind=4 ) function omp_get_num_threads ( )!$ end function omp_get_num_threads!$ end interface!$ interface!$ integer ( kind=4 ) function omp_get_thread_num ( )!$ end function omp_get_thread_num!$ end interface write(6,*)' forw_wf : enter ' ! find cache size that gives optimal performance on machine ncache=4*1024 if (ncache/(4*max(n1,n2,n3)).lt.1) then write(6,*) & ' ncache has to be enlarged to be able to hold at' // & 'least one 1-d FFT of each size even though this will' // & 'reduce the performance for shorter transform lengths' stop endif lock=0!$omp parallel default(private) &!$omp shared(ndat,n1,n2,n3,nd1,nd2,nd3proc,md1,md2proc,md3,iproc,nproc,ncache,zr,zf,lock) &!$omp shared(max1,max2,max3,m1,m2,m3,icplexwf) iam=0 npr=1!$ iam=omp_get_thread_num()!$ npr=omp_get_num_threads()! Effective m1 and m2 (complex-to-complex or real-to-complex) n1eff=n1 ; m2eff=m2 ; m1zt=n1 if(icplexwf==1)then n1eff=(n1+1)/2 ; m2eff=m2/2+1 ; m1zt=2*(n1/2+1) endif lzt=m2eff if (mod(m2eff,2).eq.0) lzt=lzt+1 if (mod(m2eff,4).eq.0) lzt=lzt+1 nnd3=nd3proc*nproc ! maximal number of big box 3rd dim slices for all procs!$omp critical allocate(trig1(2,2048),after1(7),now1(7),before1(7), & trig2(2,2048),after2(7),now2(7),before2(7), & trig3(2,2048),after3(7),now3(7),before3(7), & zw(2,ncache/4,2),zt(2,lzt,m1zt), & zmpi2(2,md1,md2proc,nnd3)) if (nproc.gt.1) allocate(zmpi1(2,md1,md2proc,nnd3))!$omp end critical call ctrig(n2,trig2,after2,before2,now2,-1,ic2) call ctrig(n1,trig1,after1,before1,now1,-1,ic1) call ctrig(n3,trig3,after3,before3,now3,-1,ic3)!DEBUG! write(6,'(a,3i4)' )'n1,n2,n3',n1,n2,n3! write(6,'(a,3i4)' )'nd1,nd2,nd3proc',nd1,nd2,nd3proc! write(6,'(a,3i4)' )'m1,m2,m3',m1,m2,m3! write(6,'(a,3i4)' )'max1,max2,max3',max1,max2,max3! write(6,'(a,3i4)' )'md1,md2proc,md3',md1,md2proc,md3! write(6,'(a,3i4)' )'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt! do i3=1,n3! do i2=1,n2! do i1=1,n1! write(6, '(3i4,2es16.6)')i1,i2,i3,zr(1:2,i1,i2,i3)! enddo! enddo! enddo! stop!ENDDEBUG!$omp do do 12345,idat=1,ndat do 1212,j3=1,nd3proc if (iproc*nd3proc+j3.le.n3) then Jp2st=1 J2st=1 if(icplexwf==1)then n1half=n1/2 do i2=1,n2 do i1=1,n1half zr(1,i1,i2,j3,idat)=zr(1,2*i1-1,i2,j3,idat) zr(2,i1,i2,j3,idat)=zr(1,2*i1 ,i2,j3,idat) enddo enddo! If odd if(n1half*2/=n1)then do i2=1,n2 zr(1,n1eff,i2,j3,idat)=zr(1,n1,i2,j3,idat) zr(2,n1eff,i2,j3,idat)=0.0d0 enddo endif endif! transform along y axis! input: i1,i2,j3,(jp3) lot=ncache/(4*n2) do 2000,j=1,n1eff,lot ma=j mb=min(j+(lot-1),n1eff) n1dfft=mb-ma+1 i=1 call fftstp(nd1,n1dfft,nd2,lot,n2,zr(1,j,1,j3,idat),zw(1,1,1), & trig2,after2(i),now2(i),before2(i),-1) inzee=1 do i=2,ic2 call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & trig2,after2(i),now2(i),before2(i),-1) inzee=3-inzee enddo! input: i1,I2,j3,(jp3) if(icplexwf==2)then call unswitch_cent(n1dfft,max2,m2,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,j)) else call unswitchreal_cent(n1dfft,max2,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,2*j-1)) endif! output: I2,i1,j3,(jp3)2000 continue!DEBUG! write(6,*)' j3=1, zt ='! do i1=1,m1zt! do i2=1,lzt! write(6, '(2i4,2es16.6)')i2,i1,zt(1:2,i2,i1)! enddo! enddo! stop!ENDDEBUG!DEBUG! write(6,*)' forwf_wf : after unswitch_cent, j3=',j3!ENDDEBUG! transform along x axis! input: I2,i1,j3,(jp3) lot=ncache/(4*n1) do 1000,j=1,m2eff,lot ma=j mb=min(j+(lot-1),m2eff) n1dfft=mb-ma+1 i=1 call fftstp(lzt,n1dfft,m1zt,lot,n1,zt(1,j,1),zw(1,1,1), & trig1,after1(i),now1(i),before1(i),-1) inzee=1 do i=2,ic1 call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & trig1,after1(i),now1(i),before1(i),-1) inzee=3-inzee enddo! output: I2,I1,j3,(jp3)! input: J2,Jp2,I1,j3,(jp3) if (nproc.eq.1) then call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&& md2proc,nd3proc,nproc,zw(1,1,inzee),zmpi2) else call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&& md2proc,nd3proc,nproc,zw(1,1,inzee),zmpi1)! output: I1,J2,j3,Jp2,(jp3) endif1000 continue endif1212 continue!DEBUG! write(6,*)' zmpi2 ='! do i3=1,nnd3! do i2=1,m2eff! do i1=1,md1! write(6, '(3i4,2es16.6)')i1,i2,i3,zmpi2(1:2,i1,i2,i3)! enddo! enddo! enddo! stop!ENDDEBUG!DEBUG! write(6,*)' forwf_wf : interproc data transp'!ENDDEBUG! Interprocessor data transposition! intput: I1,J2,j3,Jp2,(jp3) if (nproc.gt.1) then11 continue!$omp flush(lock) if (mod(lock,npr).ne.iam) goto 11# if defined MPI_FFT call MPI_ALLTOALL(zmpi1,n1*md2proc*nd3proc, & MPI_double_precision, & zmpi2,n1*md2proc*nd3proc, & MPI_double_precision,MPI_COMM_WORLD,ierr)# endif lock=lock+1 !$omp flush(lock) endif! output: I1,J2,j3,jp3,(Jp2)! transform along z axis! input: I1,J2,i3,(Jp2) lot=ncache/(4*n3) do 3333,j2=1,md2proc if (iproc*md2proc+j2.le.m2eff) then!DEBUG! write(6,*)' forwf_wf : before unscramble, j2,md2proc,iproc,m2=',j2,md2proc,iproc,m2!ENDDEBUG do 3000,i1=1,m1,lot ma=i1 mb=min(i1+(lot-1),m1) n1dfft=mb-ma+1! input: I1,J2,i3,(Jp2) call unscramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zmpi2,zw(1,1,1))! output: I1,i3,J2,(Jp2) inzee=1 do i=1,ic3 call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & trig3,after3(i),now3(i),before3(i),-1) inzee=3-inzee enddo call unfill_cent(md1,md3,lot,n1dfft,max3,m3,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat))! output: I1,I3,J2,(Jp2)3000 continue endif3333 continue if(icplexwf==1)then! Complete missing values with complex conjugate! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1. do i3=1,m3 i3inv=m3+2-i3 if(i3==1)i3inv=1 if(m2eff>1)then do i2=2,m2eff i2inv=m2+2-i2 zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat) zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat) do i1=2,m1 i1inv=m1+2-i1 zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat) zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat) enddo enddo endif enddo endif12345 continue!DEBUG! write(6,*)' zf ='! do i2=1,md2proc! do i3=1,md3! do i1=1,md1! write(6, '(3i4,2es16.6)')i1,i3,i2,zf(1:2,i1,i3,i2)! enddo! enddo! enddo! stop!ENDDEBUG!$omp enddo deallocate(trig1,after1,now1,before1, & trig2,after2,now2,before2, & trig3,after3,now3,before3, & zmpi2,zw,zt) if (nproc.gt.1) deallocate(zmpi1)!$omp end parallel write(6,*)' forw_wf : exit ' return end
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -