📄 forw_wf.f90

📁 基于linux操作下的fortran快速付立变换的程序
💻 F90
字号:
        subroutine forw_wf(icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,&&        max1,max2,max3,m1,m2,m3,md1,md2proc,md3,nproc,iproc,zr,zf)! Does multiple 3-dim backward FFTs from real into Fourier space! Adopt standard convention that isign=-1 for forward transform!	CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I3,I2)=!	S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3)!       in parallel using MPI/OpenMP. !	INPUT:!          ZR: input array!               ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))!               ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))!               i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat!          n1,n2,n3: logical dimension of the transform. As transform lengths!                    most products of the prime factors 2,3,5 are allowed.!                    The detailed table with allowed transform lengths can!                    be found in subroutine CTRIG!          nd1,nd2,nd3: Dimension of ZR!          nd3proc=((nd3-1)/nproc)+1  maximal number of big box 3rd dim slices for one proc!	OUTPUT:!          ZF: output array (note the switch of i2 and i3)!               real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)!               imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)!          max1 is positive or zero ; m1 >=max1+1!          i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1!          then, if m1 > max1+1, one has min1=max1-m1+1 and!          i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1!          i2 and i3 have a similar definition of range!          idat=1,ndat!          md1,md2,md3: Dimension of ZF!          md2proc=((md2-1)/nproc)+1  maximal number of small box 2nd dim slices for one proc!          nproc: number of processors used as returned by MPI_COMM_SIZE!          iproc: [0:nproc-1] number of processor as returned by MPI_COMM_RANK!!       PERFORMANCE CONSIDERATIONS:!       The maximum number of processors that can reasonably be used is max(n2/2,n3/2)!!       It is very important to find the optimal !       value of NCACHE. NCACHE determines the size of the work array ZW, that!       has to fit into cache. It has therefore to be chosen to equal roughly !	half the size of the physical cache in units of real*8 numbers.!       The optimal value of ncache can easily be determined by numerical !       experimentation. A too large value of ncache leads to a dramatic !       and sudden decrease of performance, a too small value to a to a !       slow and less dramatic decrease of performance. If NCACHE is set !       to a value so small, that not even a single one dimensional transform !       can be done in the workarray zw, the program stops with an error message.!!       RESTRICTIONS on USAGE!  Copyright (C) 2002-2005 Stefan Goedecker, CEA Grenoble!  This file is distributed under the terms of the!  GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .        implicit real*8 (a-h,o-z)#       if defined MPI_FFT        include 'mpif.h'#       endif! real space input        integer, parameter :: ddp = kind(1.d0)        REAL(DDP), DIMENSION(2,nd1,nd2,nd3proc,ndat) :: zr! Fourier space output        REAL(DDP), DIMENSION(2,md1,md3,md2proc,ndat) :: zf! work arrays for transpositions        REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:) :: zt! work arrays for MPI        REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi1        REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi2! cache work array        REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:) :: zw! FFT work arrays        REAL(DDP), ALLOCATABLE, DIMENSION(:,:) :: trig1,trig2,trig3        INTEGER, ALLOCATABLE, DIMENSION(:) :: after1,now1,before1, &                           after2,now2,before2,after3,now3,before3!$      interface!$        integer ( kind=4 ) function omp_get_num_threads ( )!$        end function omp_get_num_threads!$      end interface!$      interface!$        integer ( kind=4 ) function omp_get_thread_num ( )!$        end function omp_get_thread_num!$      end interface        write(6,*)' forw_wf : enter '        ! find cache size that gives optimal performance on machine        ncache=4*1024        if (ncache/(4*max(n1,n2,n3)).lt.1) then                      write(6,*) &                         ' ncache has to be enlarged to be able to hold at' // &                         'least one 1-d FFT of each size even though this will' // &                         'reduce the performance for shorter transform lengths'                       stop        endif	lock=0!$omp parallel  default(private) &!$omp shared(ndat,n1,n2,n3,nd1,nd2,nd3proc,md1,md2proc,md3,iproc,nproc,ncache,zr,zf,lock) &!$omp shared(max1,max2,max3,m1,m2,m3,icplexwf)        iam=0	npr=1!$      iam=omp_get_thread_num()!$      npr=omp_get_num_threads()!       Effective m1 and m2 (complex-to-complex or real-to-complex)        n1eff=n1 ; m2eff=m2 ; m1zt=n1        if(icplexwf==1)then         n1eff=(n1+1)/2 ; m2eff=m2/2+1 ; m1zt=2*(n1/2+1)        endif        lzt=m2eff        if (mod(m2eff,2).eq.0) lzt=lzt+1        if (mod(m2eff,4).eq.0) lzt=lzt+1        nnd3=nd3proc*nproc        ! maximal number of big box 3rd dim slices for all procs!$omp critical        allocate(trig1(2,2048),after1(7),now1(7),before1(7), &                 trig2(2,2048),after2(7),now2(7),before2(7), &                 trig3(2,2048),after3(7),now3(7),before3(7), &                 zw(2,ncache/4,2),zt(2,lzt,m1zt), &                 zmpi2(2,md1,md2proc,nnd3))	if (nproc.gt.1) allocate(zmpi1(2,md1,md2proc,nnd3))!$omp end critical	call ctrig(n2,trig2,after2,before2,now2,-1,ic2)        call ctrig(n1,trig1,after1,before1,now1,-1,ic1)        call ctrig(n3,trig3,after3,before3,now3,-1,ic3)!DEBUG!       write(6,'(a,3i4)' )'n1,n2,n3',n1,n2,n3!       write(6,'(a,3i4)' )'nd1,nd2,nd3proc',nd1,nd2,nd3proc!       write(6,'(a,3i4)' )'m1,m2,m3',m1,m2,m3!       write(6,'(a,3i4)' )'max1,max2,max3',max1,max2,max3!       write(6,'(a,3i4)' )'md1,md2proc,md3',md1,md2proc,md3!       write(6,'(a,3i4)' )'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt!       do i3=1,n3!        do i2=1,n2!         do i1=1,n1!          write(6, '(3i4,2es16.6)')i1,i2,i3,zr(1:2,i1,i2,i3)!         enddo!        enddo!       enddo!       stop!ENDDEBUG!$omp do	do 12345,idat=1,ndat	do 1212,j3=1,nd3proc	if (iproc*nd3proc+j3.le.n3) then	Jp2st=1	J2st=1        if(icplexwf==1)then         n1half=n1/2         do i2=1,n2          do i1=1,n1half           zr(1,i1,i2,j3,idat)=zr(1,2*i1-1,i2,j3,idat)           zr(2,i1,i2,j3,idat)=zr(1,2*i1  ,i2,j3,idat)          enddo         enddo!        If odd         if(n1half*2/=n1)then          do i2=1,n2           zr(1,n1eff,i2,j3,idat)=zr(1,n1,i2,j3,idat)           zr(2,n1eff,i2,j3,idat)=0.0d0          enddo         endif        endif! transform along y axis! input: i1,i2,j3,(jp3)        lot=ncache/(4*n2)        do 2000,j=1,n1eff,lot        ma=j        mb=min(j+(lot-1),n1eff)	n1dfft=mb-ma+1	i=1	call fftstp(nd1,n1dfft,nd2,lot,n2,zr(1,j,1,j3,idat),zw(1,1,1), &      	     trig2,after2(i),now2(i),before2(i),-1)	inzee=1	do i=2,ic2	call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &                    trig2,after2(i),now2(i),before2(i),-1)    	inzee=3-inzee	enddo!  input: i1,I2,j3,(jp3)        if(icplexwf==2)then         call unswitch_cent(n1dfft,max2,m2,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,j))        else         call unswitchreal_cent(n1dfft,max2,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,2*j-1))        endif! output: I2,i1,j3,(jp3)2000	continue!DEBUG!       write(6,*)' j3=1, zt ='!       do i1=1,m1zt!        do i2=1,lzt!         write(6, '(2i4,2es16.6)')i2,i1,zt(1:2,i2,i1)!        enddo!       enddo!       stop!ENDDEBUG!DEBUG!       write(6,*)' forwf_wf : after unswitch_cent, j3=',j3!ENDDEBUG! transform along x axis! input: I2,i1,j3,(jp3)        lot=ncache/(4*n1)        do 1000,j=1,m2eff,lot        ma=j        mb=min(j+(lot-1),m2eff)	n1dfft=mb-ma+1	i=1	call fftstp(lzt,n1dfft,m1zt,lot,n1,zt(1,j,1),zw(1,1,1), &      	     trig1,after1(i),now1(i),before1(i),-1)	inzee=1	do i=2,ic1	call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &                    trig1,after1(i),now1(i),before1(i),-1)    	inzee=3-inzee	enddo! output: I2,I1,j3,(jp3)! input: J2,Jp2,I1,j3,(jp3)        if (nproc.eq.1) then        call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&&         md2proc,nd3proc,nproc,zw(1,1,inzee),zmpi2)        else 	call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&&         md2proc,nd3proc,nproc,zw(1,1,inzee),zmpi1)! output: I1,J2,j3,Jp2,(jp3)        endif1000	continue        endif1212	continue!DEBUG!       write(6,*)' zmpi2 ='!       do i3=1,nnd3!        do i2=1,m2eff!         do i1=1,md1!          write(6, '(3i4,2es16.6)')i1,i2,i3,zmpi2(1:2,i1,i2,i3)!         enddo!        enddo!       enddo!       stop!ENDDEBUG!DEBUG!       write(6,*)' forwf_wf : interproc data transp'!ENDDEBUG! Interprocessor data transposition! intput: I1,J2,j3,Jp2,(jp3)        if (nproc.gt.1) then11      continue!$omp   flush(lock)		if (mod(lock,npr).ne.iam) goto 11#       if defined MPI_FFT        call MPI_ALLTOALL(zmpi1,n1*md2proc*nd3proc, &                          MPI_double_precision, &                          zmpi2,n1*md2proc*nd3proc, &                          MPI_double_precision,MPI_COMM_WORLD,ierr)#       endif         lock=lock+1	!$omp   flush(lock)		endif! output: I1,J2,j3,jp3,(Jp2)! transform along z axis! input: I1,J2,i3,(Jp2)        lot=ncache/(4*n3)        do 3333,j2=1,md2proc	if (iproc*md2proc+j2.le.m2eff) then!DEBUG!       write(6,*)' forwf_wf : before unscramble, j2,md2proc,iproc,m2=',j2,md2proc,iproc,m2!ENDDEBUG        do 3000,i1=1,m1,lot        ma=i1        mb=min(i1+(lot-1),m1)	n1dfft=mb-ma+1!   input: I1,J2,i3,(Jp2)        call unscramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zmpi2,zw(1,1,1))! output: I1,i3,J2,(Jp2)        inzee=1        do i=1,ic3        call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &        trig3,after3(i),now3(i),before3(i),-1)        inzee=3-inzee        enddo        call unfill_cent(md1,md3,lot,n1dfft,max3,m3,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat))! output: I1,I3,J2,(Jp2)3000	continue        endif3333    continue        if(icplexwf==1)then!        Complete missing values with complex conjugate!        Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1.         do i3=1,m3          i3inv=m3+2-i3          if(i3==1)i3inv=1          if(m2eff>1)then           do i2=2,m2eff            i2inv=m2+2-i2            zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat)            zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat)            do i1=2,m1             i1inv=m1+2-i1             zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat)             zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat)            enddo           enddo          endif         enddo        endif12345   continue!DEBUG!      write(6,*)' zf ='!      do i2=1,md2proc!       do i3=1,md3!        do i1=1,md1!         write(6, '(3i4,2es16.6)')i1,i3,i2,zf(1:2,i1,i3,i2)!        enddo!       enddo!      enddo!      stop!ENDDEBUG!$omp enddo        deallocate(trig1,after1,now1,before1, &                   trig2,after2,now2,before2, &                   trig3,after3,now3,before3, &                   zmpi2,zw,zt)	if (nproc.gt.1) deallocate(zmpi1)!$omp end parallel        write(6,*)' forw_wf : exit '	return	end
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -