📄 back_wf.f90

📁 基于linux操作下的fortran快速付立变换的程序
💻 F90
字号:
        subroutine back_wf(icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,&&        max1,max2,max3,m1,m2,m3,md1,md2proc,md3,nproc,iproc,zf,zr)! Does multiple 3-dim backward FFTs from Fourier into real space! Adopt standard convention that isign=1 for backward transform!	CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I2,I3)=!	S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2)!       in parallel using MPI/OpenMP.!	INPUT:!          ZF: input array (note the switch of i2 and i3)!		real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)!		imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)!          max1 is positive or zero ; m1 >=max1+1!          i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1!          then, if m1 > max1+1, one has min1=max1-m1+1 and!          i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1!          i2 and i3 have a similar definition of range!          idat=1,ndat!          md1,md2,md3: Dimension of ZF!          md2proc=((md2-1)/nproc)+1 ! maximal number of small box 2nd dim slices for one proc!	OUTPUT:!          ZR: output array!		ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))!		ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))!               i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat!          n1,n2,n3: logical dimension of the transform. As transform lengths!                    most products of the prime factors 2,3,5 are allowed.!                    The detailed table with allowed transform lengths can!                    be found in subroutine CTRIG!	   nd1,nd2,nd3: Dimension of ZR!          nd3proc=((nd3-1)/nproc)+1 ! maximal number of big box 3rd dim slices for one proc!          nproc: number of processors used as returned by MPI_COMM_SIZE!          iproc: [0:nproc-1] number of processor as returned by MPI_COMM_RANK!!       PERFORMANCE CONSIDERATIONS:!       The maximum number of processors that can reasonably be used is max(n2/2,n3/2)!!       It is very important to find the optimal !       value of NCACHE. NCACHE determines the size of the work array ZW, that!       has to fit into cache. It has therefore to be chosen to equal roughly !	half the size of the physical cache in units of real*8 numbers.!       The optimal value of ncache can easily be determined by numerical !       experimentation. A too large value of ncache leads to a dramatic !       and sudden decrease of performance, a too small value to a to a !       slow and less dramatic decrease of performance. If NCACHE is set !       to a value so small, that not even a single one dimensional transform !       can be done in the workarray zw, the program stops with an error message.!!       RESTRICTIONS on USAGE!  Copyright (C) 2002-2005 Stefan Goedecker, CEA Grenoble!  This file is distributed under the terms of the!  GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .        implicit real*8 (a-h,o-z)#       if defined MPI_FFT        include 'mpif.h'#       endif! real space input        integer, parameter :: ddp = kind(1.d0)        REAL(DDP), DIMENSION(2,nd1,nd2,nd3proc,ndat) :: zr! Fourier space output        REAL(DDP), DIMENSION(2,md1,md3,md2proc,ndat) :: zf! work arrays for transpositions        REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:) :: zt! work arrays for MPI        REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi1        REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:,:) :: zmpi2! cache work array        REAL(DDP), ALLOCATABLE, DIMENSION(:,:,:) :: zw! FFT work arrays        REAL(DDP), ALLOCATABLE, DIMENSION(:,:) :: trig1,trig2,trig3        INTEGER, ALLOCATABLE, DIMENSION(:) :: after1,now1,before1, &                           after2,now2,before2,after3,now3,before3!$      interface!$        integer ( kind=4 ) function omp_get_num_threads ( )!$        end function omp_get_num_threads!$      end interface!$      interface!$        integer ( kind=4 ) function omp_get_thread_num ( )!$        end function omp_get_thread_num!$      end interface        write(6,*)' back_wf : enter '! find cache size that gives optimal performance on machine        ncache=4*1024        if (ncache/(4*max(n1,n2,n3)).lt.1) then                      write(6,*) &                         ' ncache has to be enlarged to be able to hold at' // &                         'least one 1-d FFT of each size even though this will' // &                         'reduce the performance for shorter transform lengths'                       stop        endif	lock=0!$omp parallel  default(private) &!$omp shared(ndat,n1,n2,n3,nd1,nd2,nd3proc,md1,md2proc,md3,iproc,nproc,ncache,zr,zf,lock) &!$omp shared(max1,max2,max3,m1,m2,m3,icplexwf)	iam=0	npr=1!$      iam=omp_get_thread_num()!$      npr=omp_get_num_threads()!       Effective m1 and m2 (complex-to-complex or real-to-complex)        n1eff=n1 ; m2eff=m2 ; m1zt=n1        if(icplexwf==1)then         n1eff=(n1+1)/2 ; m2eff=m2/2+1 ; m1zt=2*(n1/2+1)        endif        lzt=m2eff        if (mod(m2eff,2).eq.0) lzt=lzt+1        if (mod(m2eff,4).eq.0) lzt=lzt+1        nnd3=nd3proc*nproc        ! maximal number of big box 3rd dim slices for all procs!$omp critical        allocate(trig1(2,2048),after1(7),now1(7),before1(7), &                 trig2(2,2048),after2(7),now2(7),before2(7), &                 trig3(2,2048),after3(7),now3(7),before3(7), &                 zw(2,ncache/4,2),zt(2,lzt,m1zt), &                 zmpi2(2,md1,md2proc,nnd3))	if (nproc.gt.1) allocate(zmpi1(2,md1,md2proc,nnd3))!$omp end critical        call ctrig(n3,trig3,after3,before3,now3,1,ic3)        call ctrig(n1,trig1,after1,before1,now1,1,ic1)        call ctrig(n2,trig2,after2,before2,now2,1,ic2)!DEBUG!       write(6,'(a,3i4)' )'n1,n2,n3',n1,n2,n3!       write(6,'(a,3i4)' )'nd1,nd2,nd3proc',nd1,nd2,nd3proc!       write(6,'(a,3i4)' )'m1,m2,m3',m1,m2,m3!       write(6,'(a,3i4)' )'max1,max2,max3',max1,max2,max3!       write(6,'(a,3i4)' )'md1,md2proc,md3',md1,md2proc,md3!       write(6,'(a,3i4)' )'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt!       do i2=1,md2proc!        do i3=1,md3!         do i1=1,md1!          write(6, '(3i4,2es16.6)')i1,i3,i2,zf(1:2,i1,i3,i2)!         enddo!        enddo!       enddo!       stop!ENDDEBUG!$omp do	do 12345,idat=1,ndat! transform along z axis! input: I1,I3,J2,(Jp2)        lot=ncache/(4*n3)        do 3333,j2=1,md2proc	if (iproc*md2proc+j2.le.m2eff) then        do 3000,i1=1,m1,lot        ma=i1        mb=min(i1+(lot-1),m1)	n1dfft=mb-ma+1!  input: I1,I3,J2,(Jp2)        call fill_cent(md1,md3,lot,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw(1,1,1))	inzee=1	do i=1,ic3	call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &                    trig3,after3(i),now3(i),before3(i),1)    	inzee=3-inzee	enddo!  input: I1,i3,J2,(Jp2)	call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2)!  output: I1,J2,i3,(Jp2)3000	continue	endif3333	continue! Interprocessor data transposition! input: I1,J2,j3,jp3,(Jp2)        if (nproc.gt.1) then11	continue!$omp   flush(lock)		if (mod(lock,npr).ne.iam) goto 11#       if defined MPI_FFT        call MPI_ALLTOALL(zmpi2,md1*md2proc*nd3proc, &                          MPI_double_precision, &                          zmpi1,md1*md2proc*nd3proc, &                          MPI_double_precision,MPI_COMM_WORLD,ierr)#       endif        lock=lock+1	!$omp   flush(lock)	! output: I1,J2,j3,Jp2,(jp3)        endif!DEBUG!       write(6,*)' zmpi2 ='!       do i3=1,nnd3!        do i2=1,m2eff!         do i1=1,md1!          write(6, '(3i4,2es16.6)')i1,i2,i3,zmpi2(1:2,i1,i2,i3)!         enddo!        enddo!       enddo!       stop!ENDDEBUG	do 1212,j3=1,nd3proc	if (iproc*nd3proc+j3.le.n3) then	Jp2st=1	J2st=1! transform along x axis        lot=ncache/(4*n1)        do 1000,j=1,m2eff,lot        ma=j        mb=min(j+(lot-1),m2eff)	n1dfft=mb-ma+1! input: I1,J2,j3,Jp2,(jp3)        if (nproc.eq.1) then        call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&&         md2proc,nd3proc,nproc,zmpi2,zw(1,1,1))        else 	call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&&         md2proc,nd3proc,nproc,zmpi1,zw(1,1,1))        endif! output: J2,Jp2,I1,j3,(jp3)! input: I2,I1,j3,(jp3)	inzee=1	do i=1,ic1-1	call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &                    trig1,after1(i),now1(i),before1(i),1)    	inzee=3-inzee	enddo	i=ic1	call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), &       	     trig1,after1(i),now1(i),before1(i),1)! output: I2,i1,j3,(jp3)1000	continue!DEBUG!       write(6,*)' j3=1, zt ='!       do i1=1,m1zt!        do i2=1,lzt!         write(6, '(2i4,2es16.6)')i2,i1,zt(1:2,i2,i1)!        enddo!       enddo!       stop!ENDDEBUG! transform along y axis        lot=ncache/(4*n2)        do 2000,j=1,n1eff,lot        ma=j        mb=min(j+(lot-1),n1eff)	n1dfft=mb-ma+1        includelast=1        if(icplexwf==1)then         jeff=2*j-1         if(mb==n1eff .and. n1eff*2/=n1)includelast=0        endif!  input: I2,i1,j3,(jp3)        if(icplexwf==2)then         call switch_cent(n1dfft,max2,m2,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1))        else         call switchreal_cent(includelast,n1dfft,max2,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1))        endif!DEBUG!       write(6,*)' j3=1, after switchreal_cent, includelast=',includelast!       do i1=1,n1dfft*n2!        write(6, '(2i4,2es16.6)')i1,i1,zw(1:2,i1,1)!       enddo!       stop!ENDDEBUG! output: i1,I2,j3,(jp3)	inzee=1	do i=1,ic2-1	call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &                    trig2,after2(i),now2(i),before2(i),1)    	inzee=3-inzee	enddo	i=ic2	call fftstp(lot,n1dfft,n2,nd1,nd2,zw(1,1,inzee),zr(1,j,1,j3,idat), &      	     trig2,after2(i),now2(i),before2(i),1)2000	continue! output: i1,i2,j3,(jp3)!DEBUG!       write(6,*)' j3=1, before rearrangement, zr ='!       do i1=1,n1!        do i2=1,n2!         write(6, '(2i4,2es16.6)')i1,i2,zr(1:2,i1,i2,j3,idat)!        enddo!       enddo!       stop!ENDDEBUG        if(icplexwf==1)then         n1half=n1/2!        If odd         if(n1half*2/=n1)then          do i2=1,n2           zr(1,n1,i2,j3,idat)=zr(1,n1eff,i2,j3,idat)            zr(2,n1,i2,j3,idat)=0.0d0          enddo         endif         do i2=1,n2          do i1=n1half,1,-1           zr(1,2*i1-1,i2,j3,idat)=zr(1,i1,i2,j3,idat)           zr(1,2*i1  ,i2,j3,idat)=zr(2,i1,i2,j3,idat)           zr(2,2*i1-1,i2,j3,idat)=0.0d0           zr(2,2*i1  ,i2,j3,idat)=0.0d0          enddo         enddo        endif!DEBUG!       write(6,*)' j3=1, zr ='!       do i1=1,n1!        do i2=1,n2!         write(6, '(2i4,2es16.6)')i1,i2,zr(1:2,i1,i2,j3,idat)!        enddo!       enddo!       stop!ENDDEBUG        endif1212	continue12345   continue!$omp enddo        deallocate(trig1,after1,now1,before1, &                   trig2,after2,now2,before2, &                   trig3,after3,now3,before3, &                   zmpi2,zw,zt)	if (nproc.gt.1) deallocate(zmpi1)!$omp end parallel !DEBUG!      write(6,*)' back_wf : exit '!      do i3=1,n3!       do i2=1,n2!        do i1=1,n1!         write(6, '(3i4,2es16.6)')i1,i2,i3,zr(1:2,i1,i2,i3,1)!        enddo!       enddo!      enddo!ENDDEBUG	return	end
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -