📄 init.f90
字号:
! f90 -O5 -arch ev67 -omp -lmpi -ldxml testall.f90 Implicit real*8 (a-h,o-z) logical parallel integer count1,count2,count_rate,count_max parameter(parallel=.true.,ndat=64,ntime=2) parameter(itest=3) include 'mpif.h' dimension tuma(2),suma(2)! distributed Fourier space wavefunction REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:,:) :: zf! distributed real space wavefunction REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:,:,:) :: zr! distributed real space potential REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: pot,rho!$ interface!$ integer ( kind=4 ) function omp_get_num_threads ( )!$ end function omp_get_num_threads!$ end interface!$ interface!$ integer ( kind=4 ) function omp_get_thread_num ( )!$ end function omp_get_thread_num!$ end interface INTEGER, DIMENSION(7,104) :: idata data ((idata(i,j),i=1,7),j=1,104) / & 3, 3, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1, 1, & 5, 5, 1, 1, 1, 1, 1, 6, 6, 1, 1, 1, 1, 1, & 8, 8, 1, 1, 1, 1, 1, 9, 3, 3, 1, 1, 1, 1, & 10, 5, 2, 1, 1, 1, 1, & 12, 4, 3, 1, 1, 1, 1, 15, 5, 3, 1, 1, 1, 1, & 16, 4, 4, 1, 1, 1, 1, 18, 6, 3, 1, 1, 1, 1, & 20, 5, 4, 1, 1, 1, 1, 24, 8, 3, 1, 1, 1, 1, & 25, 5, 5, 1, 1, 1, 1, 27, 3, 3, 3, 1, 1, 1, & 30, 6, 5, 1, 1, 1, 1, 32, 8, 4, 1, 1, 1, 1, & 36, 4, 3, 3, 1, 1, 1, 40, 8, 5, 1, 1, 1, 1, & 45, 5, 3, 3, 1, 1, 1, 48, 4, 4, 3, 1, 1, 1, & 50, 5, 5, 2, 1, 1, 1, & 54, 6, 3, 3, 1, 1, 1, 60, 5, 4, 3, 1, 1, 1, & 64, 8, 8, 1, 1, 1, 1, 72, 8, 3, 3, 1, 1, 1, & 75, 5, 5, 3, 1, 1, 1, 80, 5, 4, 4, 1, 1, 1, & 81, 3, 3, 3, 3, 1, 1, 90, 6, 5, 3, 1, 1, 1, & 96, 8, 4, 3, 1, 1, 1, 100, 5, 5, 4, 1, 1, 1, & 108, 4, 3, 3, 3, 1, 1, 120, 8, 5, 3, 1, 1, 1, & 125, 5, 5, 5, 1, 1, 1, 128, 8, 4, 4, 1, 1, 1, & 135, 5, 3, 3, 3, 1, 1, 144, 6, 8, 3, 1, 1, 1, & 150, 6, 5, 5, 1, 1, 1, 160, 8, 5, 4, 1, 1, 1, & 162, 6, 3, 3, 3, 1, 1, 180, 5, 4, 3, 3, 1, 1, & 192, 6, 8, 4, 1, 1, 1, 200, 8, 5, 5, 1, 1, 1, & 216, 8, 3, 3, 3, 1, 1, 225, 5, 5, 3, 3, 1, 1, & 240, 6, 8, 5, 1, 1, 1, 243, 3, 3, 3, 3, 3, 1, & 250, 5, 5, 5, 2, 1, 1, & 256, 8, 8, 4, 1, 1, 1, 270, 6, 5, 3, 3, 1, 1, & 288, 8, 4, 3, 3, 1, 1, 300, 5, 5, 4, 3, 1, 1, & 320, 5, 4, 4, 4, 1, 1, 324, 4, 3, 3, 3, 3, 1, & 360, 8, 5, 3, 3, 1, 1, 375, 5, 5, 5, 3, 1, 1, & 384, 8, 4, 4, 3, 1, 1, 400, 5, 5, 4, 4, 1, 1, & 405, 5, 3, 3, 3, 3, 1, 432, 4, 4, 3, 3, 3, 1, & 450, 6, 5, 5, 3, 1, 1, 480, 8, 5, 4, 3, 1, 1, & 486, 6, 3, 3, 3, 3, 1, 500, 5, 5, 5, 4, 1, 1, & 512, 8, 8, 8, 1, 1, 1, 540, 5, 4, 3, 3, 3, 1, & 576, 4, 4, 4, 3, 3, 1, 600, 8, 5, 5, 3, 1, 1, & 625, 5, 5, 5, 5, 1, 1, 640, 8, 5, 4, 4, 1, 1, & 648, 8, 3, 3, 3, 3, 1, 675, 5, 5, 3, 3, 3, 1, & 720, 5, 4, 4, 3, 3, 1, 729, 3, 3, 3, 3, 3, 3, & 750, 6, 5, 5, 5, 1, 1, 768, 4, 4, 4, 4, 3, 1, & 800, 8, 5, 5, 4, 1, 1, 810, 6, 5, 3, 3, 3, 1, & 864, 8, 4, 3, 3, 3, 1, 900, 5, 5, 4, 3, 3, 1, & 960, 5, 4, 4, 4, 3, 1, 972, 4, 3, 3, 3, 3, 3, & 1000, 8, 5, 5, 5, 1, 1, 1024, 4, 4, 4, 4, 4, 1, & 1080, 8, 5, 3, 3, 3, 1, & 1152, 8, 4, 4, 3, 3, 1, 1200, 5, 5, 4, 4, 3, 1, & 1250, 5, 5, 5, 5, 2, 1, 1280, 5, 4, 4, 4, 4, 1, & 1296, 4, 4, 3, 3, 3, 3, 1350, 6, 5, 5, 3, 3, 1, & 1440, 6, 5, 4, 4, 3, 1, 1458, 6, 3, 3, 3, 3, 3, & 1500, 5, 5, 5, 4, 3, 1, 1536, 8, 4, 4, 4, 3, 1, & 1600, 8, 8, 5, 5, 1, 1, 1620, 5, 4, 3, 3, 3, 3, & 1728, 8, 8, 3, 3, 3, 1, 1800, 8, 5, 5, 3, 3, 1, & 1920, 8, 5, 4, 4, 3, 1, 1944, 8, 3, 3, 3, 3, 3, & 2000, 5, 5, 5, 4, 4, 1, 2048, 8, 4, 4, 4, 4, 1 /! Start MPI in parallel version if (parallel) then call MPI_INIT(ierr) call MPI_COMM_RANK(MPI_COMM_WORLD,iproc,ierr) call MPI_COMM_SIZE(MPI_COMM_WORLD,nproc,ierr)! write(6,*) 'mpi started',iproc,nproc else nproc=1 iproc=0 endif! Start OpenMP npr=1!$omp parallel default(private) shared(npr)!$ iam=omp_get_thread_num()!$ if (iam.eq.0) npr=omp_get_num_threads()!$omp end parallel if (iproc.eq.0) then open(unit=66,file='results',status='unknown') write(66,*) ' nproc ',nproc,' nthreads ',npr write(66,*) ' ndat ',ndat,' ntime ',ntime endif!! Dimension m1,m2,m3 of the Fourier space cube !! (Real space dimensions n1,n2,n3 are twice as large)!! 1) Hard coded dimensions m1=63 ; m2=63 ; m3=63 !! 2) Read dimensions! if (iproc.eq.0) then! write(6,*) 'm1=? , m2=? , m3=?'! read*, m1 , m2 , m3! endif! if (parallel) then! call MPI_BCAST(m1,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)! call MPI_BCAST(m2,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)! call MPI_BCAST(m3,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)! write(6,*) iproc, m1 , m2 , m3! endif!! 3) loop over all possible dimensions! do 9999,k1=1,104! m1=idata(1,k1)! do 9999,k2=1,104! m2=idata(1,k2)! do 9999,k3=1,104! m3=idata(1,k3)! if (m1*m2*m3.le.64**3 .and. max(m1,m2,m3).le.128) then! if (iproc.eq.0) then! write(6,*) ' '! write(6,*) 'k1,k2,k3',k1,k2,k3! write(6,*) 'm1,m2,m3',m1,m2,m3! write(6,*) ' '! endif! Fourier space grid dimensions (suitable for number of processors) md1=m1+0; md3=m3+0 md2=m2+0150 if (nproc*(md2/nproc).lt.m2) then md2=md2+1 goto 150 endif! real space grid dimension (suitable for number of processors) if (itest.eq.1) then n1=m1; n2=m2; n3=m3 else n1=2*m1; n2=2*m2; n3=2*m3 endif! add buffer to avoid cache trashing nd1=n1+0; nd2=n2+0 nd3=n3+0250 if (nproc*(nd3/nproc).lt.n3) then nd3=nd3+1 goto 250 endif if (itest.eq.1) then nd2=md2 md3=nd3 md1=nd1 endif if (itest.eq.1) then if (iproc.eq.0) write(6,*) 'testing ordinary FFT' allocate(zr(2,nd1,nd2,nd3/nproc,ndat),zf(2,nd1,nd3,nd2/nproc,ndat)) else if (itest.eq.2) then if (iproc.eq.0) write(6,*) 'testing wavefunction FFT' allocate(zr(2,nd1,nd2,nd3/nproc,ndat),zf(2,md1,md3,md2/nproc,ndat)) else if (itest.eq.3) then if (iproc.eq.0) write(6,*) 'testing APPLYPOT' allocate(zf(2,md1,md3,md2/nproc,ndat),pot(nd1,nd2,nd3/nproc)) do 68,i3=1,nd3/nproc do 68,i2=1,nd2 do 68,i1=1,nd168 pot(i1,i2,i3)=1.d0 else if (itest.eq.4) then if (iproc.eq.0) write(6,*) 'testing Accumulation of rho' allocate(zf(2,md1,md3,md2/nproc,ndat),rho(nd1,nd2,nd3/nproc)) else if (iproc.eq.0) write(6,*) 'UNDEFINED itest' if (nproc.gt.1) call MPI_ABORT(MPI_COMM_WORLD,ierr) stop endif if (iproc.eq.0) then Print*,' ' Print*,' '46 format(a,3(2x,i4)) write(6,46) 'FFT TEST for n1 ,n2 ,n3 =',n1,n2,n3 write(6,46) ' nd1 ,nd2 ,nd3 =',nd1,nd2,nd3 write(6,46) ' md1 ,md2 ,md3 =',md1,md2,md3 write(6,*) ' nproc ',nproc,' nthreads ',npr write(6,*) ' ndat ',ndat,' ntime ',ntime Print*,' ' endif do idat=1,ndat call initf(idat,m1,m2,m3,md1,md2,md3,iproc,nproc,zf(1,1,1,1,idat)) enddo!Correctness if (itest.eq.1) then icplex=2 call back(icplex,ndat,n1,n2,n3,nd1,nd2,nd3,nproc,iproc,zf,zr)! call slice(nproc,iproc,n1,n2,n3,n1,n2,n3,nd1,nd2,nd3,nd1,nd2,nd3,zf,zr) call forw(icplex,ndat,n1,n2,n3,nd1,nd2,nd3,nproc,iproc,zr,zf) else if (itest.eq.2) then call back_wf(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,zf,zr)! call slice(nproc,iproc,m1,m2,m3,n1,n2,n3,md1,md2,md3,nd1,nd2,nd3,zf,zr) call forw_wf(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,zr,zf) else if (itest.eq.3) then call applypot(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,pot,zf) else if (itest.eq.4) then call accrho(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,zf,rho) endif scal=1.d0/(n1*n2*n3) if (itest.lt.4) then do idat=1,ndat call vgl(idat,scal,m1,m2,m3,md1,md2,md3,iproc,nproc,zf(1,1,1,1,idat),tum) if (nproc.gt.1) then call MPI_REDUCE(tum,sum,1,MPI_double_precision, & MPI_SUM,0,MPI_COMM_WORLD,ierr) else sum=tum endif if (iproc.eq.0) then write(6,*) 'error back-forward',idat,sqrt(sum) if (sqrt(sum).gt.1.d-10) then write(6,*) 'ALGEBRAIC ERROR' if (nproc.gt.1) call MPI_ABORT(MPI_COMM_WORLD,ierr) stop endif endif enddo else call fnorm(ndat,m1,m2,m3,md1,md2,md3,iproc,nproc,zf,tuma(1)) call rnorm(n1,n2,n3,nd1,nd2,nd3,iproc,nproc,rho,tuma(2)) if (nproc.gt.1) then call MPI_REDUCE(tuma,suma,2,MPI_double_precision, & MPI_SUM,0,MPI_COMM_WORLD,ierr) else suma(1)=tuma(1) suma(2)=tuma(2) endif if (iproc.eq.0) then tt=abs(1.d0-scal*suma(2)/suma(1)) write(6,*) 'rel DIFF. FOURIER, REAL NORM',tt if (tt.gt.1.d-10) then write(6,*) 'ALGEBRAIC ERROR' if (nproc.gt.1) call MPI_ABORT(MPI_COMM_WORLD,ierr) stop endif endif endif if (nproc.gt.1) call MPI_BARRIER(MPI_COMM_WORLD,ierr)! Timings call cpu_time(t1) call system_clock(count1,count_rate,count_max) if (itest.eq.1) then do irep=1,ntime icplex=2 call back(icplex,ndat,n1,n2,n3,nd1,nd2,nd3,nproc,iproc,zf,zr) call forw(icplex,ndat,n1,n2,n3,nd1,nd2,nd3,nproc,iproc,zr,zf) enddo else if (itest.eq.2) then do irep=1,ntime call back_wf(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,zf,zr) call forw_wf(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,zr,zf) enddo else if (itest.eq.3) then do irep=1,ntime call applypot(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,pot,zf) enddo else if (itest.eq.4) then do irep=1,ntime call accrho(ndat,n1,n2,n3,nd1,nd2,nd3,md1,md2,md3,nproc,iproc,zf,rho) enddo endif call cpu_time(t2) call system_clock(count2,count_rate,count_max) time=(t2-t1)/(ntime*ndat) tela=(count2-count1)/(float(count_rate)*ntime*ndat) write(6,'(a,x,i4,2(x,e10.3))') 'iproc, CPU, elapsed time for forw + backw', & iproc,time,tela if (time*tela.ne.0.d0) then if (itest.eq.1) then flops=10.d0*(n1*n2*n3)* & (log(1.d0*n2)/log(2.d0)+log(1.d0*n1)/log(2.d0)+log(1.d0*n3)/log(2.d0)) else if (itest.eq.2) then flops=(10.d0/4.d0)*(n1*n2*n3)* & (log(1.d0*n2)/log(2.d0)+2*log(1.d0*n1)/log(2.d0)+4*log(1.d0*n3)/log(2.d0)) else if (itest.eq.3) then flops=2*n1*n2*n3 + (10.d0/4.d0)*(n1*n2*n3)* & (log(1.d0*n2)/log(2.d0)+2*log(1.d0*n1)/log(2.d0)+4*log(1.d0*n3)/log(2.d0)) else if (itest.eq.4) then flops=4*n1*n2*n3 + (5.d0/4.d0)*(n1*n2*n3)* & (log(1.d0*n2)/log(2.d0)+2*log(1.d0*n1)/log(2.d0)+4*log(1.d0*n3)/log(2.d0)) endif if (iproc.eq.0) write(6,'(a,x,i4,(x,e12.5))') & 'iproc, Flops forw + backw',iproc,flops if (iproc.eq.0) write(6,'(a,2(x,f10.0))') 'speed (Mflops)', & 1.d-6*flops/time,1.d-6*flops/tela if (iproc.eq.0) write(66,'(i4,3(x,i4),2(x,e10.3),(x,f10.0))') & iproc,m1,m2,m3,time,tela,1.d-6*flops/tela endif if (itest.eq.1) then deallocate(zr,zf) else if (itest.eq.2) then deallocate(zr,zf) else if (itest.eq.3) then deallocate(zf,pot) else if (itest.eq.4) then deallocate(zf,rho) endif! endif!9999 continue if (nproc.gt.1) call MPI_FINALIZE(ierr) end
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -