nb_kernel130_x86_64_sse2.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 1,392 行 · 第 1/3 页

S
1,392
字号
        ## -------------------------------------------------------------------
        ## Outer-loop tail and twice-unrolled inner loop (x86-64, AT&T syntax).
        ##
        ## This is a fragment: the routine starts before this point. On entry
        ## %rbx, %rsi and %xmm0-%xmm3 carry values produced by that earlier code.
        ## NOTE(review): presumably rbx = 3*i (it is stored as ii3), rsi = n,
        ## xmm0-xmm2 = shift vector and xmm3 = i-particle charge term (it is
        ## stored as iq) -- confirm against the code above this fragment.
        ##
        ## One statement per line is required here: '##' starts a comment that
        ## runs to end of line, and every label must begin its own statement.
        ## -------------------------------------------------------------------
        movq    nb130nf_pos(%rbp), %rax             ## rax = base of pos[]
        addsd   (%rax,%rbx,8), %xmm0                ## xmm0 += pos[rbx+0]
        addsd   8(%rax,%rbx,8), %xmm1               ## xmm1 += pos[rbx+1]
        addsd   16(%rax,%rbx,8), %xmm2              ## xmm2 += pos[rbx+2]

        movapd  %xmm3, nb130nf_iq(%rsp)

        ## broadcast the low double of each coordinate into both lanes
        shufpd  $0, %xmm0, %xmm0
        shufpd  $0, %xmm1, %xmm1
        shufpd  $0, %xmm2, %xmm2

        movapd  %xmm0, nb130nf_ix(%rsp)
        movapd  %xmm1, nb130nf_iy(%rsp)
        movapd  %xmm2, nb130nf_iz(%rsp)

        movl    %ebx, nb130nf_ii3(%rsp)

        ## clear vctot and Vvdwtot accumulators
        xorpd   %xmm4, %xmm4
        movapd  %xmm4, nb130nf_vctot(%rsp)
        movapd  %xmm4, nb130nf_Vvdwtot(%rsp)

        movq    nb130nf_jindex(%rsp), %rax
        movl    (%rax,%rsi,4), %ecx                 ## ecx = jindex[n]
        movl    4(%rax,%rsi,4), %edx                ## edx = jindex[n+1]
        subl    %ecx, %edx                          ## edx = number of innerloop atoms

        movq    nb130nf_pos(%rbp), %rsi
        movq    nb130nf_faction(%rbp), %rdi
        movq    nb130nf_jjnr(%rsp), %rax
        shll    $2, %ecx                            ## byte offset = 4*jindex[n]
        addq    %rcx, %rax
        movq    %rax, nb130nf_innerjjnr(%rsp)       ## pointer to jjnr[nj0]
        movl    %edx, %ecx
        subl    $2, %edx                            ## edx = atoms - 2
        addl    nb130nf_ninner(%rsp), %ecx
        movl    %ecx, nb130nf_ninner(%rsp)          ## ninner += number of atoms
        addl    $0, %edx                            ## re-derive flags from edx (the addl above clobbered them)
        movl    %edx, nb130nf_innerk(%rsp)          ## innerk = atoms - 2 (movl leaves flags intact)
        jge     _nb_kernel130nf_x86_64_sse2.nb130nf_unroll_loop     ## at least two atoms left
        jmp     _nb_kernel130nf_x86_64_sse2.nb130nf_checksingle

_nb_kernel130nf_x86_64_sse2.nb130nf_unroll_loop:
        ## twice unrolled innerloop: handles j atoms a (low lane) and b (high lane)
        movq    nb130nf_innerjjnr(%rsp), %rdx       ## pointer to jjnr[k]
        movl    (%rdx), %eax                        ## eax = jnr a
        movl    4(%rdx), %ebx                       ## ebx = jnr b
        addq    $8, nb130nf_innerjjnr(%rsp)         ## advance pointer (unrolled 2)

        movq    nb130nf_charge(%rbp), %rsi          ## base of charge[]
        movlpd  (%rsi,%rax,8), %xmm3
        movhpd  (%rsi,%rbx,8), %xmm3

        movapd  nb130nf_iq(%rsp), %xmm5
        mulpd   %xmm5, %xmm3                        ## xmm3 = qq

        movd    %eax, %mm0                          ## use mmx registers as temp storage for jnr a/b
        movd    %ebx, %mm1

        movq    nb130nf_type(%rbp), %rsi
        movl    (%rsi,%rax,4), %eax                 ## eax = type[jnr a]
        movl    (%rsi,%rbx,4), %ebx                 ## ebx = type[jnr b]
        movq    nb130nf_vdwparam(%rbp), %rsi
        shll    %eax                                ## eax *= 2
        shll    %ebx                                ## ebx *= 2
        movl    nb130nf_ntia(%rsp), %edi
        addl    %edi, %eax
        addl    %edi, %ebx

        movlpd  (%rsi,%rax,8), %xmm6                ## c6a
        movlpd  (%rsi,%rbx,8), %xmm7                ## c6b
        movhpd  8(%rsi,%rax,8), %xmm6               ## c6a c12a
        movhpd  8(%rsi,%rbx,8), %xmm7               ## c6b c12b
        movapd  %xmm6, %xmm4
        unpcklpd %xmm7, %xmm4                       ## xmm4 = c6a  c6b
        unpckhpd %xmm7, %xmm6                       ## xmm6 = c12a c12b

        movd    %mm0, %eax                          ## restore jnr a
        movd    %mm1, %ebx                          ## restore jnr b
        movapd  %xmm4, nb130nf_c6(%rsp)
        movapd  %xmm6, nb130nf_c12(%rsp)

        movq    nb130nf_pos(%rbp), %rsi             ## base of pos[]

        lea     (%rax,%rax,2), %rax                 ## replace jnr with j3
        lea     (%rbx,%rbx,2), %rbx

        ## move two coordinates to xmm0-xmm2
        movlpd  (%rsi,%rax,8), %xmm0
        movlpd  8(%rsi,%rax,8), %xmm1
        movlpd  16(%rsi,%rax,8), %xmm2
        movhpd  (%rsi,%rbx,8), %xmm0
        movhpd  8(%rsi,%rbx,8), %xmm1
        movhpd  16(%rsi,%rbx,8), %xmm2

        ## move ix-iz to xmm4-xmm6
        movapd  nb130nf_ix(%rsp), %xmm4
        movapd  nb130nf_iy(%rsp), %xmm5
        movapd  nb130nf_iz(%rsp), %xmm6

        ## calc dr
        subpd   %xmm0, %xmm4
        subpd   %xmm1, %xmm5
        subpd   %xmm2, %xmm6

        ## square it
        mulpd   %xmm4, %xmm4
        mulpd   %xmm5, %xmm5
        mulpd   %xmm6, %xmm6
        addpd   %xmm5, %xmm4
        addpd   %xmm6, %xmm4
        ## rsq in xmm4

        ## single-precision reciprocal-sqrt estimate, widened back to double
        cvtpd2ps %xmm4, %xmm5
        rsqrtps %xmm5, %xmm5
        cvtps2pd %xmm5, %xmm2                       ## lu in xmm2

        ## refinement step 1: lu' = half * lu * (three - rsq*lu*lu)
        movapd  %xmm2, %xmm5                        ## copy of lu
        mulpd   %xmm2, %xmm2                        ## lu*lu
        movapd  nb130nf_three(%rsp), %xmm1
        mulpd   %xmm4, %xmm2                        ## rsq*lu*lu
        movapd  nb130nf_half(%rsp), %xmm0
        subpd   %xmm2, %xmm1                        ## three - rsq*lu*lu
        mulpd   %xmm5, %xmm1
        mulpd   %xmm0, %xmm1                        ## xmm1 = iter1 of rinv (new lu)

        ## refinement step 2, same formula applied to the new lu
        movapd  %xmm1, %xmm5                        ## copy of lu
        mulpd   %xmm1, %xmm1                        ## lu*lu
        movapd  nb130nf_three(%rsp), %xmm2
        mulpd   %xmm4, %xmm1                        ## rsq*lu*lu
        movapd  nb130nf_half(%rsp), %xmm0
        subpd   %xmm1, %xmm2                        ## three - rsq*lu*lu
        mulpd   %xmm5, %xmm2
        mulpd   %xmm2, %xmm0                        ## xmm0 = rinv

        movapd  %xmm0, %xmm6
        mulpd   %xmm3, %xmm6                        ## vcoul = qq*rinv
        addpd   nb130nf_vctot(%rsp), %xmm6
        movapd  %xmm6, nb130nf_vctot(%rsp)

        ## LJ table interaction. xmm0=rinv, xmm4=rsq
        mulpd   %xmm0, %xmm4                        ## xmm4 = r
        mulpd   nb130nf_tsc(%rsp), %xmm4            ## scale r by tsc

        cvttpd2pi %xmm4, %mm6                       ## mm6 = lu idx (two truncated 32-bit ints)
        cvtpi2pd %mm6, %xmm5
        subpd   %xmm5, %xmm4
        movapd  %xmm4, %xmm1                        ## xmm1 = eps
        movapd  %xmm1, %xmm2
        mulpd   %xmm2, %xmm2                        ## xmm2 = eps2
        pslld   $3, %mm6                            ## idx *= 8

        movq    nb130nf_VFtab(%rbp), %rsi
        movd    %mm6, %eax                          ## eax = table index a
        psrlq   $32, %mm6
        movd    %mm6, %ebx                          ## ebx = table index b

        ## dispersion
        movlpd  (%rsi,%rax,8), %xmm4                ## Y1
        movlpd  (%rsi,%rbx,8), %xmm3                ## Y2
        movhpd  8(%rsi,%rax,8), %xmm4               ## Y1 F1
        movhpd  8(%rsi,%rbx,8), %xmm3               ## Y2 F2
        movapd  %xmm4, %xmm5
        unpcklpd %xmm3, %xmm4                       ## Y1 Y2
        unpckhpd %xmm3, %xmm5                       ## F1 F2

        movlpd  16(%rsi,%rax,8), %xmm6              ## G1
        movlpd  16(%rsi,%rbx,8), %xmm3              ## G2
        movhpd  24(%rsi,%rax,8), %xmm6              ## G1 H1
        movhpd  24(%rsi,%rbx,8), %xmm3              ## G2 H2
        movapd  %xmm6, %xmm7
        unpcklpd %xmm3, %xmm6                       ## G1 G2
        unpckhpd %xmm3, %xmm7                       ## H1 H2
        ## dispersion table ready, in xmm4-xmm7

        ## VV = Y + eps*(F + G*eps + H*eps2)
        mulpd   %xmm1, %xmm6                        ## xmm6 = Geps
        mulpd   %xmm2, %xmm7                        ## xmm7 = Heps2
        addpd   %xmm6, %xmm5
        addpd   %xmm7, %xmm5                        ## xmm5 = Fp
        mulpd   %xmm1, %xmm5                        ## xmm5 = eps*Fp
        addpd   %xmm4, %xmm5                        ## xmm5 = VV

        movapd  nb130nf_c6(%rsp), %xmm4
        mulpd   %xmm4, %xmm5                        ## Vvdw6

        ## update Vvdwtot directly
        addpd   nb130nf_Vvdwtot(%rsp), %xmm5
        movapd  %xmm5, nb130nf_Vvdwtot(%rsp)

        ## repulsion
        movlpd  32(%rsi,%rax,8), %xmm4              ## Y1
        movlpd  32(%rsi,%rbx,8), %xmm3              ## Y2
        movhpd  40(%rsi,%rax,8), %xmm4              ## Y1 F1
        movhpd  40(%rsi,%rbx,8), %xmm3              ## Y2 F2

        movapd  %xmm4, %xmm5
        unpcklpd %xmm3, %xmm4                       ## Y1 Y2
        unpckhpd %xmm3, %xmm5                       ## F1 F2

        movlpd  48(%rsi,%rax,8), %xmm6              ## G1
        movlpd  48(%rsi,%rbx,8), %xmm3              ## G2
        movhpd  56(%rsi,%rax,8), %xmm6              ## G1 H1
        movhpd  56(%rsi,%rbx,8), %xmm3              ## G2 H2

        movapd  %xmm6, %xmm7
        unpcklpd %xmm3, %xmm6                       ## G1 G2
        unpckhpd %xmm3, %xmm7                       ## H1 H2
        ## table ready, in xmm4-xmm7

        mulpd   %xmm1, %xmm6                        ## xmm6 = Geps
        mulpd   %xmm2, %xmm7                        ## xmm7 = Heps2
        addpd   %xmm6, %xmm5
        addpd   %xmm7, %xmm5                        ## xmm5 = Fp
        mulpd   %xmm1, %xmm5                        ## xmm5 = eps*Fp
        addpd   %xmm4, %xmm5                        ## xmm5 = VV

        movapd  nb130nf_c12(%rsp), %xmm4
        mulpd   %xmm4, %xmm5                        ## Vvdw12

        addpd   nb130nf_Vvdwtot(%rsp), %xmm5
        movapd  %xmm5, nb130nf_Vvdwtot(%rsp)

        ## should we do one more iteration?
## ---------------------------------------------------------------------------
## Unrolled-loop back-branch, odd-atom remainder (dosingle) and per-list energy
## write-back (updateouterdata). x86-64, AT&T syntax.
##
## Each statement sits on its own line: '##' comments run to end of line, and
## labels must start a statement rather than trail a jmp operand.
## ---------------------------------------------------------------------------
        subl    $2, nb130nf_innerk(%rsp)            ## innerk -= 2
        jl      _nb_kernel130nf_x86_64_sse2.nb130nf_checksingle     ## fewer than two atoms left
        jmp     _nb_kernel130nf_x86_64_sse2.nb130nf_unroll_loop

_nb_kernel130nf_x86_64_sse2.nb130nf_checksingle:
        movl    nb130nf_innerk(%rsp), %edx
        andl    $1, %edx                            ## low bit set => one atom remains
        jnz     _nb_kernel130nf_x86_64_sse2.nb130nf_dosingle
        jmp     _nb_kernel130nf_x86_64_sse2.nb130nf_updateouterdata

_nb_kernel130nf_x86_64_sse2.nb130nf_dosingle:
        ## scalar path: one remaining j atom, all arithmetic on the low lane
        movq    nb130nf_charge(%rbp), %rsi
        movq    nb130nf_pos(%rbp), %rdi             ## NOTE: %edi is reloaded with ntia below before any read of this value
        movq    nb130nf_innerjjnr(%rsp), %rcx

        xorpd   %xmm3, %xmm3
        movl    (%rcx), %eax                        ## eax = jnr
        movlpd  (%rsi,%rax,8), %xmm3                ## low lane = charge[jnr]
        movapd  nb130nf_iq(%rsp), %xmm5
        mulpd   %xmm5, %xmm3                        ## xmm3 = qq

        movd    %eax, %mm0                          ## use mmx registers as temp storage for jnr
        movq    nb130nf_type(%rbp), %rsi
        movl    (%rsi,%rax,4), %eax                 ## eax = type[jnr]
        movq    nb130nf_vdwparam(%rbp), %rsi
        shll    %eax                                ## eax *= 2
        movl    nb130nf_ntia(%rsp), %edi
        addl    %edi, %eax

        movlpd  (%rsi,%rax,8), %xmm6                ## c6a
        movhpd  8(%rsi,%rax,8), %xmm6               ## c6a c12a

        xorpd   %xmm7, %xmm7
        movapd  %xmm6, %xmm4
        unpcklpd %xmm7, %xmm4                       ## xmm4 = c6a  0
        unpckhpd %xmm7, %xmm6                       ## xmm6 = c12a 0

        movd    %mm0, %eax                          ## restore jnr
        movapd  %xmm4, nb130nf_c6(%rsp)
        movapd  %xmm6, nb130nf_c12(%rsp)

        movq    nb130nf_pos(%rbp), %rsi             ## base of pos[]

        lea     (%rax,%rax,2), %rax                 ## replace jnr with j3

        ## move one coordinate triple to the low lanes of xmm0-xmm2
        movlpd  (%rsi,%rax,8), %xmm0
        movlpd  8(%rsi,%rax,8), %xmm1
        movlpd  16(%rsi,%rax,8), %xmm2

        ## move ix-iz to xmm4-xmm6
        movapd  nb130nf_ix(%rsp), %xmm4
        movapd  nb130nf_iy(%rsp), %xmm5
        movapd  nb130nf_iz(%rsp), %xmm6

        ## calc dr
        subsd   %xmm0, %xmm4
        subsd   %xmm1, %xmm5
        subsd   %xmm2, %xmm6

        ## square it
        mulsd   %xmm4, %xmm4
        mulsd   %xmm5, %xmm5
        mulsd   %xmm6, %xmm6
        addsd   %xmm5, %xmm4
        addsd   %xmm6, %xmm4
        ## rsq in low xmm4

        ## single-precision reciprocal-sqrt estimate, widened back to double
        cvtsd2ss %xmm4, %xmm5
        rsqrtss %xmm5, %xmm5
        cvtss2sd %xmm5, %xmm2                       ## lu in low xmm2

        ## refinement step 1: lu' = half * lu * (three - rsq*lu*lu)
        movapd  %xmm2, %xmm5                        ## copy of lu
        mulsd   %xmm2, %xmm2                        ## lu*lu
        movapd  nb130nf_three(%rsp), %xmm1
        mulsd   %xmm4, %xmm2                        ## rsq*lu*lu
        movapd  nb130nf_half(%rsp), %xmm0
        subsd   %xmm2, %xmm1                        ## three - rsq*lu*lu
        mulsd   %xmm5, %xmm1
        mulsd   %xmm0, %xmm1                        ## xmm1 = iter1 of rinv (new lu)

        ## refinement step 2, same formula applied to the new lu
        movapd  %xmm1, %xmm5                        ## copy of lu
        mulsd   %xmm1, %xmm1                        ## lu*lu
        movapd  nb130nf_three(%rsp), %xmm2
        mulsd   %xmm4, %xmm1                        ## rsq*lu*lu
        movapd  nb130nf_half(%rsp), %xmm0
        subsd   %xmm1, %xmm2                        ## three - rsq*lu*lu
        mulsd   %xmm5, %xmm2
        mulsd   %xmm2, %xmm0                        ## xmm0 = rinv

        movapd  %xmm0, %xmm6
        movapd  %xmm4, %xmm1                        ## copy of rsq; low lane is overwritten with eps below
        mulsd   %xmm3, %xmm6                        ## xmm6 = vcoul = qq*rinv
        addsd   nb130nf_vctot(%rsp), %xmm6
        movsd   %xmm6, nb130nf_vctot(%rsp)          ## only the low lane of vctot is updated

        ## LJ table interaction. xmm0=rinv, xmm4=rsq
        mulsd   %xmm0, %xmm4                        ## xmm4 = r
        mulsd   nb130nf_tsc(%rsp), %xmm4            ## scale r by tsc

        cvttsd2si %xmm4, %ebx                       ## ebx = lu idx (truncated)
        cvtsi2sd %ebx, %xmm5
        subsd   %xmm5, %xmm4
        movsd   %xmm4, %xmm1                        ## xmm1 = eps
        movsd   %xmm1, %xmm2
        mulsd   %xmm2, %xmm2                        ## xmm2 = eps2

        shll    $3, %ebx                            ## idx *= 8
        movq    nb130nf_VFtab(%rbp), %rsi

        ## dispersion
        ## The unpck* below take their second operand from xmm3, which holds qq
        ## here; that only affects the high lanes, and the math that follows is
        ## scalar (low lane only).
        movlpd  (%rsi,%rbx,8), %xmm4                ## Y1
        movhpd  8(%rsi,%rbx,8), %xmm4               ## Y1 F1
        movapd  %xmm4, %xmm5
        unpcklpd %xmm3, %xmm4                       ## low = Y1
        unpckhpd %xmm3, %xmm5                       ## low = F1

        movlpd  16(%rsi,%rbx,8), %xmm6              ## G1
        movhpd  24(%rsi,%rbx,8), %xmm6              ## G1 H1
        movapd  %xmm6, %xmm7
        unpcklpd %xmm3, %xmm6                       ## low = G1
        unpckhpd %xmm3, %xmm7                       ## low = H1
        ## dispersion table ready, in low lanes of xmm4-xmm7

        ## VV = Y + eps*(F + G*eps + H*eps2)
        mulsd   %xmm1, %xmm6                        ## xmm6 = Geps
        mulsd   %xmm2, %xmm7                        ## xmm7 = Heps2
        addsd   %xmm6, %xmm5
        addsd   %xmm7, %xmm5                        ## xmm5 = Fp
        mulsd   %xmm1, %xmm5                        ## xmm5 = eps*Fp
        addsd   %xmm4, %xmm5                        ## xmm5 = VV

        movsd   nb130nf_c6(%rsp), %xmm4
        mulsd   %xmm4, %xmm5                        ## Vvdw6

        ## update Vvdwtot directly
        addsd   nb130nf_Vvdwtot(%rsp), %xmm5
        movsd   %xmm5, nb130nf_Vvdwtot(%rsp)

        ## repulsion
        movlpd  32(%rsi,%rbx,8), %xmm4              ## Y1
        movhpd  40(%rsi,%rbx,8), %xmm4              ## Y1 F1

        movapd  %xmm4, %xmm5
        unpcklpd %xmm3, %xmm4                       ## low = Y1
        unpckhpd %xmm3, %xmm5                       ## low = F1

        movlpd  48(%rsi,%rbx,8), %xmm6              ## G1
        movhpd  56(%rsi,%rbx,8), %xmm6              ## G1 H1

        movapd  %xmm6, %xmm7
        unpcklpd %xmm3, %xmm6                       ## low = G1
        unpckhpd %xmm3, %xmm7                       ## low = H1
        ## table ready, in low lanes of xmm4-xmm7

        mulsd   %xmm1, %xmm6                        ## xmm6 = Geps
        mulsd   %xmm2, %xmm7                        ## xmm7 = Heps2
        addsd   %xmm6, %xmm5
        addsd   %xmm7, %xmm5                        ## xmm5 = Fp
        mulsd   %xmm1, %xmm5                        ## xmm5 = eps*Fp
        addsd   %xmm4, %xmm5                        ## xmm5 = VV

        movsd   nb130nf_c12(%rsp), %xmm4
        mulsd   %xmm4, %xmm5                        ## Vvdw12

        addsd   nb130nf_Vvdwtot(%rsp), %xmm5
        movsd   %xmm5, nb130nf_Vvdwtot(%rsp)

_nb_kernel130nf_x86_64_sse2.nb130nf_updateouterdata:
        ## get n from stack
        movl    nb130nf_n(%rsp), %esi

        ## get group index for i particle
        movq    nb130nf_gid(%rbp), %rdx             ## base of gid[]
        movl    (%rdx,%rsi,4), %edx                 ## ggid = gid[n]

        ## accumulate total potential energy and update it
        movapd  nb130nf_vctot(%rsp), %xmm7
        movhlps %xmm7, %xmm6                        ## low xmm6 = high lane of xmm7
        addsd   %xmm6, %xmm7                        ## low xmm7 has the sum now

        ## add earlier value from mem
        movq    nb130nf_Vc(%rbp), %rax
        addsd   (%rax,%rdx,8), %xmm7
        ## move back to mem
        movsd   %xmm7, (%rax,%rdx,8)                ## Vc[ggid] += sum

        ## accumulate total lj energy and update it
        movapd  nb130nf_Vvdwtot(%rsp), %xmm7
        movhlps %xmm7, %xmm6                        ## low xmm6 = high lane of xmm7
        addsd   %xmm6, %xmm7                        ## low xmm7 has the sum now

        ## add earlier value from mem
        movq    nb130nf_Vvdw(%rbp), %rax
        addsd   (%rax,%rdx,8), %xmm7
        ## move back to mem
        movsd   %xmm7, (%rax,%rdx,8)                ## Vvdw[ggid] += sum

        ## finish if last
        movl    nb130nf_nn1(%rsp), %ecx
        ## esi already loaded with n
        incl    %esi                                ## esi = n + 1
        subl    %esi, %ecx                          ## ecx = nn1 - (n + 1)
        jz      _nb_kernel130nf_x86_64_sse2.nb130nf_outerend
        ## not last, iterate outer loop once more!
## ---------------------------------------------------------------------------
## Outer-loop continuation, work-unit check and routine epilogue.
## x86-64, AT&T syntax. %esi holds the already-incremented n on entry here.
##
## Each statement sits on its own line: '##' comments run to end of line, and
## labels must start a statement rather than trail a jmp operand.
## ---------------------------------------------------------------------------
        movl    %esi, nb130nf_n(%rsp)               ## store the incremented n
        jmp     _nb_kernel130nf_x86_64_sse2.nb130nf_outer

_nb_kernel130nf_x86_64_sse2.nb130nf_outerend:
        ## check if more outer neighborlists remain
        movl    nb130nf_nri(%rsp), %ecx
        ## esi already loaded with n above
        subl    %esi, %ecx                          ## ecx = nri - n
        jz      _nb_kernel130nf_x86_64_sse2.nb130nf_end
        ## non-zero, do one more workunit
        jmp     _nb_kernel130nf_x86_64_sse2.nb130nf_threadloop

_nb_kernel130nf_x86_64_sse2.nb130nf_end:
        ## write the iteration counters back through the caller-supplied pointers
        movl    nb130nf_nouter(%rsp), %eax
        movl    nb130nf_ninner(%rsp), %ebx
        movq    nb130nf_outeriter(%rbp), %rcx
        movq    nb130nf_inneriter(%rbp), %rdx
        movl    %eax, (%rcx)                        ## *outeriter = nouter
        movl    %ebx, (%rdx)                        ## *inneriter = ninner

        addq    $328, %rsp                          ## release the local stack area
        emms                                        ## MMX registers were used as scratch; clear MMX state

        ## restore saved registers
        ## NOTE(review): presumably mirrors the pushes in the prologue, which is
        ## not visible in this fragment -- confirm the order matches.
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbx
        pop     %rbp
        ret

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?