nb_kernel313_x86_64_sse.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,144 行 · 第 1/5 页

S
2,144
字号
    ## ------------------------------------------------------------------
    ## Interior fragment of an inner loop (GNU as, AT&T syntax, x86-64 SSE).
    ## One statement per line: a '#' comment runs to end of line, so no
    ## instruction may share a physical line with a preceding comment.
    ##
    ## On entry (per the comments carried with this code):
    ##   xmm0 = jx, xmm1 = jy, xmm2 = jz
    ##   eax, ebx, ecx, edx are used further down as indices into the
    ##   array loaded from nb313_faction(%rbp); they are parked in
    ##   mm0-mm3 while the same registers serve as table offsets.
    ## All nb313_* symbols are offsets defined outside this fragment.
    ## NOTE(review): nb313_two/three/half/six/twelve presumably hold the
    ## constants their names suggest -- confirm against their setup code.
    ## ------------------------------------------------------------------

    ## xmm0 = jx
    ## xmm1 = jy
    ## xmm2 = jz

    ## O interaction
    ## copy to xmm3-xmm5
    movaps    %xmm0,%xmm3
    movaps    %xmm1,%xmm4
    movaps    %xmm2,%xmm5

    subps     nb313_ixO(%rsp),%xmm3
    subps     nb313_iyO(%rsp),%xmm4
    subps     nb313_izO(%rsp),%xmm5

    ## keep the distance components; they are scaled into forces below
    movaps    %xmm3,%xmm13
    movaps    %xmm4,%xmm14
    movaps    %xmm5,%xmm15

    ## xmm3 = dx*dx + dy*dy + dz*dz
    mulps     %xmm3,%xmm3
    mulps     %xmm4,%xmm4
    mulps     %xmm5,%xmm5
    addps     %xmm4,%xmm3
    addps     %xmm5,%xmm3

    ## calc 1/rsq
    ## approximate reciprocal lu, then refine as lu*(two - rsq*lu)
    rcpps     %xmm3,%xmm5
    movaps    nb313_two(%rsp),%xmm4
    mulps     %xmm5,%xmm3
    subps     %xmm3,%xmm4
    mulps     %xmm5,%xmm4               ## xmm4=rinvsq

    movaps    %xmm4,%xmm3               ## rinvsq
    mulps     %xmm4,%xmm4               ## rinv4
    mulps     %xmm3,%xmm4               ## rinv6
    movaps    %xmm4,%xmm5
    mulps     %xmm5,%xmm5               ## rinv12
    mulps     nb313_c6(%rsp),%xmm4
    mulps     nb313_c12(%rsp),%xmm5
    movaps    %xmm5,%xmm6
    subps     %xmm4,%xmm6               ## Vvdw=vvdw12-vvdw6
    mulps     nb313_six(%rsp),%xmm4
    mulps     nb313_twelve(%rsp),%xmm5
    subps     %xmm4,%xmm5
    mulps     %xmm5,%xmm3               ## fscal

    addps     nb313_Vvdwtot(%rsp),%xmm6
    movaps    %xmm6,nb313_Vvdwtot(%rsp)

    mulps     %xmm3,%xmm13              ## fx
    mulps     %xmm3,%xmm14              ## fy
    mulps     %xmm3,%xmm15              ## fz

    ## save j force temporarily
    movaps    %xmm13,nb313_fjx(%rsp)
    movaps    %xmm14,nb313_fjy(%rsp)
    movaps    %xmm15,nb313_fjz(%rsp)

    ## increment i O force
    addps     nb313_fixO(%rsp),%xmm13
    addps     nb313_fiyO(%rsp),%xmm14
    addps     nb313_fizO(%rsp),%xmm15
    movaps    %xmm13,nb313_fixO(%rsp)
    movaps    %xmm14,nb313_fiyO(%rsp)
    movaps    %xmm15,nb313_fizO(%rsp)
    ## finished O LJ interaction.

    ## do H1, H2, and M interactions in parallel.
    ## xmm0-xmm2 still contain j coordinates.
    ## register triples from here on: H1 -> xmm0-2, H2 -> xmm3-5, M -> xmm6-8
    movaps    %xmm0,%xmm3
    movaps    %xmm1,%xmm4
    movaps    %xmm2,%xmm5
    movaps    %xmm0,%xmm6
    movaps    %xmm1,%xmm7
    movaps    %xmm2,%xmm8

    subps     nb313_ixH1(%rsp),%xmm0
    subps     nb313_iyH1(%rsp),%xmm1
    subps     nb313_izH1(%rsp),%xmm2
    subps     nb313_ixH2(%rsp),%xmm3
    subps     nb313_iyH2(%rsp),%xmm4
    subps     nb313_izH2(%rsp),%xmm5
    subps     nb313_ixM(%rsp),%xmm6
    subps     nb313_iyM(%rsp),%xmm7
    subps     nb313_izM(%rsp),%xmm8

    ## eax-edx are overwritten with table offsets below and restored
    ## from mm0-mm3 before the faction array is indexed
    movd      %eax,%mm0                 ## use mmx registers as temp storage
    movd      %ebx,%mm1
    movd      %ecx,%mm2
    movd      %edx,%mm3

    ## store each distance component, then square it in place
    movaps    %xmm0,nb313_dxH1(%rsp)
    movaps    %xmm1,nb313_dyH1(%rsp)
    movaps    %xmm2,nb313_dzH1(%rsp)
    mulps     %xmm0,%xmm0
    mulps     %xmm1,%xmm1
    mulps     %xmm2,%xmm2
    movaps    %xmm3,nb313_dxH2(%rsp)
    movaps    %xmm4,nb313_dyH2(%rsp)
    movaps    %xmm5,nb313_dzH2(%rsp)
    mulps     %xmm3,%xmm3
    mulps     %xmm4,%xmm4
    mulps     %xmm5,%xmm5
    movaps    %xmm6,nb313_dxM(%rsp)
    movaps    %xmm7,nb313_dyM(%rsp)
    movaps    %xmm8,nb313_dzM(%rsp)
    mulps     %xmm6,%xmm6
    mulps     %xmm7,%xmm7
    mulps     %xmm8,%xmm8

    ## rsq: xmm0 (H1), xmm3 (H2), xmm6 (M)
    addps     %xmm1,%xmm0
    addps     %xmm2,%xmm0
    addps     %xmm4,%xmm3
    addps     %xmm5,%xmm3
    addps     %xmm7,%xmm6
    addps     %xmm8,%xmm6

    ## start doing invsqrt for j atoms
    ## approximate lu = rsqrt(rsq), refined as half*lu*(three - rsq*lu*lu)
    rsqrtps   %xmm0,%xmm1
    rsqrtps   %xmm3,%xmm4
    rsqrtps   %xmm6,%xmm7

    movaps    %xmm1,%xmm2
    movaps    %xmm4,%xmm5
    movaps    %xmm7,%xmm8

    mulps     %xmm1,%xmm1               ## lu*lu
    mulps     %xmm4,%xmm4               ## lu*lu
    mulps     %xmm7,%xmm7               ## lu*lu

    movaps    nb313_three(%rsp),%xmm9
    movaps    %xmm9,%xmm10
    movaps    %xmm9,%xmm11

    mulps     %xmm0,%xmm1               ## rsq*lu*lu
    mulps     %xmm3,%xmm4               ## rsq*lu*lu
    mulps     %xmm6,%xmm7               ## rsq*lu*lu

    subps     %xmm1,%xmm9
    subps     %xmm4,%xmm10
    subps     %xmm7,%xmm11              ## 3-rsq*lu*lu

    mulps     %xmm2,%xmm9
    mulps     %xmm5,%xmm10
    mulps     %xmm8,%xmm11              ## lu*(3-rsq*lu*lu)

    movaps    nb313_half(%rsp),%xmm4
    mulps     %xmm4,%xmm9               ## rinvH1
    mulps     %xmm4,%xmm10              ## rinvH2
    mulps     %xmm4,%xmm11              ## rinvM

    movaps    %xmm9,nb313_rinvH1(%rsp)
    movaps    %xmm10,nb313_rinvH2(%rsp)
    movaps    %xmm11,nb313_rinvM(%rsp)

    ## interactions
    ## rsq in xmm0,xmm3,xmm6
    ## rinv in xmm9, xmm10, xmm11

    movaps    nb313_tsc(%rsp),%xmm1
    mulps     %xmm9,%xmm0               ## r
    mulps     %xmm10,%xmm3
    mulps     %xmm11,%xmm6
    mulps     %xmm1,%xmm0               ## rtab
    mulps     %xmm1,%xmm3
    mulps     %xmm1,%xmm6

    ## truncate and convert to integers
    cvttps2dq %xmm0,%xmm1
    cvttps2dq %xmm3,%xmm4
    cvttps2dq %xmm6,%xmm7

    ## convert back to float
    cvtdq2ps  %xmm1,%xmm2
    cvtdq2ps  %xmm4,%xmm5
    cvtdq2ps  %xmm7,%xmm8

    ## multiply by 4
    pslld     $2,%xmm1
    pslld     $2,%xmm4
    pslld     $2,%xmm7

    ## move to integer registers
    ## H1 -> eax,ebx,ecx,edx   H2 -> r8d-r11d   M -> r12d-r15d
    movhlps   %xmm1,%xmm13
    movhlps   %xmm4,%xmm14
    movhlps   %xmm7,%xmm15
    movd      %xmm1,%eax
    movd      %xmm4,%r8d
    movd      %xmm7,%r12d
    movd      %xmm13,%ecx
    movd      %xmm14,%r10d
    movd      %xmm15,%r14d
    ## bring element 1 of each register down to element 0 for movd
    pshufd    $1,%xmm1,%xmm1
    pshufd    $1,%xmm4,%xmm4
    pshufd    $1,%xmm7,%xmm7
    pshufd    $1,%xmm13,%xmm13
    pshufd    $1,%xmm14,%xmm14
    pshufd    $1,%xmm15,%xmm15
    movd      %xmm1,%ebx
    movd      %xmm4,%r9d
    movd      %xmm7,%r13d
    movd      %xmm13,%edx
    movd      %xmm14,%r11d
    movd      %xmm15,%r15d

    movq      nb313_VFtab(%rbp),%rsi

    ## calculate eps
    ## eps = rtab - float(trunc(rtab))
    subps     %xmm2,%xmm0
    subps     %xmm5,%xmm3
    subps     %xmm8,%xmm6

    movaps    %xmm0,nb313_epsH1(%rsp)
    movaps    %xmm3,nb313_epsH2(%rsp)
    movaps    %xmm6,nb313_epsM(%rsp)

    ## Load LOTS of table data
    ## first two floats of each table entry
    movlps    (%rsi,%rax,4),%xmm1
    movlps    (%rsi,%r8,4),%xmm5
    movlps    (%rsi,%r12,4),%xmm9

    movlps    (%rsi,%rcx,4),%xmm3
    movlps    (%rsi,%r10,4),%xmm7
    movlps    (%rsi,%r14,4),%xmm11

    movhps    (%rsi,%rbx,4),%xmm1
    movhps    (%rsi,%r9,4),%xmm5
    movhps    (%rsi,%r13,4),%xmm9

    movhps    (%rsi,%rdx,4),%xmm3
    movhps    (%rsi,%r11,4),%xmm7
    movhps    (%rsi,%r15,4),%xmm11

    movaps    %xmm1,%xmm0
    movaps    %xmm5,%xmm4
    movaps    %xmm9,%xmm8
    ## $136 gathers elements 0 and 2 of each operand, $221 elements 1 and 3
    shufps    $136,%xmm3,%xmm0          ## 10001000
    shufps    $136,%xmm7,%xmm4          ## 10001000
    shufps    $136,%xmm11,%xmm8         ## 10001000
    shufps    $221,%xmm3,%xmm1          ## 11011101
    shufps    $221,%xmm7,%xmm5          ## 11011101
    shufps    $221,%xmm11,%xmm9         ## 11011101

    ## next two floats of each table entry (byte offset 8)
    movlps    8(%rsi,%rax,4),%xmm3
    movlps    8(%rsi,%r8,4),%xmm7
    movlps    8(%rsi,%r12,4),%xmm11

    movlps    8(%rsi,%rcx,4),%xmm12
    movlps    8(%rsi,%r10,4),%xmm13
    movlps    8(%rsi,%r14,4),%xmm14

    movhps    8(%rsi,%rbx,4),%xmm3
    movhps    8(%rsi,%r9,4),%xmm7
    movhps    8(%rsi,%r13,4),%xmm11

    movhps    8(%rsi,%rdx,4),%xmm12
    movhps    8(%rsi,%r11,4),%xmm13
    movhps    8(%rsi,%r15,4),%xmm14

    movaps    %xmm3,%xmm2
    movaps    %xmm7,%xmm6
    movaps    %xmm11,%xmm10

    shufps    $136,%xmm12,%xmm2         ## 10001000
    shufps    $136,%xmm13,%xmm6         ## 10001000
    shufps    $136,%xmm14,%xmm10        ## 10001000
    shufps    $221,%xmm12,%xmm3         ## 11011101
    shufps    $221,%xmm13,%xmm7         ## 11011101
    shufps    $221,%xmm14,%xmm11        ## 11011101
    ## table data ready in xmm0-xmm3 , xmm4-xmm7 , and xmm8-xmm11

    movaps    nb313_epsH1(%rsp),%xmm12
    movaps    nb313_epsH2(%rsp),%xmm13
    movaps    nb313_epsM(%rsp),%xmm14

    mulps     %xmm12,%xmm3              ## Heps
    mulps     %xmm13,%xmm7
    mulps     %xmm14,%xmm11
    mulps     %xmm12,%xmm2              ## Geps
    mulps     %xmm13,%xmm6
    mulps     %xmm14,%xmm10
    mulps     %xmm12,%xmm3              ## Heps2
    mulps     %xmm13,%xmm7
    mulps     %xmm14,%xmm11

    addps     %xmm2,%xmm1               ## F+Geps
    addps     %xmm6,%xmm5
    addps     %xmm10,%xmm9
    addps     %xmm3,%xmm1               ## F+Geps+Heps2 = Fp
    addps     %xmm7,%xmm5
    addps     %xmm11,%xmm9
    addps     %xmm3,%xmm3               ## 2*Heps2
    addps     %xmm7,%xmm7
    addps     %xmm11,%xmm11
    addps     %xmm2,%xmm3               ## 2*Heps2+Geps
    addps     %xmm6,%xmm7
    addps     %xmm10,%xmm11
    addps     %xmm1,%xmm3               ## FF = Fp + 2*Heps2 + Geps
    addps     %xmm5,%xmm7
    addps     %xmm9,%xmm11
    mulps     %xmm12,%xmm1              ## eps*Fp
    mulps     %xmm13,%xmm5
    mulps     %xmm14,%xmm9

    ## H1 and H2 are scaled by qqH, M by qqM
    movaps    nb313_qqH(%rsp),%xmm12
    movaps    nb313_qqM(%rsp),%xmm13
    addps     %xmm0,%xmm1               ## VV
    addps     %xmm4,%xmm5
    addps     %xmm8,%xmm9
    mulps     %xmm12,%xmm1              ## VV*qq = vcoul
    mulps     %xmm12,%xmm5
    mulps     %xmm13,%xmm9
    mulps     %xmm12,%xmm3              ## FF*qq = fij
    mulps     %xmm12,%xmm7
    mulps     %xmm13,%xmm11

    ## accumulate vctot
    addps     nb313_vctot(%rsp),%xmm1
    addps     %xmm9,%xmm5
    addps     %xmm5,%xmm1
    movaps    %xmm1,nb313_vctot(%rsp)

    movaps    nb313_tsc(%rsp),%xmm10
    mulps     %xmm10,%xmm3              ## fscal
    mulps     %xmm10,%xmm7
    mulps     %xmm11,%xmm10             ## M term ends up in xmm10 (tsc*fijM)

    ## restore the indices parked in mm0-mm3 earlier
    movd      %mm0,%eax
    movd      %mm1,%ebx
    movd      %mm2,%ecx
    movd      %mm3,%edx

    ## move j forces to local temp variables
    movq      nb313_faction(%rbp),%rdi
    movlps    (%rdi,%rax,4),%xmm11      ## jxa jya  -   -
    movlps    (%rdi,%rcx,4),%xmm12      ## jxc jyc  -   -
    movhps    (%rdi,%rbx,4),%xmm11      ## jxa jya jxb jyb
    movhps    (%rdi,%rdx,4),%xmm12      ## jxc jyc jxd jyd

    movss     8(%rdi,%rax,4),%xmm13     ## jza  -  -  -
    movss     8(%rdi,%rcx,4),%xmm14     ## jzc  -  -  -
    movss     8(%rdi,%rbx,4),%xmm2      ## jzb
    movss     8(%rdi,%rdx,4),%xmm5      ## jzd
    movlhps   %xmm2,%xmm13              ## jza  -  jzb  -
    movlhps   %xmm5,%xmm14              ## jzc  -  jzd -

    shufps    $136,%xmm14,%xmm13        ## 10001000 => jza jzb jzc jzd

    ## xmm11: jxa jya jxb jyb
    ## xmm12: jxc jyc jxd jyd
    ## xmm13: jza jzb jzc jzd

    ## negate the scalar factors: 0 - (fscal * rinv)
    xorps     %xmm0,%xmm0
    xorps     %xmm4,%xmm4
    xorps     %xmm8,%xmm8

    mulps     nb313_rinvH1(%rsp),%xmm3
    mulps     nb313_rinvH2(%rsp),%xmm7
    mulps     nb313_rinvM(%rsp),%xmm10

    subps     %xmm3,%xmm0
    subps     %xmm7,%xmm4
    subps     %xmm10,%xmm8

    ## replicate each factor for x, y, z:
    ## H1 -> xmm0,xmm1,xmm2   H2 -> xmm3,xmm4,xmm5   M -> xmm6,xmm7,xmm8
    movaps    %xmm0,%xmm1
    movaps    %xmm0,%xmm2
    movaps    %xmm4,%xmm3
    movaps    %xmm4,%xmm5
    movaps    %xmm8,%xmm6
    movaps    %xmm8,%xmm7

    mulps     nb313_dxH1(%rsp),%xmm0
    mulps     nb313_dyH1(%rsp),%xmm1
    mulps     nb313_dzH1(%rsp),%xmm2
    mulps     nb313_dxH2(%rsp),%xmm3
    mulps     nb313_dyH2(%rsp),%xmm4
    mulps     nb313_dzH2(%rsp),%xmm5
    mulps     nb313_dxM(%rsp),%xmm6
    mulps     nb313_dyM(%rsp),%xmm7
    mulps     nb313_dzM(%rsp),%xmm8

    ## fetch forces from O interaction
    ## the z component is added straight onto the jz values held in xmm13
    movaps    nb313_fjx(%rsp),%xmm14
    movaps    nb313_fjy(%rsp),%xmm15
    addps     nb313_fjz(%rsp),%xmm13

    ## add each site's force to the j sum, then to that site's i accumulator
    addps     %xmm0,%xmm14
    addps     %xmm1,%xmm15
    addps     %xmm2,%xmm13
    addps     nb313_fixH1(%rsp),%xmm0
    addps     nb313_fiyH1(%rsp),%xmm1
    addps     nb313_fizH1(%rsp),%xmm2

    addps     %xmm3,%xmm14
    addps     %xmm4,%xmm15
    addps     %xmm5,%xmm13
    addps     nb313_fixH2(%rsp),%xmm3
    addps     nb313_fiyH2(%rsp),%xmm4
    addps     nb313_fizH2(%rsp),%xmm5

    addps     %xmm6,%xmm14
    addps     %xmm7,%xmm15
    addps     %xmm8,%xmm13
    addps     nb313_fixM(%rsp),%xmm6
    addps     nb313_fiyM(%rsp),%xmm7
    addps     nb313_fizM(%rsp),%xmm8

    movaps    %xmm0,nb313_fixH1(%rsp)
    movaps    %xmm1,nb313_fiyH1(%rsp)
    movaps    %xmm2,nb313_fizH1(%rsp)
    movaps    %xmm3,nb313_fixH2(%rsp)
    movaps    %xmm4,nb313_fiyH2(%rsp)
    movaps    %xmm5,nb313_fizH2(%rsp)
    movaps    %xmm6,nb313_fixM(%rsp)
    movaps    %xmm7,nb313_fiyM(%rsp)
    movaps    %xmm8,nb313_fizM(%rsp)

    ## xmm14 = fjx
    ## xmm15 = fjy
    ## xmm13 = fjz
    ## interleave x with y so they match the x,y pair layout of xmm11/xmm12
    movaps    %xmm14,%xmm0
    unpcklps  %xmm15,%xmm14
    unpckhps  %xmm15,%xmm0

    addps     %xmm14,%xmm11
    addps     %xmm0,%xmm12

    movhlps   %xmm13,%xmm14             ## fjzc fjzd

    movlps    %xmm11,(%rdi,%rax,4)

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?