nb_kernel313_x86_64_sse.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,144 行 · 第 1/5 页
S
2,144 行
## ---------------------------------------------------------------------
## Inner-loop fragment: four j atoms (one per SSE lane) against the
## O, H1, H2 and M sites of the current i molecule.
##
## Syntax: GAS / AT&T (source, destination), x86-64, SSE/SSE2 + MMX.
## One statement per line is required: '##' starts a comment that runs
## to the end of the physical line.
##
## On entry (as used by the code below):
##   xmm0, xmm1, xmm2 = jx, jy, jz for the four j atoms
##   eax, ebx, ecx, edx = the four j-atom offsets later used to index
##                        the force array (element index, scaled by 4)
##   nb313_* (%rsp)     = per-loop locals; nb313_* (%rbp) = pointers
##                        (offsets are defined outside this fragment)
##
## Registers overwritten here: xmm0-xmm15, rax, rbx, rcx, rdx, rsi, rdi,
## r8-r15, mm0-mm3.
## NOTE(review): rbx and r12-r15 are callee-saved in the SysV ABI and
## MMX use requires an emms before any x87 code; the save/restore and
## emms are not visible in this fragment - confirm they exist elsewhere.
## ---------------------------------------------------------------------

## xmm0 = jx
## xmm1 = jy
## xmm2 = jz

## O interaction
## copy to xmm3-xmm5
        movaps %xmm0,%xmm3
        movaps %xmm1,%xmm4
        movaps %xmm2,%xmm5

        ## d = j - iO
        subps nb313_ixO(%rsp),%xmm3
        subps nb313_iyO(%rsp),%xmm4
        subps nb313_izO(%rsp),%xmm5

        ## keep dx,dy,dz in xmm13-xmm15 for the force components
        movaps %xmm3,%xmm13
        movaps %xmm4,%xmm14
        movaps %xmm5,%xmm15

        ## rsq = dx*dx + dy*dy + dz*dz  (in xmm3)
        mulps %xmm3,%xmm3
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        addps %xmm4,%xmm3
        addps %xmm5,%xmm3

        ## calc 1/rsq
        ## rcpps estimate lu, then one refinement step lu*(two - rsq*lu)
        ## (nb313_two presumably holds 2.0 in every lane - confirm)
        rcpps %xmm3,%xmm5
        movaps nb313_two(%rsp),%xmm4
        mulps %xmm5,%xmm3
        subps %xmm3,%xmm4
        mulps %xmm5,%xmm4               ## xmm4=rinvsq

        movaps %xmm4,%xmm3              ## rinvsq
        mulps %xmm4,%xmm4               ## rinv4
        mulps %xmm3,%xmm4               ## rinv6
        movaps %xmm4,%xmm5
        mulps %xmm5,%xmm5               ## rinv12
        mulps nb313_c6(%rsp),%xmm4
        mulps nb313_c12(%rsp),%xmm5
        movaps %xmm5,%xmm6
        subps %xmm4,%xmm6               ## Vvdw=vvdw12-vvdw6
        mulps nb313_six(%rsp),%xmm4
        mulps nb313_twelve(%rsp),%xmm5
        subps %xmm4,%xmm5
        mulps %xmm5,%xmm3               ## fscal

        addps nb313_Vvdwtot(%rsp),%xmm6
        movaps %xmm6,nb313_Vvdwtot(%rsp)

        mulps %xmm3,%xmm13              ## fx
        mulps %xmm3,%xmm14              ## fy
        mulps %xmm3,%xmm15              ## fz

        ## save j force temporarily
        movaps %xmm13,nb313_fjx(%rsp)
        movaps %xmm14,nb313_fjy(%rsp)
        movaps %xmm15,nb313_fjz(%rsp)

        ## increment i O force
        addps nb313_fixO(%rsp),%xmm13
        addps nb313_fiyO(%rsp),%xmm14
        addps nb313_fizO(%rsp),%xmm15
        movaps %xmm13,nb313_fixO(%rsp)
        movaps %xmm14,nb313_fiyO(%rsp)
        movaps %xmm15,nb313_fizO(%rsp)
        ## finished O LJ interaction.

        ## do H1, H2, and M interactions in parallel.
        ## xmm0-xmm2 still contain j coordinates.
        ## H1 uses xmm0-xmm2, H2 uses xmm3-xmm5, M uses xmm6-xmm8
        movaps %xmm0,%xmm3
        movaps %xmm1,%xmm4
        movaps %xmm2,%xmm5
        movaps %xmm0,%xmm6
        movaps %xmm1,%xmm7
        movaps %xmm2,%xmm8

        ## d = j - i for each of the three sites
        subps nb313_ixH1(%rsp),%xmm0
        subps nb313_iyH1(%rsp),%xmm1
        subps nb313_izH1(%rsp),%xmm2
        subps nb313_ixH2(%rsp),%xmm3
        subps nb313_iyH2(%rsp),%xmm4
        subps nb313_izH2(%rsp),%xmm5
        subps nb313_ixM(%rsp),%xmm6
        subps nb313_iyM(%rsp),%xmm7
        subps nb313_izM(%rsp),%xmm8

        ## eax-edx are reused for table indices below; park them in
        ## mm0-mm3 and restore them before the force array is indexed.
        movd %eax,%mm0                  ## use mmx registers as temp storage
        movd %ebx,%mm1
        movd %ecx,%mm2
        movd %edx,%mm3

        ## store the distance vectors, then square and sum them
        movaps %xmm0,nb313_dxH1(%rsp)
        movaps %xmm1,nb313_dyH1(%rsp)
        movaps %xmm2,nb313_dzH1(%rsp)
        mulps %xmm0,%xmm0
        mulps %xmm1,%xmm1
        mulps %xmm2,%xmm2
        movaps %xmm3,nb313_dxH2(%rsp)
        movaps %xmm4,nb313_dyH2(%rsp)
        movaps %xmm5,nb313_dzH2(%rsp)
        mulps %xmm3,%xmm3
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        movaps %xmm6,nb313_dxM(%rsp)
        movaps %xmm7,nb313_dyM(%rsp)
        movaps %xmm8,nb313_dzM(%rsp)
        mulps %xmm6,%xmm6
        mulps %xmm7,%xmm7
        mulps %xmm8,%xmm8
        addps %xmm1,%xmm0
        addps %xmm2,%xmm0               ## rsqH1
        addps %xmm4,%xmm3
        addps %xmm5,%xmm3               ## rsqH2
        addps %xmm7,%xmm6
        addps %xmm8,%xmm6               ## rsqM

        ## start doing invsqrt for j atoms
        ## rsqrtps estimate lu, refined as half*lu*(three - rsq*lu*lu)
        rsqrtps %xmm0,%xmm1
        rsqrtps %xmm3,%xmm4
        rsqrtps %xmm6,%xmm7

        movaps %xmm1,%xmm2
        movaps %xmm4,%xmm5
        movaps %xmm7,%xmm8

        mulps %xmm1,%xmm1               ## lu*lu
        mulps %xmm4,%xmm4               ## lu*lu
        mulps %xmm7,%xmm7               ## lu*lu

        movaps nb313_three(%rsp),%xmm9
        movaps %xmm9,%xmm10
        movaps %xmm9,%xmm11

        mulps %xmm0,%xmm1               ## rsq*lu*lu
        mulps %xmm3,%xmm4               ## rsq*lu*lu
        mulps %xmm6,%xmm7               ## rsq*lu*lu

        subps %xmm1,%xmm9
        subps %xmm4,%xmm10
        subps %xmm7,%xmm11              ## 3-rsq*lu*lu

        mulps %xmm2,%xmm9
        mulps %xmm5,%xmm10
        mulps %xmm8,%xmm11              ## lu*(3-rsq*lu*lu)

        movaps nb313_half(%rsp),%xmm4
        mulps %xmm4,%xmm9               ## rinvH1
        mulps %xmm4,%xmm10              ## rinvH2
        mulps %xmm4,%xmm11              ## rinvM

        movaps %xmm9,nb313_rinvH1(%rsp)
        movaps %xmm10,nb313_rinvH2(%rsp)
        movaps %xmm11,nb313_rinvM(%rsp)

        ## interactions
        ## rsq in xmm0,xmm3,xmm6
        ## rinv in xmm9, xmm10, xmm11

        movaps nb313_tsc(%rsp),%xmm1
        mulps %xmm9,%xmm0               ## r
        mulps %xmm10,%xmm3
        mulps %xmm11,%xmm6
        mulps %xmm1,%xmm0               ## rtab
        mulps %xmm1,%xmm3
        mulps %xmm1,%xmm6

        ## truncate and convert to integers
        cvttps2dq %xmm0,%xmm1
        cvttps2dq %xmm3,%xmm4
        cvttps2dq %xmm6,%xmm7

        ## convert back to float
        cvtdq2ps %xmm1,%xmm2
        cvtdq2ps %xmm4,%xmm5
        cvtdq2ps %xmm7,%xmm8

        ## multiply by 4
        ## (index*4 with the *4 address scale below = 16 bytes per
        ##  table point, i.e. four floats)
        pslld $2,%xmm1
        pslld $2,%xmm4
        pslld $2,%xmm7

        ## move to integer registers
        ## H1 -> eax,ebx,ecx,edx   H2 -> r8d-r11d   M -> r12d-r15d
        movhlps %xmm1,%xmm13
        movhlps %xmm4,%xmm14
        movhlps %xmm7,%xmm15
        movd %xmm1,%eax
        movd %xmm4,%r8d
        movd %xmm7,%r12d
        movd %xmm13,%ecx
        movd %xmm14,%r10d
        movd %xmm15,%r14d
        ## pshufd $1 brings element 1 down to element 0 for movd
        pshufd $1,%xmm1,%xmm1
        pshufd $1,%xmm4,%xmm4
        pshufd $1,%xmm7,%xmm7
        pshufd $1,%xmm13,%xmm13
        pshufd $1,%xmm14,%xmm14
        pshufd $1,%xmm15,%xmm15
        movd %xmm1,%ebx
        movd %xmm4,%r9d
        movd %xmm7,%r13d
        movd %xmm13,%edx
        movd %xmm14,%r11d
        movd %xmm15,%r15d

        movq nb313_VFtab(%rbp),%rsi

        ## calculate eps
        ## eps = rtab - truncated(rtab), the fractional table position
        subps %xmm2,%xmm0
        subps %xmm5,%xmm3
        subps %xmm8,%xmm6

        movaps %xmm0,nb313_epsH1(%rsp)
        movaps %xmm3,nb313_epsH2(%rsp)
        movaps %xmm6,nb313_epsM(%rsp)

        ## Load LOTS of table data
        ## first two floats of every table point
        movlps (%rsi,%rax,4),%xmm1
        movlps (%rsi,%r8,4),%xmm5
        movlps (%rsi,%r12,4),%xmm9

        movlps (%rsi,%rcx,4),%xmm3
        movlps (%rsi,%r10,4),%xmm7
        movlps (%rsi,%r14,4),%xmm11

        movhps (%rsi,%rbx,4),%xmm1
        movhps (%rsi,%r9,4),%xmm5
        movhps (%rsi,%r13,4),%xmm9

        movhps (%rsi,%rdx,4),%xmm3
        movhps (%rsi,%r11,4),%xmm7
        movhps (%rsi,%r15,4),%xmm11

        movaps %xmm1,%xmm0
        movaps %xmm5,%xmm4
        movaps %xmm9,%xmm8
        ## de-interleave: $136 gathers the even elements (first float of
        ## each point), $221 the odd elements (second float, F)
        shufps $136,%xmm3,%xmm0         ## 10001000
        shufps $136,%xmm7,%xmm4         ## 10001000
        shufps $136,%xmm11,%xmm8        ## 10001000
        shufps $221,%xmm3,%xmm1         ## 11011101
        shufps $221,%xmm7,%xmm5         ## 11011101
        shufps $221,%xmm11,%xmm9        ## 11011101

        ## last two floats (G, H) of every table point
        movlps 8(%rsi,%rax,4),%xmm3
        movlps 8(%rsi,%r8,4),%xmm7
        movlps 8(%rsi,%r12,4),%xmm11

        movlps 8(%rsi,%rcx,4),%xmm12
        movlps 8(%rsi,%r10,4),%xmm13
        movlps 8(%rsi,%r14,4),%xmm14

        movhps 8(%rsi,%rbx,4),%xmm3
        movhps 8(%rsi,%r9,4),%xmm7
        movhps 8(%rsi,%r13,4),%xmm11

        movhps 8(%rsi,%rdx,4),%xmm12
        movhps 8(%rsi,%r11,4),%xmm13
        movhps 8(%rsi,%r15,4),%xmm14

        movaps %xmm3,%xmm2
        movaps %xmm7,%xmm6
        movaps %xmm11,%xmm10

        shufps $136,%xmm12,%xmm2        ## 10001000
        shufps $136,%xmm13,%xmm6        ## 10001000
        shufps $136,%xmm14,%xmm10       ## 10001000
        shufps $221,%xmm12,%xmm3        ## 11011101
        shufps $221,%xmm13,%xmm7        ## 11011101
        shufps $221,%xmm14,%xmm11       ## 11011101
        ## table data ready in xmm0-xmm3 , xmm4-xmm7 , and xmm8-xmm11

        movaps nb313_epsH1(%rsp),%xmm12
        movaps nb313_epsH2(%rsp),%xmm13
        movaps nb313_epsM(%rsp),%xmm14

        mulps %xmm12,%xmm3              ## Heps
        mulps %xmm13,%xmm7
        mulps %xmm14,%xmm11
        mulps %xmm12,%xmm2              ## Geps
        mulps %xmm13,%xmm6
        mulps %xmm14,%xmm10
        mulps %xmm12,%xmm3              ## Heps2
        mulps %xmm13,%xmm7
        mulps %xmm14,%xmm11

        addps %xmm2,%xmm1               ## F+Geps
        addps %xmm6,%xmm5
        addps %xmm10,%xmm9
        addps %xmm3,%xmm1               ## F+Geps+Heps2 = Fp
        addps %xmm7,%xmm5
        addps %xmm11,%xmm9
        addps %xmm3,%xmm3               ## 2*Heps2
        addps %xmm7,%xmm7
        addps %xmm11,%xmm11
        addps %xmm2,%xmm3               ## 2*Heps2+Geps
        addps %xmm6,%xmm7
        addps %xmm10,%xmm11
        addps %xmm1,%xmm3               ## FF = Fp + 2*Heps2 + Geps
        addps %xmm5,%xmm7
        addps %xmm9,%xmm11
        mulps %xmm12,%xmm1              ## eps*Fp
        mulps %xmm13,%xmm5
        mulps %xmm14,%xmm9
        movaps nb313_qqH(%rsp),%xmm12
        movaps nb313_qqM(%rsp),%xmm13
        addps %xmm0,%xmm1               ## VV
        addps %xmm4,%xmm5
        addps %xmm8,%xmm9
        ## H1 and H2 share qqH (xmm12); M uses qqM (xmm13)
        mulps %xmm12,%xmm1              ## VV*qq = vcoul
        mulps %xmm12,%xmm5
        mulps %xmm13,%xmm9
        mulps %xmm12,%xmm3              ## FF*qq = fij
        mulps %xmm12,%xmm7
        mulps %xmm13,%xmm11

        ## accumulate vctot
        addps nb313_vctot(%rsp),%xmm1
        addps %xmm9,%xmm5
        addps %xmm5,%xmm1
        movaps %xmm1,nb313_vctot(%rsp)

        movaps nb313_tsc(%rsp),%xmm10
        mulps %xmm10,%xmm3              ## fscal
        mulps %xmm10,%xmm7
        mulps %xmm11,%xmm10             ## M: tsc*fij ends up in xmm10

        ## restore the j offsets parked in mm0-mm3
        movd %mm0,%eax
        movd %mm1,%ebx
        movd %mm2,%ecx
        movd %mm3,%edx

        ## move j forces to local temp variables
        movq nb313_faction(%rbp),%rdi
        movlps (%rdi,%rax,4),%xmm11     ## jxa jya  -   -
        movlps (%rdi,%rcx,4),%xmm12     ## jxc jyc  -   -
        movhps (%rdi,%rbx,4),%xmm11     ## jxa jya jxb jyb
        movhps (%rdi,%rdx,4),%xmm12     ## jxc jyc jxd jyd

        movss 8(%rdi,%rax,4),%xmm13     ## jza  -  -  -
        movss 8(%rdi,%rcx,4),%xmm14     ## jzc  -  -  -
        movss 8(%rdi,%rbx,4),%xmm2      ## jzb
        movss 8(%rdi,%rdx,4),%xmm5      ## jzd
        movlhps %xmm2,%xmm13            ## jza  -  jzb  -
        movlhps %xmm5,%xmm14            ## jzc  -  jzd  -

        shufps $136,%xmm14,%xmm13       ## 10001000 => jza jzb jzc jzd

        ## xmm11: jxa jya jxb jyb
        ## xmm12: jxc jyc jxd jyd
        ## xmm13: jza jzb jzc jzd

        ## fscal = 0 - (fij*tsc*rinv), negated via subtraction from zero
        xorps %xmm0,%xmm0
        xorps %xmm4,%xmm4
        xorps %xmm8,%xmm8

        mulps nb313_rinvH1(%rsp),%xmm3
        mulps nb313_rinvH2(%rsp),%xmm7
        mulps nb313_rinvM(%rsp),%xmm10

        subps %xmm3,%xmm0
        subps %xmm7,%xmm4
        subps %xmm10,%xmm8

        ## replicate each fscal for its x, y and z component:
        ## H1 -> xmm0,xmm1,xmm2   H2 -> xmm3,xmm4,xmm5   M -> xmm6,xmm7,xmm8
        movaps %xmm0,%xmm1
        movaps %xmm0,%xmm2
        movaps %xmm4,%xmm3
        movaps %xmm4,%xmm5
        movaps %xmm8,%xmm6
        movaps %xmm8,%xmm7

        mulps nb313_dxH1(%rsp),%xmm0
        mulps nb313_dyH1(%rsp),%xmm1
        mulps nb313_dzH1(%rsp),%xmm2
        mulps nb313_dxH2(%rsp),%xmm3
        mulps nb313_dyH2(%rsp),%xmm4
        mulps nb313_dzH2(%rsp),%xmm5
        mulps nb313_dxM(%rsp),%xmm6
        mulps nb313_dyM(%rsp),%xmm7
        mulps nb313_dzM(%rsp),%xmm8

        ## fetch forces from O interaction
        ## x and y start from the saved O contribution; z is added
        ## straight onto the j z-forces already loaded in xmm13
        movaps nb313_fjx(%rsp),%xmm14
        movaps nb313_fjy(%rsp),%xmm15
        addps nb313_fjz(%rsp),%xmm13

        ## for each site: add its force to the j total, then to the
        ## running i-site force
        addps %xmm0,%xmm14
        addps %xmm1,%xmm15
        addps %xmm2,%xmm13
        addps nb313_fixH1(%rsp),%xmm0
        addps nb313_fiyH1(%rsp),%xmm1
        addps nb313_fizH1(%rsp),%xmm2

        addps %xmm3,%xmm14
        addps %xmm4,%xmm15
        addps %xmm5,%xmm13
        addps nb313_fixH2(%rsp),%xmm3
        addps nb313_fiyH2(%rsp),%xmm4
        addps nb313_fizH2(%rsp),%xmm5

        addps %xmm6,%xmm14
        addps %xmm7,%xmm15
        addps %xmm8,%xmm13
        addps nb313_fixM(%rsp),%xmm6
        addps nb313_fiyM(%rsp),%xmm7
        addps nb313_fizM(%rsp),%xmm8

        movaps %xmm0,nb313_fixH1(%rsp)
        movaps %xmm1,nb313_fiyH1(%rsp)
        movaps %xmm2,nb313_fizH1(%rsp)
        movaps %xmm3,nb313_fixH2(%rsp)
        movaps %xmm4,nb313_fiyH2(%rsp)
        movaps %xmm5,nb313_fizH2(%rsp)
        movaps %xmm6,nb313_fixM(%rsp)
        movaps %xmm7,nb313_fiyM(%rsp)
        movaps %xmm8,nb313_fizM(%rsp)

        ## xmm14 = fjx
        ## xmm15 = fjy
        ## xmm13 = fjz
        ## interleave fjx/fjy into x,y pairs matching xmm11/xmm12
        movaps %xmm14,%xmm0
        unpcklps %xmm15,%xmm14
        unpckhps %xmm15,%xmm0

        addps %xmm14,%xmm11
        addps %xmm0,%xmm12

        movhlps %xmm13,%xmm14           ## fjzc fjzd

        movlps %xmm11,(%rdi,%rax,4)
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?