nb_kernel332_x86_64_sse2.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,381 行 · 第 1/5 页

S
2,381
字号
    ## ----------------------------------------------------------------------
    ## NOTE(review): this is a fragment from the middle of a longer routine.
    ## The nb332_* offsets, the incoming contents of xmm0-xmm15 and the
    ## values of rax/rbx are established outside this view - confirm there.
    ## Throughout, the low lane of each xmm register is loaded/stored via
    ## rax (movlpd) and the high lane via rbx (movhpd).
    ## ----------------------------------------------------------------------
    mulpd     %xmm15,%xmm11                 ## first iteration for rinvH2O

    ## second iteration step: lu <- half * lu * (three - rsq*lu*lu)
    movapd    %xmm9,%xmm2
    movapd    %xmm10,%xmm5
    movapd    %xmm11,%xmm8
    mulpd     %xmm2,%xmm2                   ## lu*lu
    mulpd     %xmm5,%xmm5                   ## lu*lu
    mulpd     %xmm8,%xmm8                   ## lu*lu
    movapd    nb332_three(%rsp),%xmm1
    movapd    %xmm1,%xmm4
    movapd    %xmm1,%xmm7
    mulpd     %xmm0,%xmm2                   ## rsq*lu*lu
    mulpd     %xmm3,%xmm5                   ## rsq*lu*lu
    mulpd     %xmm6,%xmm8                   ## rsq*lu*lu
    subpd     %xmm2,%xmm1
    subpd     %xmm5,%xmm4
    subpd     %xmm8,%xmm7                   ## 3-rsq*lu*lu
    mulpd     %xmm1,%xmm9
    mulpd     %xmm4,%xmm10
    mulpd     %xmm7,%xmm11                  ## lu*(3-rsq*lu*lu)
    movapd    nb332_half(%rsp),%xmm15
    mulpd     %xmm15,%xmm9                  ## rinvOO
    mulpd     %xmm15,%xmm10                 ## rinvH1O
    mulpd     %xmm15,%xmm11                 ## rinvH2O

    ## keep the three inverse distances for the force scaling further down
    movapd    %xmm9,nb332_rinvOO(%rsp)
    movapd    %xmm10,nb332_rinvH1O(%rsp)
    movapd    %xmm11,nb332_rinvH2O(%rsp)

    ## O interactions
    ## rsq in xmm0,xmm3,xmm6
    ## rinv in xmm9, xmm10, xmm11
    movapd    nb332_tsc(%rsp),%xmm1
    mulpd     %xmm9,%xmm0                   ## r = rsq*rinv
    mulpd     %xmm10,%xmm3
    mulpd     %xmm11,%xmm6
    mulpd     %xmm1,%xmm0                   ## rtab = r*tsc
    mulpd     %xmm1,%xmm3
    mulpd     %xmm1,%xmm6

    ## truncate and convert to integers
    cvttpd2dq %xmm0,%xmm1
    cvttpd2dq %xmm3,%xmm4
    cvttpd2dq %xmm6,%xmm7

    ## convert back to float
    cvtdq2pd  %xmm1,%xmm2
    cvtdq2pd  %xmm4,%xmm5
    cvtdq2pd  %xmm7,%xmm8

    ## multiply by 4
    pslld     $2,%xmm1
    pslld     $2,%xmm4
    pslld     $2,%xmm7

    ## multiply by three (copy, mult. by two, add back)
    ## -> index*12; the loads below read 8-byte entries at byte offsets 0..88
    ## The copies are integer data, so use movdqa as the H1 block below does.
    movdqa    %xmm1,%xmm10
    movdqa    %xmm4,%xmm11
    movdqa    %xmm7,%xmm12
    pslld     $1,%xmm1
    pslld     $1,%xmm4
    pslld     $1,%xmm7
    paddd     %xmm10,%xmm1
    paddd     %xmm11,%xmm4
    paddd     %xmm12,%xmm7

    ## move to integer registers
    ## pshufd $1 brings dword element 1 into the low dword so movd can read it
    pshufd    $1,%xmm1,%xmm13
    pshufd    $1,%xmm4,%xmm14
    pshufd    $1,%xmm7,%xmm15
    movd      %xmm1,%r8d                    ## first pair:  low-lane index
    movd      %xmm4,%r10d                   ## second pair: low-lane index
    movd      %xmm7,%r12d                   ## third pair:  low-lane index
    movd      %xmm13,%r9d                   ## first pair:  high-lane index
    movd      %xmm14,%r11d                  ## second pair: high-lane index
    movd      %xmm15,%r13d                  ## third pair:  high-lane index

    movq      nb332_VFtab(%rbp),%rsi

    ## calculate eps = rtab - trunc(rtab)
    subpd     %xmm2,%xmm0
    subpd     %xmm5,%xmm3
    subpd     %xmm8,%xmm6
    movapd    %xmm0,nb332_epsO(%rsp)
    movapd    %xmm3,nb332_epsH1(%rsp)
    movapd    %xmm6,nb332_epsH2(%rsp)

    ## Load LOTS of table data
    ## low lanes
    movlpd    (%rsi,%r8,8),%xmm0
    movlpd    8(%rsi,%r8,8),%xmm1
    movlpd    16(%rsi,%r8,8),%xmm2
    movlpd    24(%rsi,%r8,8),%xmm3
    movlpd    (%rsi,%r10,8),%xmm4
    movlpd    8(%rsi,%r10,8),%xmm5
    movlpd    16(%rsi,%r10,8),%xmm6
    movlpd    24(%rsi,%r10,8),%xmm7
    movlpd    (%rsi,%r12,8),%xmm8
    movlpd    8(%rsi,%r12,8),%xmm9
    movlpd    16(%rsi,%r12,8),%xmm10
    movlpd    24(%rsi,%r12,8),%xmm11
    ## high lanes
    movhpd    (%rsi,%r9,8),%xmm0
    movhpd    8(%rsi,%r9,8),%xmm1
    movhpd    16(%rsi,%r9,8),%xmm2
    movhpd    24(%rsi,%r9,8),%xmm3
    movhpd    (%rsi,%r11,8),%xmm4
    movhpd    8(%rsi,%r11,8),%xmm5
    movhpd    16(%rsi,%r11,8),%xmm6
    movhpd    24(%rsi,%r11,8),%xmm7
    movhpd    (%rsi,%r13,8),%xmm8
    movhpd    8(%rsi,%r13,8),%xmm9
    movhpd    16(%rsi,%r13,8),%xmm10
    movhpd    24(%rsi,%r13,8),%xmm11
    ## table data ready in xmm0-xmm3 , xmm4-xmm7 , and xmm8-xmm11

    mulpd     nb332_epsO(%rsp),%xmm3        ## Heps
    mulpd     nb332_epsH1(%rsp),%xmm7
    mulpd     nb332_epsH2(%rsp),%xmm11
    mulpd     nb332_epsO(%rsp),%xmm2        ## Geps
    mulpd     nb332_epsH1(%rsp),%xmm6
    mulpd     nb332_epsH2(%rsp),%xmm10
    mulpd     nb332_epsO(%rsp),%xmm3        ## Heps2
    mulpd     nb332_epsH1(%rsp),%xmm7
    mulpd     nb332_epsH2(%rsp),%xmm11
    addpd     %xmm2,%xmm1                   ## F+Geps
    addpd     %xmm6,%xmm5
    addpd     %xmm10,%xmm9
    addpd     %xmm3,%xmm1                   ## F+Geps+Heps2 = Fp
    addpd     %xmm7,%xmm5
    addpd     %xmm11,%xmm9
    addpd     %xmm3,%xmm3                   ## 2*Heps2
    addpd     %xmm7,%xmm7
    addpd     %xmm11,%xmm11
    addpd     %xmm2,%xmm3                   ## 2*Heps2+Geps
    addpd     %xmm6,%xmm7
    addpd     %xmm10,%xmm11
    addpd     %xmm1,%xmm3                   ## FF = Fp + 2*Heps2 + Geps
    addpd     %xmm5,%xmm7
    addpd     %xmm9,%xmm11
    mulpd     nb332_epsO(%rsp),%xmm1        ## eps*Fp
    mulpd     nb332_epsH1(%rsp),%xmm5
    mulpd     nb332_epsH2(%rsp),%xmm9
    addpd     %xmm0,%xmm1                   ## VV
    addpd     %xmm4,%xmm5
    addpd     %xmm8,%xmm9
    mulpd     nb332_qqOO(%rsp),%xmm1        ## VV*qq = vcoul
    mulpd     nb332_qqOH(%rsp),%xmm5
    mulpd     nb332_qqOH(%rsp),%xmm9
    mulpd     nb332_qqOO(%rsp),%xmm3        ## FF*qq = fij
    mulpd     nb332_qqOH(%rsp),%xmm7
    mulpd     nb332_qqOH(%rsp),%xmm11

    ## accumulate vctot
    addpd     nb332_vctot(%rsp),%xmm1
    addpd     %xmm9,%xmm5
    addpd     %xmm5,%xmm1
    movapd    %xmm1,nb332_vctot(%rsp)

    movapd    %xmm7,%xmm2
    movapd    %xmm11,%xmm1
    ## fij coul in xmm3, xmm2, xmm1

    ## calculate LJ table
    ## only the r8/r9 indices (first pair) are used here
    movlpd    32(%rsi,%r8,8),%xmm4
    movlpd    40(%rsi,%r8,8),%xmm5
    movlpd    48(%rsi,%r8,8),%xmm6
    movlpd    56(%rsi,%r8,8),%xmm7
    movlpd    64(%rsi,%r8,8),%xmm8
    movlpd    72(%rsi,%r8,8),%xmm9
    movlpd    80(%rsi,%r8,8),%xmm10
    movlpd    88(%rsi,%r8,8),%xmm11
    movhpd    32(%rsi,%r9,8),%xmm4
    movhpd    40(%rsi,%r9,8),%xmm5
    movhpd    48(%rsi,%r9,8),%xmm6
    movhpd    56(%rsi,%r9,8),%xmm7
    movhpd    64(%rsi,%r9,8),%xmm8
    movhpd    72(%rsi,%r9,8),%xmm9
    movhpd    80(%rsi,%r9,8),%xmm10
    movhpd    88(%rsi,%r9,8),%xmm11
    ## dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11

    movapd    nb332_epsO(%rsp),%xmm0
    mulpd     %xmm0,%xmm7                   ## Heps
    mulpd     %xmm0,%xmm11
    mulpd     %xmm0,%xmm6                   ## Geps
    mulpd     %xmm0,%xmm10
    mulpd     %xmm0,%xmm7                   ## Heps2
    mulpd     %xmm0,%xmm11
    addpd     %xmm6,%xmm5                   ## F+Geps
    addpd     %xmm10,%xmm9
    addpd     %xmm7,%xmm5                   ## F+Geps+Heps2 = Fp
    addpd     %xmm11,%xmm9
    addpd     %xmm7,%xmm7                   ## 2*Heps2
    addpd     %xmm11,%xmm11
    addpd     %xmm6,%xmm7                   ## 2*Heps2+Geps
    addpd     %xmm10,%xmm11
    addpd     %xmm5,%xmm7                   ## FF = Fp + 2*Heps2 + Geps
    addpd     %xmm9,%xmm11
    mulpd     %xmm0,%xmm5                   ## eps*Fp
    mulpd     %xmm0,%xmm9
    movapd    nb332_c6(%rsp),%xmm12
    movapd    nb332_c12(%rsp),%xmm13
    addpd     %xmm4,%xmm5                   ## VV
    addpd     %xmm8,%xmm9
    mulpd     %xmm12,%xmm5                  ## VV*c6 = vnb6
    mulpd     %xmm13,%xmm9                  ## VV*c12 = vnb12
    addpd     %xmm9,%xmm5
    addpd     nb332_Vvdwtot(%rsp),%xmm5
    movapd    %xmm5,nb332_Vvdwtot(%rsp)

    mulpd     %xmm12,%xmm7                  ## FF*c6 = fnb6
    mulpd     %xmm13,%xmm11                 ## FF*c12 = fnb12
    addpd     %xmm11,%xmm7
    addpd     %xmm7,%xmm3                   ## add LJ part to the first pair's coulomb fij

    movapd    nb332_tsc(%rsp),%xmm10
    mulpd     %xmm10,%xmm3                  ## fscal
    mulpd     %xmm10,%xmm2
    mulpd     %xmm10,%xmm1

    ## move j O forces to xmm11-xmm13
    movq      nb332_faction(%rbp),%rdi
    movlpd    (%rdi,%rax,8),%xmm11
    movlpd    8(%rdi,%rax,8),%xmm12
    movlpd    16(%rdi,%rax,8),%xmm13
    movhpd    (%rdi,%rbx,8),%xmm11
    movhpd    8(%rdi,%rbx,8),%xmm12
    movhpd    16(%rdi,%rbx,8),%xmm13

    ## negate the three scalars (0 - x), then scale by the stored rinv
    xorpd     %xmm0,%xmm0
    xorpd     %xmm4,%xmm4
    xorpd     %xmm8,%xmm8
    subpd     %xmm3,%xmm0
    subpd     %xmm2,%xmm4
    subpd     %xmm1,%xmm8
    mulpd     nb332_rinvOO(%rsp),%xmm0
    mulpd     nb332_rinvH1O(%rsp),%xmm4
    mulpd     nb332_rinvH2O(%rsp),%xmm8

    ## replicate each scalar for its x/y/z components
    movapd    %xmm0,%xmm1
    movapd    %xmm0,%xmm2
    movapd    %xmm4,%xmm3
    movapd    %xmm4,%xmm5
    movapd    %xmm8,%xmm6
    movapd    %xmm8,%xmm7

    ## multiply by the stored displacement components
    mulpd     nb332_dxOO(%rsp),%xmm0
    mulpd     nb332_dyOO(%rsp),%xmm1
    mulpd     nb332_dzOO(%rsp),%xmm2
    mulpd     nb332_dxH1O(%rsp),%xmm3
    mulpd     nb332_dyH1O(%rsp),%xmm4
    mulpd     nb332_dzH1O(%rsp),%xmm5
    mulpd     nb332_dxH2O(%rsp),%xmm6
    mulpd     nb332_dyH2O(%rsp),%xmm7
    mulpd     nb332_dzH2O(%rsp),%xmm8

    ## add each contribution to the j O force (xmm11-xmm13) and to the
    ## matching i-atom accumulator loaded from the stack
    addpd     %xmm0,%xmm11
    addpd     %xmm1,%xmm12
    addpd     %xmm2,%xmm13
    addpd     nb332_fixO(%rsp),%xmm0
    addpd     nb332_fiyO(%rsp),%xmm1
    addpd     nb332_fizO(%rsp),%xmm2

    addpd     %xmm3,%xmm11
    addpd     %xmm4,%xmm12
    addpd     %xmm5,%xmm13
    addpd     nb332_fixH1(%rsp),%xmm3
    addpd     nb332_fiyH1(%rsp),%xmm4
    addpd     nb332_fizH1(%rsp),%xmm5

    addpd     %xmm6,%xmm11
    addpd     %xmm7,%xmm12
    addpd     %xmm8,%xmm13
    addpd     nb332_fixH2(%rsp),%xmm6
    addpd     nb332_fiyH2(%rsp),%xmm7
    addpd     nb332_fizH2(%rsp),%xmm8

    ## write the updated i-atom accumulators back to the stack
    movapd    %xmm0,nb332_fixO(%rsp)
    movapd    %xmm1,nb332_fiyO(%rsp)
    movapd    %xmm2,nb332_fizO(%rsp)
    movapd    %xmm3,nb332_fixH1(%rsp)
    movapd    %xmm4,nb332_fiyH1(%rsp)
    movapd    %xmm5,nb332_fizH1(%rsp)
    movapd    %xmm6,nb332_fixH2(%rsp)
    movapd    %xmm7,nb332_fiyH2(%rsp)
    movapd    %xmm8,nb332_fizH2(%rsp)

    ## store back j O forces from xmm11-xmm13
    movlpd    %xmm11,(%rdi,%rax,8)
    movlpd    %xmm12,8(%rdi,%rax,8)
    movlpd    %xmm13,16(%rdi,%rax,8)
    movhpd    %xmm11,(%rdi,%rbx,8)
    movhpd    %xmm12,8(%rdi,%rbx,8)
    movhpd    %xmm13,16(%rdi,%rbx,8)

    ## move j H1 coordinates to local temp variables
    ## (byte offsets 24/32/40 from the same rax/rbx base used for j O)
    movq      nb332_pos(%rbp),%rsi
    movlpd    24(%rsi,%rax,8),%xmm0
    movlpd    32(%rsi,%rax,8),%xmm1
    movlpd    40(%rsi,%rax,8),%xmm2
    movhpd    24(%rsi,%rbx,8),%xmm0
    movhpd    32(%rsi,%rbx,8),%xmm1
    movhpd    40(%rsi,%rbx,8),%xmm2

    ## xmm0 = H1x
    ## xmm1 = H1y
    ## xmm2 = H1z
    movapd    %xmm0,%xmm3
    movapd    %xmm1,%xmm4
    movapd    %xmm2,%xmm5
    movapd    %xmm0,%xmm6
    movapd    %xmm1,%xmm7
    movapd    %xmm2,%xmm8

    ## displacement = j H1 coordinate minus i-atom coordinate
    subpd     nb332_ixO(%rsp),%xmm0
    subpd     nb332_iyO(%rsp),%xmm1
    subpd     nb332_izO(%rsp),%xmm2
    subpd     nb332_ixH1(%rsp),%xmm3
    subpd     nb332_iyH1(%rsp),%xmm4
    subpd     nb332_izH1(%rsp),%xmm5
    subpd     nb332_ixH2(%rsp),%xmm6
    subpd     nb332_iyH2(%rsp),%xmm7
    subpd     nb332_izH2(%rsp),%xmm8

    ## save the displacements, then square each component
    movapd    %xmm0,nb332_dxOH1(%rsp)
    movapd    %xmm1,nb332_dyOH1(%rsp)
    movapd    %xmm2,nb332_dzOH1(%rsp)
    mulpd     %xmm0,%xmm0
    mulpd     %xmm1,%xmm1
    mulpd     %xmm2,%xmm2
    movapd    %xmm3,nb332_dxH1H1(%rsp)
    movapd    %xmm4,nb332_dyH1H1(%rsp)
    movapd    %xmm5,nb332_dzH1H1(%rsp)
    mulpd     %xmm3,%xmm3
    mulpd     %xmm4,%xmm4
    mulpd     %xmm5,%xmm5
    movapd    %xmm6,nb332_dxH2H1(%rsp)
    movapd    %xmm7,nb332_dyH2H1(%rsp)
    movapd    %xmm8,nb332_dzH2H1(%rsp)
    mulpd     %xmm6,%xmm6
    mulpd     %xmm7,%xmm7
    mulpd     %xmm8,%xmm8

    ## rsq = dx*dx + dy*dy + dz*dz, left in xmm0, xmm3, xmm6
    addpd     %xmm1,%xmm0
    addpd     %xmm2,%xmm0
    addpd     %xmm4,%xmm3
    addpd     %xmm5,%xmm3
    addpd     %xmm7,%xmm6
    addpd     %xmm8,%xmm6

    ## start doing invsqrt for jH1 atoms
    ## single-precision rsqrtps estimate, widened back to double
    cvtpd2ps  %xmm0,%xmm1
    cvtpd2ps  %xmm3,%xmm4
    cvtpd2ps  %xmm6,%xmm7
    rsqrtps   %xmm1,%xmm1
    rsqrtps   %xmm4,%xmm4
    rsqrtps   %xmm7,%xmm7
    cvtps2pd  %xmm1,%xmm1
    cvtps2pd  %xmm4,%xmm4
    cvtps2pd  %xmm7,%xmm7

    ## first iteration step: lu <- half * lu * (three - rsq*lu*lu)
    movapd    %xmm1,%xmm2
    movapd    %xmm4,%xmm5
    movapd    %xmm7,%xmm8
    mulpd     %xmm1,%xmm1                   ## lu*lu
    mulpd     %xmm4,%xmm4                   ## lu*lu
    mulpd     %xmm7,%xmm7                   ## lu*lu
    movapd    nb332_three(%rsp),%xmm9
    movapd    %xmm9,%xmm10
    movapd    %xmm9,%xmm11
    mulpd     %xmm0,%xmm1                   ## rsq*lu*lu
    mulpd     %xmm3,%xmm4                   ## rsq*lu*lu
    mulpd     %xmm6,%xmm7                   ## rsq*lu*lu
    subpd     %xmm1,%xmm9
    subpd     %xmm4,%xmm10
    subpd     %xmm7,%xmm11                  ## 3-rsq*lu*lu
    mulpd     %xmm2,%xmm9
    mulpd     %xmm5,%xmm10
    mulpd     %xmm8,%xmm11                  ## lu*(3-rsq*lu*lu)
    movapd    nb332_half(%rsp),%xmm15
    mulpd     %xmm15,%xmm9                  ## first iteration for rinvOH1
    mulpd     %xmm15,%xmm10                 ## first iteration for rinvH1H1
    mulpd     %xmm15,%xmm11                 ## first iteration for rinvH2H1

    ## second iteration step
    movapd    %xmm9,%xmm2
    movapd    %xmm10,%xmm5
    movapd    %xmm11,%xmm8
    mulpd     %xmm2,%xmm2                   ## lu*lu
    mulpd     %xmm5,%xmm5                   ## lu*lu
    mulpd     %xmm8,%xmm8                   ## lu*lu
    movapd    nb332_three(%rsp),%xmm1
    movapd    %xmm1,%xmm4
    movapd    %xmm1,%xmm7
    mulpd     %xmm0,%xmm2                   ## rsq*lu*lu
    mulpd     %xmm3,%xmm5                   ## rsq*lu*lu
    mulpd     %xmm6,%xmm8                   ## rsq*lu*lu
    subpd     %xmm2,%xmm1
    subpd     %xmm5,%xmm4
    subpd     %xmm8,%xmm7                   ## 3-rsq*lu*lu
    mulpd     %xmm1,%xmm9
    mulpd     %xmm4,%xmm10
    mulpd     %xmm7,%xmm11                  ## lu*(3-rsq*lu*lu)
    movapd    nb332_half(%rsp),%xmm15
    mulpd     %xmm15,%xmm9                  ## rinvOH1
    mulpd     %xmm15,%xmm10                 ## rinvH1H1
    mulpd     %xmm15,%xmm11                 ## rinvH2H1

    movapd    %xmm9,nb332_rinvOH1(%rsp)
    movapd    %xmm10,nb332_rinvH1H1(%rsp)
    movapd    %xmm11,nb332_rinvH2H1(%rsp)

    ## H1 interactions
    ## rsq in xmm0,xmm3,xmm6
    ## rinv in xmm9, xmm10, xmm11
    movapd    nb332_tsc(%rsp),%xmm1
    mulpd     %xmm9,%xmm0                   ## r = rsq*rinv
    mulpd     %xmm10,%xmm3
    mulpd     %xmm11,%xmm6
    mulpd     %xmm1,%xmm0                   ## rtab = r*tsc
    mulpd     %xmm1,%xmm3
    mulpd     %xmm1,%xmm6

    ## truncate and convert to integers
    cvttpd2dq %xmm0,%xmm1
    cvttpd2dq %xmm3,%xmm4
    cvttpd2dq %xmm6,%xmm7

    ## convert back to float
    cvtdq2pd  %xmm1,%xmm2
    cvtdq2pd  %xmm4,%xmm5
    cvtdq2pd  %xmm7,%xmm8

    ## multiply by 4
    pslld     $2,%xmm1
    pslld     $2,%xmm4
    pslld     $2,%xmm7

    ## multiply by three (copy, mult. by two, add back)
    movdqa    %xmm1,%xmm10
    movdqa    %xmm4,%xmm11
    movdqa    %xmm7,%xmm12
    pslld     $1,%xmm1
    pslld     $1,%xmm4
    pslld     $1,%xmm7
    paddd     %xmm10,%xmm1
    paddd     %xmm11,%xmm4
    paddd     %xmm12,%xmm7

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?