nb_kernel133_x86_64_sse2.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,141 行 · 第 1/5 页

S
2,141
字号
    ## ----------------------------------------------------------------------
    ## Excerpt from the interior of nb_kernel133_x86_64_sse2
    ## (x86-64, SSE2 packed double, GAS AT&T operand order: op src,dst).
    ##
    ## The excerpt starts part-way through the unrolled inner loop and ends
    ## part-way through the single-j-atom path; the code before and after it
    ## is not shown here.  One statement per line has been restored: in the
    ## collapsed form every "##" comment swallowed the instructions that
    ## followed it, and two labels were fused onto the preceding jump targets.
    ##
    ## Conventions visible in this excerpt:
    ##   nb133_xxx(%rsp)  per-call scratch slots (offsets defined elsewhere)
    ##   nb133_xxx(%rbp)  pointers read through %rbp (defined elsewhere)
    ##   %rax, %rbx       element offsets of the two j atoms handled by one
    ##                    unrolled iteration (low lane / high lane)
    ## NOTE(review): xmm3, xmm5, xmm6, xmm7 and xmm0-xmm2 are live on entry to
    ## this excerpt; their setup is outside the visible code.
    ## ----------------------------------------------------------------------

    ## --- O atom: finish the inverse-sqrt refinement of rinvO ---------------
    mulpd     %xmm5,%xmm7
    mulpd     %xmm6,%xmm7                   ## xmm7 = iter1 of rinv (new lu)
    movapd    %xmm7,%xmm5                   ## copy of lu
    mulpd     %xmm7,%xmm7                   ## lu*lu
    movapd    nb133_three(%rsp),%xmm15
    mulpd     %xmm3,%xmm7                   ## rsq*lu*lu
    movapd    nb133_half(%rsp),%xmm6
    subpd     %xmm7,%xmm15                  ## 3-rsq*lu*lu
    mulpd     %xmm5,%xmm15
    mulpd     %xmm6,%xmm15                  ## xmm15=rinv
    mulpd     %xmm15,%xmm3                  ## xmm3=r

    ## --- O atom: table lookup index and fraction ---------------------------
    ## xmm15=rinv, xmm3=r
    mulpd     nb133_tsc(%rsp),%xmm3         ## rtab
    ## truncate and convert to integers (result goes to MMX register mm6)
    cvttpd2pi %xmm3,%mm6
    ## convert back to float
    cvtpi2pd  %mm6,%xmm4
    ## multiply by 8
    pslld     $3,%mm6
    ## calculate eps
    subpd     %xmm4,%xmm3                   ## xmm3=eps
    ## move to integer registers
    movd      %mm6,%r10d                    ## index for the low lane
    psrlq     $32,%mm6
    movd      %mm6,%r11d                    ## index for the high lane

    ## xmm3=eps, xmm15=rinv
    movq      nb133_VFtab(%rbp),%rsi
    ## indices in r10, r11. Load dispersion and repulsion tables in parallel.
    ## Each index addresses 64 bytes: 4 doubles at +0/+16 and 4 at +32/+48.
    movapd    (%rsi,%r10,8),%xmm4           ## Y1d F1d
    movapd    (%rsi,%r11,8),%xmm12          ## Y2d F2d
    movapd    32(%rsi,%r10,8),%xmm8         ## Y1r F1r
    movapd    32(%rsi,%r11,8),%xmm13        ## Y2r F2r
    movapd    %xmm4,%xmm5
    movapd    %xmm8,%xmm9
    unpcklpd  %xmm12,%xmm4                  ## Y1d Y2d
    unpckhpd  %xmm12,%xmm5                  ## F1d F2d
    unpcklpd  %xmm13,%xmm8                  ## Y1r Y2r
    unpckhpd  %xmm13,%xmm9                  ## F1r F2r

    movapd    16(%rsi,%r10,8),%xmm6         ## G1d H1d
    movapd    16(%rsi,%r11,8),%xmm12        ## G2d H2d
    movapd    48(%rsi,%r10,8),%xmm10        ## G1r H1r
    movapd    48(%rsi,%r11,8),%xmm13        ## G2r H2r
    movapd    %xmm6,%xmm7
    movapd    %xmm10,%xmm11
    unpcklpd  %xmm12,%xmm6                  ## G1d G2d
    unpckhpd  %xmm12,%xmm7                  ## H1d H2d
    unpcklpd  %xmm13,%xmm10                 ## G1r G2r
    unpckhpd  %xmm13,%xmm11                 ## H1r H2r
    ## dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11

    ## --- O atom: evaluate VV and FF for both tables ------------------------
    ## Dispersion and repulsion instructions are interleaved pairwise.
    mulpd     %xmm3,%xmm7                   ## Heps
    mulpd     %xmm3,%xmm11
    mulpd     %xmm3,%xmm6                   ## Geps
    mulpd     %xmm3,%xmm10
    mulpd     %xmm3,%xmm7                   ## Heps2
    mulpd     %xmm3,%xmm11
    addpd     %xmm6,%xmm5                   ## F+Geps
    addpd     %xmm10,%xmm9
    addpd     %xmm7,%xmm5                   ## F+Geps+Heps2 = Fp
    addpd     %xmm11,%xmm9
    addpd     %xmm7,%xmm7                   ## 2*Heps2
    addpd     %xmm11,%xmm11
    addpd     %xmm6,%xmm7                   ## 2*Heps2+Geps
    addpd     %xmm10,%xmm11
    addpd     %xmm5,%xmm7                   ## FF = Fp + 2*Heps2 + Geps
    addpd     %xmm9,%xmm11
    mulpd     %xmm3,%xmm5                   ## eps*Fp
    mulpd     %xmm3,%xmm9
    movapd    nb133_c6(%rsp),%xmm12
    movapd    nb133_c12(%rsp),%xmm13
    addpd     %xmm4,%xmm5                   ## VV
    addpd     %xmm8,%xmm9

    mulpd     %xmm12,%xmm5                  ## VV*c6 = vnb6
    mulpd     %xmm13,%xmm9                  ## VV*c12 = vnb12
    addpd     %xmm9,%xmm5
    addpd     nb133_Vvdwtot(%rsp),%xmm5
    movapd    %xmm5,nb133_Vvdwtot(%rsp)     ## accumulate Vvdwtot

    mulpd     %xmm12,%xmm7                  ## FF*c6 = fnb6
    mulpd     %xmm13,%xmm11                 ## FF*c12  = fnb12
    addpd     %xmm11,%xmm7

    mulpd     nb133_tsc(%rsp),%xmm7
    mulpd     %xmm15,%xmm7                  ## -fscal
    xorpd     %xmm9,%xmm9
    subpd     %xmm7,%xmm9                   ## fscal (0 - xmm7)

    movapd    %xmm9,%xmm10
    movapd    %xmm9,%xmm11

    mulpd     nb133_dxO(%rsp),%xmm9         ## fx/fy/fz
    mulpd     nb133_dyO(%rsp),%xmm10
    mulpd     nb133_dzO(%rsp),%xmm11

    ## save j force temporarily
    movapd    %xmm9,nb133_fjx(%rsp)
    movapd    %xmm10,nb133_fjy(%rsp)
    movapd    %xmm11,nb133_fjz(%rsp)

    ## increment i O force
    addpd     nb133_fixO(%rsp),%xmm9
    addpd     nb133_fiyO(%rsp),%xmm10
    addpd     nb133_fizO(%rsp),%xmm11
    movapd    %xmm9,nb133_fixO(%rsp)
    movapd    %xmm10,nb133_fiyO(%rsp)
    movapd    %xmm11,nb133_fizO(%rsp)
    ## finished O LJ interaction.

    ## --- H1, H2 and M: distance vectors and squared distances --------------
    ## do H1, H2, and M interactions in parallel.
    ## xmm0-xmm2 still contain j coordinates.
    movapd    %xmm0,%xmm3
    movapd    %xmm1,%xmm4
    movapd    %xmm2,%xmm5
    movapd    %xmm0,%xmm6
    movapd    %xmm1,%xmm7
    movapd    %xmm2,%xmm8

    subpd     nb133_ixH1(%rsp),%xmm0
    subpd     nb133_iyH1(%rsp),%xmm1
    subpd     nb133_izH1(%rsp),%xmm2
    subpd     nb133_ixH2(%rsp),%xmm3
    subpd     nb133_iyH2(%rsp),%xmm4
    subpd     nb133_izH2(%rsp),%xmm5
    subpd     nb133_ixM(%rsp),%xmm6
    subpd     nb133_iyM(%rsp),%xmm7
    subpd     nb133_izM(%rsp),%xmm8

    ## Store each dr component, then square it in place.
    movapd    %xmm0,nb133_dxH1(%rsp)
    movapd    %xmm1,nb133_dyH1(%rsp)
    movapd    %xmm2,nb133_dzH1(%rsp)
    mulpd     %xmm0,%xmm0
    mulpd     %xmm1,%xmm1
    mulpd     %xmm2,%xmm2
    movapd    %xmm3,nb133_dxH2(%rsp)
    movapd    %xmm4,nb133_dyH2(%rsp)
    movapd    %xmm5,nb133_dzH2(%rsp)
    mulpd     %xmm3,%xmm3
    mulpd     %xmm4,%xmm4
    mulpd     %xmm5,%xmm5
    movapd    %xmm6,nb133_dxM(%rsp)
    movapd    %xmm7,nb133_dyM(%rsp)
    movapd    %xmm8,nb133_dzM(%rsp)
    mulpd     %xmm6,%xmm6
    mulpd     %xmm7,%xmm7
    mulpd     %xmm8,%xmm8
    addpd     %xmm1,%xmm0
    addpd     %xmm2,%xmm0                   ## xmm0 = rsq for H1
    addpd     %xmm4,%xmm3
    addpd     %xmm5,%xmm3                   ## xmm3 = rsq for H2
    addpd     %xmm7,%xmm6
    addpd     %xmm8,%xmm6                   ## xmm6 = rsq for M

    ## --- H1, H2 and M: inverse square roots --------------------------------
    ## start doing invsqrt for j atoms
    ## Single-precision rsqrtps seed, converted back to double and refined.
    cvtpd2ps  %xmm0,%xmm1
    cvtpd2ps  %xmm3,%xmm4
    cvtpd2ps  %xmm6,%xmm7
    rsqrtps   %xmm1,%xmm1
    rsqrtps   %xmm4,%xmm4
    rsqrtps   %xmm7,%xmm7
    cvtps2pd  %xmm1,%xmm1
    cvtps2pd  %xmm4,%xmm4
    cvtps2pd  %xmm7,%xmm7

    movapd    %xmm1,%xmm2
    movapd    %xmm4,%xmm5
    movapd    %xmm7,%xmm8

    mulpd     %xmm1,%xmm1                   ## lu*lu
    mulpd     %xmm4,%xmm4                   ## lu*lu
    mulpd     %xmm7,%xmm7                   ## lu*lu

    movapd    nb133_three(%rsp),%xmm9
    movapd    %xmm9,%xmm10
    movapd    %xmm9,%xmm11

    mulpd     %xmm0,%xmm1                   ## rsq*lu*lu
    mulpd     %xmm3,%xmm4                   ## rsq*lu*lu
    mulpd     %xmm6,%xmm7                   ## rsq*lu*lu

    subpd     %xmm1,%xmm9
    subpd     %xmm4,%xmm10
    subpd     %xmm7,%xmm11                  ## 3-rsq*lu*lu

    mulpd     %xmm2,%xmm9
    mulpd     %xmm5,%xmm10
    mulpd     %xmm8,%xmm11                  ## lu*(3-rsq*lu*lu)

    movapd    nb133_half(%rsp),%xmm15
    mulpd     %xmm15,%xmm9                  ## first iteration for rinvH1
    mulpd     %xmm15,%xmm10                 ## first iteration for rinvH2
    mulpd     %xmm15,%xmm11                 ## first iteration for rinvM

    ## second iteration step
    movapd    %xmm9,%xmm2
    movapd    %xmm10,%xmm5
    movapd    %xmm11,%xmm8

    mulpd     %xmm2,%xmm2                   ## lu*lu
    mulpd     %xmm5,%xmm5                   ## lu*lu
    mulpd     %xmm8,%xmm8                   ## lu*lu

    movapd    nb133_three(%rsp),%xmm1
    movapd    %xmm1,%xmm4
    movapd    %xmm1,%xmm7

    mulpd     %xmm0,%xmm2                   ## rsq*lu*lu
    mulpd     %xmm3,%xmm5                   ## rsq*lu*lu
    mulpd     %xmm6,%xmm8                   ## rsq*lu*lu

    subpd     %xmm2,%xmm1
    subpd     %xmm5,%xmm4
    subpd     %xmm8,%xmm7                   ## 3-rsq*lu*lu

    mulpd     %xmm1,%xmm9
    mulpd     %xmm4,%xmm10
    mulpd     %xmm7,%xmm11                  ## lu*(3-rsq*lu*lu)

    movapd    nb133_half(%rsp),%xmm15
    mulpd     %xmm15,%xmm9                  ## rinvH1
    mulpd     %xmm15,%xmm10                 ## rinvH2
    mulpd     %xmm15,%xmm11                 ## rinvM

    ## --- H1, H2 and M: interactions ----------------------------------------
    movapd    %xmm9,%xmm0
    movapd    %xmm10,%xmm1
    movapd    %xmm11,%xmm2
    mulpd     %xmm9,%xmm9                   ## rinvH1 squared
    mulpd     %xmm10,%xmm10                 ## rinvH2 squared
    mulpd     %xmm11,%xmm11                 ## rinvM squared
    mulpd     nb133_qqH(%rsp),%xmm0         ## rinvH1*qqH
    mulpd     nb133_qqH(%rsp),%xmm1         ## rinvH2*qqH
    mulpd     nb133_qqM(%rsp),%xmm2         ## rinvM*qqM
    mulpd     %xmm0,%xmm9                   ## scalar force factor, H1
    mulpd     %xmm1,%xmm10                  ## scalar force factor, H2
    mulpd     %xmm2,%xmm11                  ## scalar force factor, M

    ## Sum the three rinv*qq terms into vctot.
    addpd     nb133_vctot(%rsp),%xmm0
    addpd     %xmm2,%xmm1
    addpd     %xmm1,%xmm0
    movapd    %xmm0,nb133_vctot(%rsp)

    ## move j forces to xmm0-xmm2
    ## Low lane comes from offset %rax, high lane from offset %rbx.
    movq      nb133_faction(%rbp),%rdi
    movlpd    (%rdi,%rax,8),%xmm0
    movlpd    8(%rdi,%rax,8),%xmm1
    movlpd    16(%rdi,%rax,8),%xmm2
    movhpd    (%rdi,%rbx,8),%xmm0
    movhpd    8(%rdi,%rbx,8),%xmm1
    movhpd    16(%rdi,%rbx,8),%xmm2

    ## Replicate each scalar force factor into x/y/z registers.
    ## xmm11 must be copied out (to xmm13-xmm15) before it is overwritten
    ## with the H2 factor from xmm10.
    movapd    %xmm9,%xmm7
    movapd    %xmm9,%xmm8
    movapd    %xmm11,%xmm13
    movapd    %xmm11,%xmm14
    movapd    %xmm11,%xmm15
    movapd    %xmm10,%xmm11
    movapd    %xmm10,%xmm12

    ## add forces from O interaction
    addpd     nb133_fjx(%rsp),%xmm0
    addpd     nb133_fjy(%rsp),%xmm1
    addpd     nb133_fjz(%rsp),%xmm2

    mulpd     nb133_dxH1(%rsp),%xmm7
    mulpd     nb133_dyH1(%rsp),%xmm8
    mulpd     nb133_dzH1(%rsp),%xmm9
    mulpd     nb133_dxH2(%rsp),%xmm10
    mulpd     nb133_dyH2(%rsp),%xmm11
    mulpd     nb133_dzH2(%rsp),%xmm12
    mulpd     nb133_dxM(%rsp),%xmm13
    mulpd     nb133_dyM(%rsp),%xmm14
    mulpd     nb133_dzM(%rsp),%xmm15

    ## For each site: add its vector to the j accumulators (xmm0-xmm2),
    ## then add the running i-site total so it can be stored back below.
    addpd     %xmm7,%xmm0
    addpd     %xmm8,%xmm1
    addpd     %xmm9,%xmm2
    addpd     nb133_fixH1(%rsp),%xmm7
    addpd     nb133_fiyH1(%rsp),%xmm8
    addpd     nb133_fizH1(%rsp),%xmm9

    addpd     %xmm10,%xmm0
    addpd     %xmm11,%xmm1
    addpd     %xmm12,%xmm2
    addpd     nb133_fixH2(%rsp),%xmm10
    addpd     nb133_fiyH2(%rsp),%xmm11
    addpd     nb133_fizH2(%rsp),%xmm12

    addpd     %xmm13,%xmm0
    addpd     %xmm14,%xmm1
    addpd     %xmm15,%xmm2
    addpd     nb133_fixM(%rsp),%xmm13
    addpd     nb133_fiyM(%rsp),%xmm14
    addpd     nb133_fizM(%rsp),%xmm15

    movapd    %xmm7,nb133_fixH1(%rsp)
    movapd    %xmm8,nb133_fiyH1(%rsp)
    movapd    %xmm9,nb133_fizH1(%rsp)
    movapd    %xmm10,nb133_fixH2(%rsp)
    movapd    %xmm11,nb133_fiyH2(%rsp)
    movapd    %xmm12,nb133_fizH2(%rsp)
    movapd    %xmm13,nb133_fixM(%rsp)
    movapd    %xmm14,nb133_fiyM(%rsp)
    movapd    %xmm15,nb133_fizM(%rsp)

    ## store back j forces from xmm0-xmm2
    movlpd    %xmm0,(%rdi,%rax,8)
    movlpd    %xmm1,8(%rdi,%rax,8)
    movlpd    %xmm2,16(%rdi,%rax,8)
    movhpd    %xmm0,(%rdi,%rbx,8)
    movhpd    %xmm1,8(%rdi,%rbx,8)
    movhpd    %xmm2,16(%rdi,%rbx,8)

    ## should we do one more iteration?
    ## innerk is decremented by 2 per pass; a negative result leaves the loop.
    subl      $2,nb133_innerk(%rsp)
    jl        _nb_kernel133_x86_64_sse2.nb133_checksingle
    jmp       _nb_kernel133_x86_64_sse2.nb133_unroll_loop

_nb_kernel133_x86_64_sse2.nb133_checksingle:
    ## Bit 0 of innerk selects whether one more j atom is processed.
    movl      nb133_innerk(%rsp),%edx
    andl      $1,%edx
    jnz       _nb_kernel133_x86_64_sse2.nb133_dosingle
    jmp       _nb_kernel133_x86_64_sse2.nb133_updateouterdata

_nb_kernel133_x86_64_sse2.nb133_dosingle:
    ## Single j atom: scalar (sd) arithmetic in the low lane.
    movq      nb133_innerjjnr(%rsp),%rdx    ## pointer to jjnr[k]
    movl      (%rdx),%eax
    addq      $4,nb133_innerjjnr(%rsp)      ## advance by one 32-bit entry

    movq      nb133_charge(%rbp),%rsi       ## base of charge[]
    xorpd     %xmm3,%xmm3                   ## clear so the high lane is zero
    movlpd    (%rsi,%rax,8),%xmm3
    movapd    %xmm3,%xmm4
    mulsd     nb133_iqM(%rsp),%xmm3
    mulsd     nb133_iqH(%rsp),%xmm4
    movapd    %xmm3,nb133_qqM(%rsp)
    movapd    %xmm4,nb133_qqH(%rsp)

    movq      nb133_type(%rbp),%rsi
    movl      (%rsi,%rax,4),%r8d
    movq      nb133_vdwparam(%rbp),%rsi
    shll      %r8d                          ## r8d = 2*type[jnr]
    movl      nb133_ntia(%rsp),%edi
    addl      %edi,%r8d                     ## r8d = 2*type[jnr] + ntia

    movlpd    (%rsi,%r8,8),%xmm6            ## c6a
    movhpd    8(%rsi,%r8,8),%xmm6           ## c6a c12a
    xorpd     %xmm7,%xmm7
    movapd    %xmm6,%xmm4
    unpcklpd  %xmm7,%xmm4                   ## c6a 0
    unpckhpd  %xmm7,%xmm6                   ## c12a 0
    movapd    %xmm4,nb133_c6(%rsp)
    movapd    %xmm6,nb133_c12(%rsp)

    movq      nb133_pos(%rbp),%rsi          ## base of pos[]
    lea       (%rax,%rax,2),%rax            ## replace jnr with j3

    ## move coordinates to xmm0-xmm2  and xmm4-xmm6
    movlpd    (%rsi,%rax,8),%xmm4
    movlpd    8(%rsi,%rax,8),%xmm5
    movlpd    16(%rsi,%rax,8),%xmm6
    movapd    %xmm4,%xmm0
    movapd    %xmm5,%xmm1
    movapd    %xmm6,%xmm2

    ## calc dr
    subsd     nb133_ixO(%rsp),%xmm4
    subsd     nb133_iyO(%rsp),%xmm5
    subsd     nb133_izO(%rsp),%xmm6

    ## store dr
    movapd    %xmm4,nb133_dxO(%rsp)
    movapd    %xmm5,nb133_dyO(%rsp)
    movapd    %xmm6,nb133_dzO(%rsp)

    ## square it
    mulsd     %xmm4,%xmm4
    mulsd     %xmm5,%xmm5
    mulsd     %xmm6,%xmm6
    addsd     %xmm5,%xmm4
    addsd     %xmm6,%xmm4
    movapd    %xmm4,%xmm7
    ## rsqO in xmm7
    movapd    %xmm7,nb133_rsqO(%rsp)

    ## move j coords to xmm4-xmm6
    movapd    %xmm0,%xmm4
    movapd    %xmm1,%xmm5
    movapd    %xmm2,%xmm6

    ## calc dr
    subsd     nb133_ixH1(%rsp),%xmm4
    subsd     nb133_iyH1(%rsp),%xmm5
    subsd     nb133_izH1(%rsp),%xmm6

    ## store dr
    movapd    %xmm4,nb133_dxH1(%rsp)
    movapd    %xmm5,nb133_dyH1(%rsp)
    movapd    %xmm6,nb133_dzH1(%rsp)

    ## square it
    mulsd     %xmm4,%xmm4
    mulsd     %xmm5,%xmm5
    mulsd     %xmm6,%xmm6
    addsd     %xmm5,%xmm6
    addsd     %xmm4,%xmm6
    ## rsqH1 in xmm6

    ## move j coords to xmm3-xmm5
    movapd    %xmm0,%xmm3
    movapd    %xmm1,%xmm4
    movapd    %xmm2,%xmm5

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?