nb_kernel131_x86_64_sse2.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,044 行 · 第 1/4 页

S
2,044
字号
    ## ----------------------------------------------------------------------
    ## Interior fragment of the nb131 kernel (x86-64, SSE2, AT&T operand order).
    ##
    ## The function entry and the head of the unrolled loop are above this
    ## fragment; the rest of nb131_updateouterdata is below it.
    ##
    ## nb131_* names are offsets defined elsewhere in the file:
    ##   nb131_xxx(%rbp)  pointers fetched with movq (VFtab, faction, fshift,
    ##                    charge, type, vdwparam, pos)
    ##   nb131_xxx(%rsp)  local scratch slots (distances, rinv, accumulators)
    ##
    ## Register use visible in this fragment:
    ##   rax, rbx   j-atom offsets used to index faction[] (rbx: high lane)
    ##   r8, r10    table indices for the low / high lane
    ##   rsi, rdi   scratch base pointers
    ## ----------------------------------------------------------------------

    ## ---- tail of the unrolled (two j atoms, packed double) iteration -----
    ## NOTE(review): xmm1/xmm13 presumably hold the low/high table index and
    ## xmm0/xmm2 rtab and its truncated value, all computed above this
    ## fragment -- confirm against the preceding code.
    movd      %xmm1,%r8d
    movd      %xmm13,%r10d

    ## calculate eps
    subpd     %xmm2,%xmm0
    movq      nb131_VFtab(%rbp),%rsi

    ## eight consecutive table doubles per lane: low lane from index r8 ...
    movlpd    (%rsi,%r8,8),%xmm4
    movlpd    8(%rsi,%r8,8),%xmm5
    movlpd    16(%rsi,%r8,8),%xmm6
    movlpd    24(%rsi,%r8,8),%xmm7
    movlpd    32(%rsi,%r8,8),%xmm8
    movlpd    40(%rsi,%r8,8),%xmm9
    movlpd    48(%rsi,%r8,8),%xmm10
    movlpd    56(%rsi,%r8,8),%xmm11
    ## ... high lane from index r10
    movhpd    (%rsi,%r10,8),%xmm4
    movhpd    8(%rsi,%r10,8),%xmm5
    movhpd    16(%rsi,%r10,8),%xmm6
    movhpd    24(%rsi,%r10,8),%xmm7
    movhpd    32(%rsi,%r10,8),%xmm8
    movhpd    40(%rsi,%r10,8),%xmm9
    movhpd    48(%rsi,%r10,8),%xmm10
    movhpd    56(%rsi,%r10,8),%xmm11
    ## dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11

    ## the same polynomial is evaluated for both tables, interleaved
    mulpd     %xmm0,%xmm7               ## Heps
    mulpd     %xmm0,%xmm11
    mulpd     %xmm0,%xmm6               ## Geps
    mulpd     %xmm0,%xmm10
    mulpd     %xmm0,%xmm7               ## Heps2
    mulpd     %xmm0,%xmm11
    addpd     %xmm6,%xmm5               ## F+Geps
    addpd     %xmm10,%xmm9
    addpd     %xmm7,%xmm5               ## F+Geps+Heps2 = Fp
    addpd     %xmm11,%xmm9
    addpd     %xmm7,%xmm7               ## 2*Heps2
    addpd     %xmm11,%xmm11
    addpd     %xmm6,%xmm7               ## 2*Heps2+Geps
    addpd     %xmm10,%xmm11
    addpd     %xmm5,%xmm7               ## FF = Fp + 2*Heps2 + Geps
    addpd     %xmm9,%xmm11
    mulpd     %xmm0,%xmm5               ## eps*Fp
    mulpd     %xmm0,%xmm9
    movapd    nb131_c6(%rsp),%xmm12
    movapd    nb131_c12(%rsp),%xmm13
    addpd     %xmm4,%xmm5               ## VV
    addpd     %xmm8,%xmm9
    mulpd     %xmm12,%xmm5              ## VV*c6 = vnb6
    mulpd     %xmm13,%xmm9              ## VV*c12 = vnb12
    addpd     %xmm9,%xmm5
    addpd     nb131_Vvdwtot(%rsp),%xmm5
    movapd    %xmm5,nb131_Vvdwtot(%rsp) ## Vvdwtot += vnb6 + vnb12

    mulpd     %xmm12,%xmm7              ## FF*c6 = fnb6
    mulpd     %xmm13,%xmm11             ## FF*c12  = fnb12
    addpd     %xmm11,%xmm7
    mulpd     nb131_tsc(%rsp),%xmm7     ## xmm7 = (fnb6+fnb12)*tsc

    ## charge terms for the three i sites (O, H1, H2)
    movapd    nb131_rinvO(%rsp),%xmm9
    movapd    nb131_rinvH1(%rsp),%xmm10
    movapd    nb131_rinvH2(%rsp),%xmm11
    movapd    %xmm9,%xmm0
    movapd    %xmm10,%xmm1
    movapd    %xmm11,%xmm2
    mulpd     %xmm10,%xmm10             ## rinvH1^2
    mulpd     %xmm11,%xmm11             ## rinvH2^2
    mulpd     nb131_qqO(%rsp),%xmm0     ## xmm0 = rinvO*qqO
    mulpd     nb131_qqH(%rsp),%xmm1     ## xmm1 = rinvH1*qqH
    mulpd     nb131_qqH(%rsp),%xmm2     ## xmm2 = rinvH2*qqH
    mulpd     %xmm0,%xmm9               ## rinvO * (rinvO*qqO)
    mulpd     %xmm1,%xmm10              ## rinvH1^2 * (rinvH1*qqH)
    mulpd     %xmm2,%xmm11              ## rinvH2^2 * (rinvH2*qqH)
    subpd     %xmm7,%xmm9               ## O only: subtract the table force term
    mulpd     nb131_rinvO(%rsp),%xmm9   ## xmm9 = O scalar force factor
    addpd     nb131_vctot(%rsp),%xmm0
    addpd     %xmm2,%xmm1
    addpd     %xmm1,%xmm0
    movapd    %xmm0,nb131_vctot(%rsp)   ## vctot += xmm0 + xmm1 + xmm2

    ## move j forces to xmm0-xmm2
    movq      nb131_faction(%rbp),%rdi
    movlpd    (%rdi,%rax,8),%xmm0
    movlpd    8(%rdi,%rax,8),%xmm1
    movlpd    16(%rdi,%rax,8),%xmm2
    movhpd    (%rdi,%rbx,8),%xmm0
    movhpd    8(%rdi,%rbx,8),%xmm1
    movhpd    16(%rdi,%rbx,8),%xmm2

    ## replicate the three factors: O -> xmm7-9, H1 -> xmm10-12, H2 -> xmm13-15
    ## (xmm11 is copied out to xmm13-15 before being overwritten with xmm10)
    movapd    %xmm9,%xmm7
    movapd    %xmm9,%xmm8
    movapd    %xmm11,%xmm13
    movapd    %xmm11,%xmm14
    movapd    %xmm11,%xmm15
    movapd    %xmm10,%xmm11
    movapd    %xmm10,%xmm12

    ## multiply each factor by the stored distance components
    mulpd     nb131_dxO(%rsp),%xmm7
    mulpd     nb131_dyO(%rsp),%xmm8
    mulpd     nb131_dzO(%rsp),%xmm9
    mulpd     nb131_dxH1(%rsp),%xmm10
    mulpd     nb131_dyH1(%rsp),%xmm11
    mulpd     nb131_dzH1(%rsp),%xmm12
    mulpd     nb131_dxH2(%rsp),%xmm13
    mulpd     nb131_dyH2(%rsp),%xmm14
    mulpd     nb131_dzH2(%rsp),%xmm15

    ## add every contribution to the j force (xmm0-2) and to the matching
    ## i-site accumulator slot
    addpd     %xmm7,%xmm0
    addpd     %xmm8,%xmm1
    addpd     %xmm9,%xmm2
    addpd     nb131_fixO(%rsp),%xmm7
    addpd     nb131_fiyO(%rsp),%xmm8
    addpd     nb131_fizO(%rsp),%xmm9

    addpd     %xmm10,%xmm0
    addpd     %xmm11,%xmm1
    addpd     %xmm12,%xmm2
    addpd     nb131_fixH1(%rsp),%xmm10
    addpd     nb131_fiyH1(%rsp),%xmm11
    addpd     nb131_fizH1(%rsp),%xmm12

    addpd     %xmm13,%xmm0
    addpd     %xmm14,%xmm1
    addpd     %xmm15,%xmm2
    addpd     nb131_fixH2(%rsp),%xmm13
    addpd     nb131_fiyH2(%rsp),%xmm14
    addpd     nb131_fizH2(%rsp),%xmm15

    movapd    %xmm7,nb131_fixO(%rsp)
    movapd    %xmm8,nb131_fiyO(%rsp)
    movapd    %xmm9,nb131_fizO(%rsp)
    movapd    %xmm10,nb131_fixH1(%rsp)
    movapd    %xmm11,nb131_fiyH1(%rsp)
    movapd    %xmm12,nb131_fizH1(%rsp)
    movapd    %xmm13,nb131_fixH2(%rsp)
    movapd    %xmm14,nb131_fiyH2(%rsp)
    movapd    %xmm15,nb131_fizH2(%rsp)

    ## store back j forces from xmm0-xmm2
    movlpd    %xmm0,(%rdi,%rax,8)
    movlpd    %xmm1,8(%rdi,%rax,8)
    movlpd    %xmm2,16(%rdi,%rax,8)
    movhpd    %xmm0,(%rdi,%rbx,8)
    movhpd    %xmm1,8(%rdi,%rbx,8)
    movhpd    %xmm2,16(%rdi,%rbx,8)

    ## should we do one more iteration?
    ## innerk -= 2; a negative result (signed) leaves the unrolled loop
    subl      $2,nb131_innerk(%rsp)
    jl        _nb_kernel131_x86_64_sse2.nb131_checksingle
    jmp       _nb_kernel131_x86_64_sse2.nb131_unroll_loop

_nb_kernel131_x86_64_sse2.nb131_checksingle:
    ## bit 0 of innerk set -> one j atom is left over
    movl      nb131_innerk(%rsp),%edx
    andl      $1,%edx
    jnz       _nb_kernel131_x86_64_sse2.nb131_dosingle
    jmp       _nb_kernel131_x86_64_sse2.nb131_updateouterdata

_nb_kernel131_x86_64_sse2.nb131_dosingle:
    ## ---- single leftover j atom: same computation with scalar ops --------
    movq      nb131_innerjjnr(%rsp),%rdx ## pointer to jjnr[k]
    movl      (%rdx),%eax                ## 32-bit load, zero-extends into rax
    ## NOTE(review): the "(unrolled 2)" remark below looks copied from the
    ## unrolled loop; only one index is read here.
    addq      $8,nb131_innerjjnr(%rsp)   ## advance pointer (unrolled 2)

    movq      nb131_charge(%rbp),%rsi    ## base of charge[]
    movsd     (%rsi,%rax,8),%xmm3
    movapd    %xmm3,%xmm4
    mulsd     nb131_iqO(%rsp),%xmm3
    mulsd     nb131_iqH(%rsp),%xmm4
    movapd    %xmm3,nb131_qqO(%rsp)
    movapd    %xmm4,nb131_qqH(%rsp)

    movq      nb131_type(%rbp),%rsi
    movl      (%rsi,%rax,4),%r8d
    movq      nb131_vdwparam(%rbp),%rsi
    shll      %r8d                       ## one-operand form: shift left by 1
    movl      nb131_ntia(%rsp),%edi
    addl      %edi,%r8d                  ## r8 = 2*type[jnr] + ntia
    movsd     (%rsi,%r8,8),%xmm6         ## c6a
    movsd     8(%rsi,%r8,8),%xmm7        ## c12a
    movapd    %xmm6,nb131_c6(%rsp)
    movapd    %xmm7,nb131_c12(%rsp)

    movq      nb131_pos(%rbp),%rsi       ## base of pos[]
    lea       (%rax,%rax,2),%rax         ## replace jnr with j3

    ## move j coordinates to local temp variables
    movsd     (%rsi,%rax,8),%xmm0
    movsd     8(%rsi,%rax,8),%xmm1
    movsd     16(%rsi,%rax,8),%xmm2
    ## xmm0 = jx
    ## xmm1 = jy
    ## xmm2 = jz

    ## one copy of (jx,jy,jz) per i site: O in xmm0-2, H1 in xmm3-5, H2 in xmm6-8
    movapd    %xmm0,%xmm3
    movapd    %xmm1,%xmm4
    movapd    %xmm2,%xmm5
    movapd    %xmm0,%xmm6
    movapd    %xmm1,%xmm7
    movapd    %xmm2,%xmm8

    ## distance vectors, computed as j - i
    subsd     nb131_ixO(%rsp),%xmm0
    subsd     nb131_iyO(%rsp),%xmm1
    subsd     nb131_izO(%rsp),%xmm2
    subsd     nb131_ixH1(%rsp),%xmm3
    subsd     nb131_iyH1(%rsp),%xmm4
    subsd     nb131_izH1(%rsp),%xmm5
    subsd     nb131_ixH2(%rsp),%xmm6
    subsd     nb131_iyH2(%rsp),%xmm7
    subsd     nb131_izH2(%rsp),%xmm8

    ## save each component, then square it in place
    movapd    %xmm0,nb131_dxO(%rsp)
    movapd    %xmm1,nb131_dyO(%rsp)
    movapd    %xmm2,nb131_dzO(%rsp)
    mulsd     %xmm0,%xmm0
    mulsd     %xmm1,%xmm1
    mulsd     %xmm2,%xmm2
    movapd    %xmm3,nb131_dxH1(%rsp)
    movapd    %xmm4,nb131_dyH1(%rsp)
    movapd    %xmm5,nb131_dzH1(%rsp)
    mulsd     %xmm3,%xmm3
    mulsd     %xmm4,%xmm4
    mulsd     %xmm5,%xmm5
    movapd    %xmm6,nb131_dxH2(%rsp)
    movapd    %xmm7,nb131_dyH2(%rsp)
    movapd    %xmm8,nb131_dzH2(%rsp)
    mulsd     %xmm6,%xmm6
    mulsd     %xmm7,%xmm7
    mulsd     %xmm8,%xmm8

    ## rsq = dx^2 + dy^2 + dz^2 -> xmm0 (O), xmm3 (H1), xmm6 (H2)
    addsd     %xmm1,%xmm0
    addsd     %xmm2,%xmm0
    addsd     %xmm4,%xmm3
    addsd     %xmm5,%xmm3
    addsd     %xmm7,%xmm6
    addsd     %xmm8,%xmm6

    ## start doing invsqrt for j atoms
    ## single-precision rsqrtss supplies the seed lu, widened back to double
    cvtsd2ss  %xmm0,%xmm1
    cvtsd2ss  %xmm3,%xmm4
    cvtsd2ss  %xmm6,%xmm7
    rsqrtss   %xmm1,%xmm1
    rsqrtss   %xmm4,%xmm4
    rsqrtss   %xmm7,%xmm7
    cvtss2sd  %xmm1,%xmm1
    cvtss2sd  %xmm4,%xmm4
    cvtss2sd  %xmm7,%xmm7

    ## first refinement step: lu' = half * lu * (three - rsq*lu*lu)
    movapd    %xmm1,%xmm2
    movapd    %xmm4,%xmm5
    movapd    %xmm7,%xmm8
    mulsd     %xmm1,%xmm1                ## lu*lu
    mulsd     %xmm4,%xmm4                ## lu*lu
    mulsd     %xmm7,%xmm7                ## lu*lu
    movapd    nb131_three(%rsp),%xmm9
    movapd    %xmm9,%xmm10
    movapd    %xmm9,%xmm11
    mulsd     %xmm0,%xmm1                ## rsq*lu*lu
    mulsd     %xmm3,%xmm4                ## rsq*lu*lu
    mulsd     %xmm6,%xmm7                ## rsq*lu*lu
    subsd     %xmm1,%xmm9
    subsd     %xmm4,%xmm10
    subsd     %xmm7,%xmm11               ## 3-rsq*lu*lu
    mulsd     %xmm2,%xmm9
    mulsd     %xmm5,%xmm10
    mulsd     %xmm8,%xmm11               ## lu*(3-rsq*lu*lu)
    movapd    nb131_half(%rsp),%xmm15
    mulsd     %xmm15,%xmm9               ## first iteration for rinvO
    mulsd     %xmm15,%xmm10              ## first iteration for rinvH1
    mulsd     %xmm15,%xmm11              ## first iteration for rinvH2

    ## second iteration step
    movapd    %xmm9,%xmm2
    movapd    %xmm10,%xmm5
    movapd    %xmm11,%xmm8
    mulsd     %xmm2,%xmm2                ## lu*lu
    mulsd     %xmm5,%xmm5                ## lu*lu
    mulsd     %xmm8,%xmm8                ## lu*lu
    movapd    nb131_three(%rsp),%xmm1
    movapd    %xmm1,%xmm4
    movapd    %xmm1,%xmm7
    mulsd     %xmm0,%xmm2                ## rsq*lu*lu
    mulsd     %xmm3,%xmm5                ## rsq*lu*lu
    mulsd     %xmm6,%xmm8                ## rsq*lu*lu
    subsd     %xmm2,%xmm1
    subsd     %xmm5,%xmm4
    subsd     %xmm8,%xmm7                ## 3-rsq*lu*lu
    mulsd     %xmm1,%xmm9
    mulsd     %xmm4,%xmm10
    mulsd     %xmm7,%xmm11               ## lu*(3-rsq*lu*lu)
    movapd    nb131_half(%rsp),%xmm15
    mulsd     %xmm15,%xmm9               ## rinvO
    mulsd     %xmm15,%xmm10              ## rinvH1
    mulsd     %xmm15,%xmm11              ## rinvH2

    ## O interactions
    ## rsq in xmm0,xmm3,xmm6
    ## rinv in xmm9, xmm10, xmm11
    movapd    %xmm0,nb131_rsqO(%rsp)
    movapd    %xmm3,nb131_rsqH1(%rsp)
    movapd    %xmm6,nb131_rsqH2(%rsp)
    movapd    %xmm9,nb131_rinvO(%rsp)
    movapd    %xmm10,nb131_rinvH1(%rsp)
    movapd    %xmm11,nb131_rinvH2(%rsp)

    ## table LJ interaction
    mulsd     %xmm9,%xmm0                ## rsqO*rinvO
    mulsd     nb131_tsc(%rsp),%xmm0      ## rtab

    ## truncate and convert to integers
    cvttsd2si %xmm0,%r8d
    ## convert back to float
    cvtsi2sd  %r8d,%xmm2
    ## mult. by 8
    shll      $3,%r8d

    ## calculate eps
    subsd     %xmm2,%xmm0
    movq      nb131_VFtab(%rbp),%rsi
    movsd     (%rsi,%r8,8),%xmm4
    movsd     8(%rsi,%r8,8),%xmm5
    movsd     16(%rsi,%r8,8),%xmm6
    movsd     24(%rsi,%r8,8),%xmm7
    movsd     32(%rsi,%r8,8),%xmm8
    movsd     40(%rsi,%r8,8),%xmm9
    movsd     48(%rsi,%r8,8),%xmm10
    movsd     56(%rsi,%r8,8),%xmm11
    ## dispersion table in xmm4-xmm7, repulsion table in xmm8-xmm11

    mulsd     %xmm0,%xmm7                ## Heps
    mulsd     %xmm0,%xmm11
    mulsd     %xmm0,%xmm6                ## Geps
    mulsd     %xmm0,%xmm10
    mulsd     %xmm0,%xmm7                ## Heps2
    mulsd     %xmm0,%xmm11
    addsd     %xmm6,%xmm5                ## F+Geps
    addsd     %xmm10,%xmm9
    addsd     %xmm7,%xmm5                ## F+Geps+Heps2 = Fp
    addsd     %xmm11,%xmm9
    addsd     %xmm7,%xmm7                ## 2*Heps2
    addsd     %xmm11,%xmm11
    addsd     %xmm6,%xmm7                ## 2*Heps2+Geps
    addsd     %xmm10,%xmm11
    addsd     %xmm5,%xmm7                ## FF = Fp + 2*Heps2 + Geps
    addsd     %xmm9,%xmm11
    mulsd     %xmm0,%xmm5                ## eps*Fp
    mulsd     %xmm0,%xmm9
    movapd    nb131_c6(%rsp),%xmm12
    movapd    nb131_c12(%rsp),%xmm13
    addsd     %xmm4,%xmm5                ## VV
    addsd     %xmm8,%xmm9
    mulsd     %xmm12,%xmm5               ## VV*c6 = vnb6
    mulsd     %xmm13,%xmm9               ## VV*c12 = vnb12
    addsd     %xmm9,%xmm5
    addsd     nb131_Vvdwtot(%rsp),%xmm5
    movsd     %xmm5,nb131_Vvdwtot(%rsp)  ## low lane only

    mulsd     %xmm12,%xmm7               ## FF*c6 = fnb6
    mulsd     %xmm13,%xmm11              ## FF*c12  = fnb12
    addsd     %xmm11,%xmm7
    mulsd     nb131_tsc(%rsp),%xmm7      ## xmm7 = (fnb6+fnb12)*tsc

    ## charge terms for the three i sites (O, H1, H2)
    movapd    nb131_rinvO(%rsp),%xmm9
    movapd    nb131_rinvH1(%rsp),%xmm10
    movapd    nb131_rinvH2(%rsp),%xmm11
    movapd    %xmm9,%xmm0
    movapd    %xmm10,%xmm1
    movapd    %xmm11,%xmm2
    mulsd     %xmm10,%xmm10              ## rinvH1^2
    mulsd     %xmm11,%xmm11              ## rinvH2^2
    mulsd     nb131_qqO(%rsp),%xmm0      ## xmm0 = rinvO*qqO
    mulsd     nb131_qqH(%rsp),%xmm1      ## xmm1 = rinvH1*qqH
    mulsd     nb131_qqH(%rsp),%xmm2      ## xmm2 = rinvH2*qqH
    mulsd     %xmm0,%xmm9                ## rinvO * (rinvO*qqO)
    mulsd     %xmm1,%xmm10               ## rinvH1^2 * (rinvH1*qqH)
    mulsd     %xmm2,%xmm11               ## rinvH2^2 * (rinvH2*qqH)
    subsd     %xmm7,%xmm9                ## O only: subtract the table force term
    mulsd     nb131_rinvO(%rsp),%xmm9    ## xmm9 = O scalar force factor
    addsd     nb131_vctot(%rsp),%xmm0
    addsd     %xmm2,%xmm1
    addsd     %xmm1,%xmm0
    movsd     %xmm0,nb131_vctot(%rsp)    ## vctot += xmm0 + xmm1 + xmm2

    ## move j forces to xmm0-xmm2
    movq      nb131_faction(%rbp),%rdi
    movsd     (%rdi,%rax,8),%xmm0
    movsd     8(%rdi,%rax,8),%xmm1
    movsd     16(%rdi,%rax,8),%xmm2

    ## replicate the three factors: O -> xmm7-9, H1 -> xmm10-12, H2 -> xmm13-15
    movapd    %xmm9,%xmm7
    movapd    %xmm9,%xmm8
    movapd    %xmm11,%xmm13
    movapd    %xmm11,%xmm14
    movapd    %xmm11,%xmm15
    movapd    %xmm10,%xmm11
    movapd    %xmm10,%xmm12

    mulsd     nb131_dxO(%rsp),%xmm7
    mulsd     nb131_dyO(%rsp),%xmm8
    mulsd     nb131_dzO(%rsp),%xmm9
    mulsd     nb131_dxH1(%rsp),%xmm10
    mulsd     nb131_dyH1(%rsp),%xmm11
    mulsd     nb131_dzH1(%rsp),%xmm12
    mulsd     nb131_dxH2(%rsp),%xmm13
    mulsd     nb131_dyH2(%rsp),%xmm14
    mulsd     nb131_dzH2(%rsp),%xmm15

    ## add every contribution to the j force (xmm0-2) and to the matching
    ## i-site accumulator slot
    addsd     %xmm7,%xmm0
    addsd     %xmm8,%xmm1
    addsd     %xmm9,%xmm2
    addsd     nb131_fixO(%rsp),%xmm7
    addsd     nb131_fiyO(%rsp),%xmm8
    addsd     nb131_fizO(%rsp),%xmm9

    addsd     %xmm10,%xmm0
    addsd     %xmm11,%xmm1
    addsd     %xmm12,%xmm2
    addsd     nb131_fixH1(%rsp),%xmm10
    addsd     nb131_fiyH1(%rsp),%xmm11
    addsd     nb131_fizH1(%rsp),%xmm12

    addsd     %xmm13,%xmm0
    addsd     %xmm14,%xmm1
    addsd     %xmm15,%xmm2
    addsd     nb131_fixH2(%rsp),%xmm13
    addsd     nb131_fiyH2(%rsp),%xmm14
    addsd     nb131_fizH2(%rsp),%xmm15

    ## movsd stores touch only the low lane of each accumulator slot
    movsd     %xmm7,nb131_fixO(%rsp)
    movsd     %xmm8,nb131_fiyO(%rsp)
    movsd     %xmm9,nb131_fizO(%rsp)
    movsd     %xmm10,nb131_fixH1(%rsp)
    movsd     %xmm11,nb131_fiyH1(%rsp)
    movsd     %xmm12,nb131_fizH1(%rsp)
    movsd     %xmm13,nb131_fixH2(%rsp)
    movsd     %xmm14,nb131_fiyH2(%rsp)
    movsd     %xmm15,nb131_fizH2(%rsp)

    ## store back j forces from xmm0-xmm2
    movsd     %xmm0,(%rdi,%rax,8)
    movsd     %xmm1,8(%rdi,%rax,8)
    movsd     %xmm2,16(%rdi,%rax,8)

_nb_kernel131_x86_64_sse2.nb131_updateouterdata:
    ## ---- fold the i-site accumulators into faction[] (continues below) ---
    movl      nb131_ii3(%rsp),%ecx
    movq      nb131_faction(%rbp),%rdi
    movq      nb131_fshift(%rbp),%rsi
    movl      nb131_is3(%rsp),%edx

    ## accumulate  Oi forces in xmm0, xmm1, xmm2
    movapd    nb131_fixO(%rsp),%xmm0
    movapd    nb131_fiyO(%rsp),%xmm1
    movapd    nb131_fizO(%rsp),%xmm2

    ## horizontal add: high lane moved down, then added to the low lane
    movhlps   %xmm0,%xmm3
    movhlps   %xmm1,%xmm4
    movhlps   %xmm2,%xmm5
    addsd     %xmm3,%xmm0
    addsd     %xmm4,%xmm1
    addsd     %xmm5,%xmm2                ## sum is in low xmm0-xmm2

    movapd    %xmm0,%xmm3
    movapd    %xmm1,%xmm4
    movapd    %xmm2,%xmm5

    ## increment i force
    ## (the summed value is subtracted from the stored force: subsd, not addsd)
    movsd     (%rdi,%rcx,8),%xmm3
    movsd     8(%rdi,%rcx,8),%xmm4
    movsd     16(%rdi,%rcx,8),%xmm5
    subsd     %xmm0,%xmm3
    subsd     %xmm1,%xmm4
    subsd     %xmm2,%xmm5
    movsd     %xmm3,(%rdi,%rcx,8)
    movsd     %xmm4,8(%rdi,%rcx,8)
    movsd     %xmm5,16(%rdi,%rcx,8)

    ## accumulate force in xmm6/xmm7 for fshift
    movapd    %xmm0,%xmm6

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?