nb_kernel313_x86_64_sse.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 2,144 行 · 第 1/5 页
S
2,144 行
## ---------------------------------------------------------------------
## Excerpt from the interior of the routine whose local labels carry the
## prefix _nb_kernel313_x86_64_sse (GNU as, AT&T operand order).
##
## Not visible here: the routine entry, the definitions of the nb313_*
## offsets, the start of the unrolled loop, and everything after the
## last instruction of this excerpt.
##
## Sections in this excerpt:
##   1. tail of the unrolled inner loop (final stores + loop test)
##   2. nb313_odd_inner / nb313_odd_loop: one j entry per iteration
##   3. nb313_updateouterdata (beginning only): reduce the per-site
##      force accumulators and write them back to memory
##
## Lane order used by the odd loop is  [ O  H1  H2  M ]  (lane 0 = O).
##
## NOTE(review): the odd loop uses MMX registers (mm6/mm7); no emms is
## visible in this excerpt - confirm one is executed before returning.
## ---------------------------------------------------------------------

        ## --- 1. tail of the unrolled loop --------------------------------
        ## Store xmm11..xmm14 to (%rdi) at element indices rax/rbx/rcx/rdx.
        ## The register contents are prepared above this excerpt
        ## (presumably the updated j forces - TODO confirm).
        movhps %xmm11,(%rdi,%rbx,4)
        movlps %xmm12,(%rdi,%rcx,4)
        movhps %xmm12,(%rdi,%rdx,4)
        movss %xmm13,8(%rdi,%rax,4)
        movss %xmm14,8(%rdi,%rcx,4)
        shufps $1,%xmm13,%xmm13         ## bring lane 1 down to lane 0
        shufps $1,%xmm14,%xmm14
        movss %xmm13,8(%rdi,%rbx,4)
        movss %xmm14,8(%rdi,%rdx,4)

        ## should we do one more iteration?
        subl $4,nb313_innerk(%rsp)      ## innerk -= 4
        jl _nb_kernel313_x86_64_sse.nb313_odd_inner     ## negative: leave the unrolled loop
        jmp _nb_kernel313_x86_64_sse.nb313_unroll_loop
_nb_kernel313_x86_64_sse.nb313_odd_inner:
        ## --- 2a. undo the last subtraction; innerk = entries still to do ---
        addl $4,nb313_innerk(%rsp)
        jnz _nb_kernel313_x86_64_sse.nb313_odd_loop     ## leftovers exist
        jmp _nb_kernel313_x86_64_sse.nb313_updateouterdata
_nb_kernel313_x86_64_sse.nb313_odd_loop:
        ## --- 2b. one j entry against the four i sites ----------------------
        movq nb313_innerjjnr(%rsp),%rdx ## pointer to jjnr[k]
        movl (%rdx),%eax                ## eax = j index (upper half of rax zeroed)
        addq $4,nb313_innerjjnr(%rsp)   ## advance the pointer by one 32-bit entry

        ## Build the per-lane charge products qq = q_i * q_j.
        xorps %xmm4,%xmm4               ## clear reg.
        movss nb313_iqM(%rsp),%xmm4
        movq nb313_charge(%rbp),%rsi
        movhps nb313_iqH(%rsp),%xmm4    ## [qM 0 qH qH]
        shufps $41,%xmm4,%xmm4          ## [0 qH qH qM]  (lane 0 gets zero charge)

        movss (%rsi,%rax,4),%xmm3       ## charge in xmm3
        shufps $0,%xmm3,%xmm3           ## broadcast to all four lanes
        mulps %xmm4,%xmm3
        movaps %xmm3,nb313_qqM(%rsp)    ## use dummy qq for storage

        ## Fetch the two vdw parameters for this j entry.
        xorps %xmm6,%xmm6
        movq nb313_type(%rbp),%rsi
        movl (%rsi,%rax,4),%ebx         ## ebx = type[j]
        movq nb313_vdwparam(%rbp),%rsi
        shll %ebx                       ## ebx *= 2 (one-operand form shifts by 1)
        addl nb313_ntia(%rsp),%ebx
        movlps (%rsi,%rbx,4),%xmm6      ## two consecutive floats -> lanes 0,1
        movaps %xmm6,%xmm7
        shufps $252,%xmm6,%xmm6         ## 11111100: keep lane 0, lanes 1-3 <- lane 3 (zero)
        shufps $253,%xmm7,%xmm7         ## 11111101: lane 0 <- lane 1, lanes 1-3 <- lane 3 (zero)
        movaps %xmm6,nb313_c6(%rsp)
        movaps %xmm7,nb313_c12(%rsp)

        movq nb313_pos(%rbp),%rsi
        lea (%rax,%rax,2),%rax          ## rax = 3*j, index of the x/y/z triple

        ## Gather the i coordinates into [O H1 H2 M] lane order.
        movss nb313_ixO(%rsp),%xmm0
        movss nb313_iyO(%rsp),%xmm1
        movss nb313_izO(%rsp),%xmm2
        movss nb313_ixH1(%rsp),%xmm3
        movss nb313_iyH1(%rsp),%xmm4
        movss nb313_izH1(%rsp),%xmm5
        unpcklps nb313_ixH2(%rsp),%xmm0 ## ixO ixH2 - -
        unpcklps nb313_iyH2(%rsp),%xmm1 ## iyO iyH2 - -
        unpcklps nb313_izH2(%rsp),%xmm2 ## izO izH2 - -
        unpcklps nb313_ixM(%rsp),%xmm3  ## ixH1 ixM - -
        unpcklps nb313_iyM(%rsp),%xmm4  ## iyH1 iyM - -
        unpcklps nb313_izM(%rsp),%xmm5  ## izH1 izM - -
        unpcklps %xmm3,%xmm0            ## ixO ixH1 ixH2 ixM
        unpcklps %xmm4,%xmm1            ## same for y
        unpcklps %xmm5,%xmm2            ## same for z

        ## load the j coords into xmm3-xmm5 and broadcast them
        ## (the i coords stay in xmm0-xmm2)
        movss (%rsi,%rax,4),%xmm3
        movss 4(%rsi,%rax,4),%xmm4
        movss 8(%rsi,%rax,4),%xmm5
        shufps $0,%xmm3,%xmm3
        shufps $0,%xmm4,%xmm4
        shufps $0,%xmm5,%xmm5

        ## d = j - i, per lane
        subps %xmm0,%xmm3
        subps %xmm1,%xmm4
        subps %xmm2,%xmm5

        ## use M distances for storage
        movaps %xmm3,nb313_dxM(%rsp)
        movaps %xmm4,nb313_dyM(%rsp)
        movaps %xmm5,nb313_dzM(%rsp)

        mulps %xmm3,%xmm3
        mulps %xmm4,%xmm4
        mulps %xmm5,%xmm5
        addps %xmm3,%xmm4
        addps %xmm5,%xmm4
        ## rsq in xmm4

        ## rinv = half * lu * (three - rsq*lu*lu): one refinement step
        ## applied to the rsqrtps estimate lu.
        rsqrtps %xmm4,%xmm5
        ## lookup seed in xmm5
        movaps %xmm5,%xmm2
        mulps %xmm5,%xmm5
        movaps nb313_three(%rsp),%xmm1
        mulps %xmm4,%xmm5               ## rsq*lu*lu
        movaps nb313_half(%rsp),%xmm0
        subps %xmm5,%xmm1               ## three - rsq*lu*lu
        mulps %xmm2,%xmm1
        mulps %xmm1,%xmm0               ## xmm0=rinv
        movaps %xmm0,nb313_rinvM(%rsp)

        ## Table position: r*tsc split into integer index and fraction eps.
        mulps %xmm0,%xmm4               ## r
        mulps nb313_tsc(%rsp),%xmm4

        movhlps %xmm4,%xmm7
        cvttps2pi %xmm4,%mm6            ## truncate lanes 0,1 to integers
        cvttps2pi %xmm7,%mm7            ## mm6/mm7 contain lu indices
        cvtpi2ps %mm6,%xmm3
        cvtpi2ps %mm7,%xmm7
        movlhps %xmm7,%xmm3             ## xmm3 = indices converted back to float

        subps %xmm3,%xmm4
        movaps %xmm4,%xmm1              ## xmm1=eps
        movaps %xmm1,%xmm2
        mulps %xmm2,%xmm2               ## xmm2=eps2
        pslld $2,%mm6                   ## index *= 4: four floats per table point
        pslld $2,%mm7

        movq nb313_VFtab(%rbp),%rsi
        psrlq $32,%mm6                  ## drop lane 0's index; only lanes 1-3 are looked up
        movd %mm6,%ebx                  ## ebx = table offset for lane 1
        movd %mm7,%ecx                  ## ecx = table offset for lane 2
        psrlq $32,%mm7
        movd %mm7,%edx                  ## edx = table offset for lane 3

        ## Load Y,F for lanes 1-3 and transpose; lane 0 is left at zero.
        xorps %xmm5,%xmm5
        movlps (%rsi,%rcx,4),%xmm3      ## data: Y3 F3 - -
        movhps (%rsi,%rbx,4),%xmm5      ## data: 0 0 Y2 F2
        movhps (%rsi,%rdx,4),%xmm3      ## data: Y3 F3 Y4 F4

        movaps %xmm5,%xmm4              ## data: 0 0 Y2 F2
        shufps $0x88,%xmm3,%xmm4        ## data: 0 Y2 Y3 Y4
        shufps $0xDD,%xmm3,%xmm5        ## data: 0 F2 F3 F4

        ## Same for G,H (next two floats of each table point).
        xorps %xmm7,%xmm7
        movlps 8(%rsi,%rcx,4),%xmm3     ## data: G3 H3 - -
        movhps 8(%rsi,%rbx,4),%xmm7     ## data: 0 0 G2 H2
        movhps 8(%rsi,%rdx,4),%xmm3     ## data: G3 H3 G4 H4

        movaps %xmm7,%xmm6              ## data: 0 0 G2 H2
        shufps $0x88,%xmm3,%xmm6        ## data: 0 G2 G3 G4
        shufps $0xDD,%xmm3,%xmm7        ## data: 0 H2 H3 H4

        ## xmm4 = 0 Y2 Y3 Y4
        ## xmm5 = 0 F2 F3 F4
        ## xmm6 = 0 G2 G3 G4
        ## xmm7 = 0 H2 H3 H4

        ## coulomb table ready, in xmm4-xmm7
        ## VV = Y + eps*(F + G*eps + H*eps2),  FF = Fp + G*eps + two*H*eps2
        mulps %xmm1,%xmm6               ## xmm6=Geps
        mulps %xmm2,%xmm7               ## xmm7=Heps2
        addps %xmm6,%xmm5
        addps %xmm7,%xmm5               ## xmm5=Fp
        mulps nb313_two(%rsp),%xmm7     ## two*Heps2
        movaps nb313_qqM(%rsp),%xmm0
        addps %xmm6,%xmm7
        addps %xmm5,%xmm7               ## xmm7=FF
        mulps %xmm1,%xmm5               ## xmm5=eps*Fp
        addps %xmm4,%xmm5               ## xmm5=VV
        mulps %xmm0,%xmm5               ## vcoul=qq*VV
        mulps %xmm7,%xmm0               ## fijC=FF*qq
        ## at this point xmm5 contains vcoul and xmm0 fijC
        ## increment vcoul - then we can get rid of xmm5
        addps nb313_vctot(%rsp),%xmm5
        movaps %xmm5,nb313_vctot(%rsp)

        ## do nontable L-J in first element only.
        movaps nb313_rinvM(%rsp),%xmm2
        mulss %xmm2,%xmm2               ## lane 0: rinv^2
        movaps %xmm2,%xmm1
        mulss %xmm1,%xmm1               ## lane 0: rinv^4
        mulss %xmm2,%xmm1               ## xmm1=rinvsix
        xorps %xmm4,%xmm4               ## lanes 1-3 of xmm4 stay zero below
        movss %xmm1,%xmm4
        mulss %xmm4,%xmm4               ## xmm4=rinvtwelve
        mulss nb313_c6(%rsp),%xmm1
        mulss nb313_c12(%rsp),%xmm4
        movaps %xmm4,%xmm3
        subss %xmm1,%xmm3               ## xmm3=Vvdw12-Vvdw6
        mulss nb313_six(%rsp),%xmm1
        mulss nb313_twelve(%rsp),%xmm4
        subss %xmm1,%xmm4               ## lane 0: twelve*Vvdw12 - six*Vvdw6
        addss nb313_Vvdwtot(%rsp),%xmm3
        mulss nb313_rinvM(%rsp),%xmm4
        ## add back coul stuff from memory, and work on all elements again
        mulps nb313_tsc(%rsp),%xmm0     ## fijC*tsc
        subps %xmm0,%xmm4
        movss %xmm3,nb313_Vvdwtot(%rsp)
        mulps nb313_rinvM(%rsp),%xmm4   ## xmm4 = scalar force factor per lane

        movaps nb313_dxM(%rsp),%xmm0
        movaps nb313_dyM(%rsp),%xmm1
        movaps nb313_dzM(%rsp),%xmm2

        mulps %xmm4,%xmm0
        mulps %xmm4,%xmm1
        mulps %xmm4,%xmm2
        ## xmm0-xmm2 now contains tx-tz (partial force)

        ## Add lane 0 (O) into the first element of the O accumulators.
        movss nb313_fixO(%rsp),%xmm3
        movss nb313_fiyO(%rsp),%xmm4
        movss nb313_fizO(%rsp),%xmm5
        addss %xmm0,%xmm3
        addss %xmm1,%xmm4
        addss %xmm2,%xmm5
        movss %xmm3,nb313_fixO(%rsp)
        movss %xmm4,nb313_fiyO(%rsp)
        movss %xmm5,nb313_fizO(%rsp)    ## updated the O force now do the H's

        ## Rotate the lanes down by one so the next site sits in lane 0.
        movaps %xmm0,%xmm3
        movaps %xmm1,%xmm4
        movaps %xmm2,%xmm5
        shufps $0x39,%xmm3,%xmm3        ## shift right
        shufps $0x39,%xmm4,%xmm4
        shufps $0x39,%xmm5,%xmm5
        addss nb313_fixH1(%rsp),%xmm3
        addss nb313_fiyH1(%rsp),%xmm4
        addss nb313_fizH1(%rsp),%xmm5
        movss %xmm3,nb313_fixH1(%rsp)
        movss %xmm4,nb313_fiyH1(%rsp)
        movss %xmm5,nb313_fizH1(%rsp)   ## updated the H1 force

        shufps $0x39,%xmm3,%xmm3
        shufps $0x39,%xmm4,%xmm4
        shufps $0x39,%xmm5,%xmm5
        addss nb313_fixH2(%rsp),%xmm3
        addss nb313_fiyH2(%rsp),%xmm4
        addss nb313_fizH2(%rsp),%xmm5
        movss %xmm3,nb313_fixH2(%rsp)
        movss %xmm4,nb313_fiyH2(%rsp)
        movss %xmm5,nb313_fizH2(%rsp)   ## updated the H2 force

        movq nb313_faction(%rbp),%rdi
        shufps $0x39,%xmm3,%xmm3
        shufps $0x39,%xmm4,%xmm4
        shufps $0x39,%xmm5,%xmm5
        addss nb313_fixM(%rsp),%xmm3
        addss nb313_fiyM(%rsp),%xmm4
        addss nb313_fizM(%rsp),%xmm5
        movss %xmm3,nb313_fixM(%rsp)
        movss %xmm4,nb313_fiyM(%rsp)
        movss %xmm5,nb313_fizM(%rsp)    ## updated the M force

        ## the fj's - load the j force from memory, then sum tx/ty/tz
        ## over all four lanes (totals end up in lane 0 of xmm0-xmm2)
        movlps (%rdi,%rax,4),%xmm6      ## j force x,y
        movss 8(%rdi,%rax,4),%xmm7      ## j force z

        movhlps %xmm0,%xmm3
        movhlps %xmm1,%xmm4
        movhlps %xmm2,%xmm5
        addps %xmm0,%xmm3               ## lanes 0,1 hold the pairwise sums 0+2 and 1+3
        addps %xmm1,%xmm4
        addps %xmm2,%xmm5
        movaps %xmm3,%xmm0
        movaps %xmm4,%xmm1
        movaps %xmm5,%xmm2

        shufps $0x39,%xmm3,%xmm3        ## shift right
        shufps $0x39,%xmm4,%xmm4
        shufps $0x39,%xmm5,%xmm5
        addss %xmm3,%xmm0
        addss %xmm4,%xmm1
        addss %xmm5,%xmm2
        unpcklps %xmm1,%xmm0            ## x,y sum in xmm0, z sum in xmm2

        addps %xmm0,%xmm6               ## j force += summed partial force
        addss %xmm2,%xmm7

        movlps %xmm6,(%rdi,%rax,4)      ## only the low two lanes (x,y) are stored
        movss %xmm7,8(%rdi,%rax,4)

        decl nb313_innerk(%rsp)
        jz _nb_kernel313_x86_64_sse.nb313_updateouterdata
        jmp _nb_kernel313_x86_64_sse.nb313_odd_loop
_nb_kernel313_x86_64_sse.nb313_updateouterdata:
        ## --- 3. write back the i forces -----------------------------------
        ## For each site (O, H1, H2, M): sum the four lanes of its x/y/z
        ## accumulators, subtract the sums from the three floats stored at
        ## faction + 4*ii3 + {0,12,24,36}, and add the sums into xmm6/xmm7.
        movl nb313_ii3(%rsp),%ecx
        movq nb313_faction(%rbp),%rdi
        movq nb313_fshift(%rbp),%rsi    ## rsi/edx are not used within this excerpt
        movl nb313_is3(%rsp),%edx

        ## accumulate Oi forces in xmm0, xmm1, xmm2
        movaps nb313_fixO(%rsp),%xmm0
        movaps nb313_fiyO(%rsp),%xmm1
        movaps nb313_fizO(%rsp),%xmm2

        movhlps %xmm0,%xmm3
        movhlps %xmm1,%xmm4
        movhlps %xmm2,%xmm5
        addps %xmm3,%xmm0
        addps %xmm4,%xmm1
        addps %xmm5,%xmm2               ## sum is in 1/2 in xmm0-xmm2

        movaps %xmm0,%xmm3
        movaps %xmm1,%xmm4
        movaps %xmm2,%xmm5

        shufps $1,%xmm3,%xmm3
        shufps $1,%xmm4,%xmm4
        shufps $1,%xmm5,%xmm5
        addss %xmm3,%xmm0
        addss %xmm4,%xmm1
        addss %xmm5,%xmm2               ## xmm0-xmm2 has single force in pos0

        ## update i force in memory (the accumulated sum is subtracted)
        movss (%rdi,%rcx,4),%xmm3
        movss 4(%rdi,%rcx,4),%xmm4
        movss 8(%rdi,%rcx,4),%xmm5
        subss %xmm0,%xmm3
        subss %xmm1,%xmm4
        subss %xmm2,%xmm5
        movss %xmm3,(%rdi,%rcx,4)
        movss %xmm4,4(%rdi,%rcx,4)
        movss %xmm5,8(%rdi,%rcx,4)

        ## accumulate force in xmm6/xmm7 for fshift
        movaps %xmm0,%xmm6
        movss %xmm2,%xmm7               ## z sum in lane 0 (upper lanes of xmm7 unchanged)
        movlhps %xmm1,%xmm6
        shufps $8,%xmm6,%xmm6           ## 00001000: lanes 0,1 = x sum, y sum

        ## accumulate H1i forces in xmm0, xmm1, xmm2
        movaps nb313_fixH1(%rsp),%xmm0
        movaps nb313_fiyH1(%rsp),%xmm1
        movaps nb313_fizH1(%rsp),%xmm2

        movhlps %xmm0,%xmm3
        movhlps %xmm1,%xmm4
        movhlps %xmm2,%xmm5
        addps %xmm3,%xmm0
        addps %xmm4,%xmm1
        addps %xmm5,%xmm2               ## sum is in 1/2 in xmm0-xmm2

        movaps %xmm0,%xmm3
        movaps %xmm1,%xmm4
        movaps %xmm2,%xmm5

        shufps $1,%xmm3,%xmm3
        shufps $1,%xmm4,%xmm4
        shufps $1,%xmm5,%xmm5
        addss %xmm3,%xmm0
        addss %xmm4,%xmm1
        addss %xmm5,%xmm2               ## xmm0-xmm2 has single force in pos0

        ## update i force in memory (the accumulated sum is subtracted)
        movss 12(%rdi,%rcx,4),%xmm3
        movss 16(%rdi,%rcx,4),%xmm4
        movss 20(%rdi,%rcx,4),%xmm5
        subss %xmm0,%xmm3
        subss %xmm1,%xmm4
        subss %xmm2,%xmm5
        movss %xmm3,12(%rdi,%rcx,4)
        movss %xmm4,16(%rdi,%rcx,4)
        movss %xmm5,20(%rdi,%rcx,4)

        ## accumulate force in xmm6/xmm7 for fshift
        addss %xmm2,%xmm7
        movlhps %xmm1,%xmm0
        shufps $8,%xmm0,%xmm0           ## 00001000
        addps %xmm0,%xmm6

        ## accumulate H2i forces in xmm0, xmm1, xmm2
        movaps nb313_fixH2(%rsp),%xmm0
        movaps nb313_fiyH2(%rsp),%xmm1
        movaps nb313_fizH2(%rsp),%xmm2

        movhlps %xmm0,%xmm3
        movhlps %xmm1,%xmm4
        movhlps %xmm2,%xmm5
        addps %xmm3,%xmm0
        addps %xmm4,%xmm1
        addps %xmm5,%xmm2               ## sum is in 1/2 in xmm0-xmm2

        movaps %xmm0,%xmm3
        movaps %xmm1,%xmm4
        movaps %xmm2,%xmm5

        shufps $1,%xmm3,%xmm3
        shufps $1,%xmm4,%xmm4
        shufps $1,%xmm5,%xmm5
        addss %xmm3,%xmm0
        addss %xmm4,%xmm1
        addss %xmm5,%xmm2               ## xmm0-xmm2 has single force in pos0

        ## update i force in memory (the accumulated sum is subtracted)
        movss 24(%rdi,%rcx,4),%xmm3
        movss 28(%rdi,%rcx,4),%xmm4
        movss 32(%rdi,%rcx,4),%xmm5
        subss %xmm0,%xmm3
        subss %xmm1,%xmm4
        subss %xmm2,%xmm5
        movss %xmm3,24(%rdi,%rcx,4)
        movss %xmm4,28(%rdi,%rcx,4)
        movss %xmm5,32(%rdi,%rcx,4)

        ## accumulate force in xmm6/xmm7 for fshift
        addss %xmm2,%xmm7
        movlhps %xmm1,%xmm0
        shufps $8,%xmm0,%xmm0           ## 00001000
        addps %xmm0,%xmm6

        ## accumulate Mi forces in xmm0, xmm1, xmm2
        movaps nb313_fixM(%rsp),%xmm0
        movaps nb313_fiyM(%rsp),%xmm1
        movaps nb313_fizM(%rsp),%xmm2

        movhlps %xmm0,%xmm3
        movhlps %xmm1,%xmm4
        movhlps %xmm2,%xmm5
        addps %xmm3,%xmm0
        addps %xmm4,%xmm1
        addps %xmm5,%xmm2               ## sum is in 1/2 in xmm0-xmm2

        movaps %xmm0,%xmm3
        movaps %xmm1,%xmm4
        movaps %xmm2,%xmm5

        shufps $1,%xmm3,%xmm3
        shufps $1,%xmm4,%xmm4
        shufps $1,%xmm5,%xmm5
        addss %xmm3,%xmm0
        addss %xmm4,%xmm1
        addss %xmm5,%xmm2               ## xmm0-xmm2 has single force in pos0

        ## load the stored M force; the update continues past this excerpt
        movss 36(%rdi,%rcx,4),%xmm3
        movss 40(%rdi,%rcx,4),%xmm4
        movss 44(%rdi,%rcx,4),%xmm5
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?