nb_kernel211_ia32_sse2.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 1,984 行 · 第 1/5 页
S
1,984 行
## ---------------------------------------------------------------------
## Fragment of nb_kernel211_ia32_sse2 (AT&T syntax, IA-32, SSE2).
## The routine begins before and continues after this excerpt; only the
## statements visible here are reproduced, one per line.
##
## Contents of the excerpt:
##   1. tail of the packed-double (two j entries at a time) inner loop:
##      squared distances for O/H1/H2, inverse square roots, O and H
##      interactions, i- and j-force updates
##   2. nb211_checksingle: tests the low bit of innerk for a leftover j
##   3. nb211_dosingle: start of the scalar (low-lane) path
##
## Register roles as used below (packed loop):
##   xmm0-xmm2  j coordinates on entry, later scratch / force components
##   xmm5/6/7   rsq, then rinv, for H2 / H1 / O
##   eax, ebx   indices used to address the two j entries in faction[]
## NOTE(review): nb211_three / nb211_half / nb211_two / nb211_six /
## nb211_twelve are assumed to hold the constants their names suggest;
## their definitions are outside this excerpt - confirm.
## ---------------------------------------------------------------------
        addpd   %xmm6,%xmm4
        movapd  %xmm4,%xmm7  ## rsqO in xmm7

        ## move ixH1-izH1 to xmm4-xmm6
        movapd  nb211_ixH1(%esp),%xmm4
        movapd  nb211_iyH1(%esp),%xmm5
        movapd  nb211_izH1(%esp),%xmm6

        ## calc dr
        subpd   %xmm0,%xmm4
        subpd   %xmm1,%xmm5
        subpd   %xmm2,%xmm6

        ## store dr
        movapd  %xmm4,nb211_dxH1(%esp)
        movapd  %xmm5,nb211_dyH1(%esp)
        movapd  %xmm6,nb211_dzH1(%esp)
        ## square it
        mulpd   %xmm4,%xmm4
        mulpd   %xmm5,%xmm5
        mulpd   %xmm6,%xmm6
        addpd   %xmm5,%xmm6
        addpd   %xmm4,%xmm6
        ## rsqH1 in xmm6

        ## move ixH2-izH2 to xmm3-xmm5
        movapd  nb211_ixH2(%esp),%xmm3
        movapd  nb211_iyH2(%esp),%xmm4
        movapd  nb211_izH2(%esp),%xmm5

        ## calc dr
        subpd   %xmm0,%xmm3
        subpd   %xmm1,%xmm4
        subpd   %xmm2,%xmm5

        ## store dr
        movapd  %xmm3,nb211_dxH2(%esp)
        movapd  %xmm4,nb211_dyH2(%esp)
        movapd  %xmm5,nb211_dzH2(%esp)
        ## square it
        mulpd   %xmm3,%xmm3
        mulpd   %xmm4,%xmm4
        mulpd   %xmm5,%xmm5
        addpd   %xmm4,%xmm5
        addpd   %xmm3,%xmm5
        ## rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7

        ## krsq = krf * rsq for each of the three distances, saved for later
        movapd  %xmm5,%xmm0
        movapd  %xmm6,%xmm1
        movapd  %xmm7,%xmm2

        mulpd   nb211_krf(%esp),%xmm0
        mulpd   nb211_krf(%esp),%xmm1
        mulpd   nb211_krf(%esp),%xmm2

        movapd  %xmm0,nb211_krsqH2(%esp)
        movapd  %xmm1,nb211_krsqH1(%esp)
        movapd  %xmm2,nb211_krsqO(%esp)

        ## start with rsqO - put seed in xmm2
        ## (single-precision rsqrtps estimate, then two refinement steps
        ##  of the form lu = half*lu*(three - rsq*lu*lu) in double)
        cvtpd2ps %xmm7,%xmm2
        rsqrtps %xmm2,%xmm2
        cvtps2pd %xmm2,%xmm2

        movapd  %xmm2,%xmm3
        mulpd   %xmm2,%xmm2
        movapd  nb211_three(%esp),%xmm4
        mulpd   %xmm7,%xmm2  ## rsq*lu*lu
        subpd   %xmm2,%xmm4  ## 3-rsq*lu*lu
        mulpd   %xmm3,%xmm4  ## lu*(3-rsq*lu*lu)
        mulpd   nb211_half(%esp),%xmm4  ## iter1 ( new lu)

        movapd  %xmm4,%xmm3
        mulpd   %xmm4,%xmm4  ## lu*lu
        mulpd   %xmm4,%xmm7  ## rsq*lu*lu
        movapd  nb211_three(%esp),%xmm4
        subpd   %xmm7,%xmm4  ## 3-rsq*lu*lu
        mulpd   %xmm3,%xmm4  ## lu*( 3-rsq*lu*lu)
        mulpd   nb211_half(%esp),%xmm4  ## rinv
        movapd  %xmm4,%xmm7  ## rinvO in xmm7

        ## rsqH1 - seed in xmm2
        cvtpd2ps %xmm6,%xmm2
        rsqrtps %xmm2,%xmm2
        cvtps2pd %xmm2,%xmm2

        movapd  %xmm2,%xmm3
        mulpd   %xmm2,%xmm2
        movapd  nb211_three(%esp),%xmm4
        mulpd   %xmm6,%xmm2  ## rsq*lu*lu
        subpd   %xmm2,%xmm4  ## 3-rsq*lu*lu
        mulpd   %xmm3,%xmm4  ## lu*(3-rsq*lu*lu)
        mulpd   nb211_half(%esp),%xmm4  ## iter1 ( new lu)

        movapd  %xmm4,%xmm3
        mulpd   %xmm4,%xmm4  ## lu*lu
        mulpd   %xmm4,%xmm6  ## rsq*lu*lu
        movapd  nb211_three(%esp),%xmm4
        subpd   %xmm6,%xmm4  ## 3-rsq*lu*lu
        mulpd   %xmm3,%xmm4  ## lu*( 3-rsq*lu*lu)
        mulpd   nb211_half(%esp),%xmm4  ## rinv
        movapd  %xmm4,%xmm6  ## rinvH1 in xmm6

        ## rsqH2 - seed in xmm2
        cvtpd2ps %xmm5,%xmm2
        rsqrtps %xmm2,%xmm2
        cvtps2pd %xmm2,%xmm2

        movapd  %xmm2,%xmm3
        mulpd   %xmm2,%xmm2
        movapd  nb211_three(%esp),%xmm4
        mulpd   %xmm5,%xmm2  ## rsq*lu*lu
        subpd   %xmm2,%xmm4  ## 3-rsq*lu*lu
        mulpd   %xmm3,%xmm4  ## lu*(3-rsq*lu*lu)
        mulpd   nb211_half(%esp),%xmm4  ## iter1 ( new lu)

        movapd  %xmm4,%xmm3
        mulpd   %xmm4,%xmm4  ## lu*lu
        mulpd   %xmm4,%xmm5  ## rsq*lu*lu
        movapd  nb211_three(%esp),%xmm4
        subpd   %xmm5,%xmm4  ## 3-rsq*lu*lu
        mulpd   %xmm3,%xmm4  ## lu*( 3-rsq*lu*lu)
        mulpd   nb211_half(%esp),%xmm4  ## rinv
        movapd  %xmm4,%xmm5  ## rinvH2 in xmm5

        ## do O interactions
        movapd  %xmm7,%xmm4
        mulpd   %xmm4,%xmm4  ## xmm7=rinv, xmm4=rinvsq
        movapd  %xmm4,%xmm1
        mulpd   %xmm4,%xmm1
        mulpd   %xmm4,%xmm1  ## xmm1=rinvsix
        movapd  %xmm1,%xmm2
        mulpd   %xmm2,%xmm2  ## xmm2=rinvtwelve
        mulpd   nb211_c6(%esp),%xmm1
        mulpd   nb211_c12(%esp),%xmm2
        movapd  %xmm2,%xmm3
        subpd   %xmm1,%xmm3  ## Vvdw=Vvdw12-Vvdw6
        addpd   nb211_Vvdwtot(%esp),%xmm3
        mulpd   nb211_six(%esp),%xmm1
        mulpd   nb211_twelve(%esp),%xmm2
        subpd   %xmm1,%xmm2  ## nb part of fs

        movapd  %xmm7,%xmm0
        movapd  nb211_krsqO(%esp),%xmm1
        addpd   %xmm1,%xmm0
        mulpd   nb211_two(%esp),%xmm1
        subpd   nb211_crf(%esp),%xmm0  ## xmm0=rinv+ krsq-crf
        subpd   %xmm1,%xmm7  ## xmm7=rinv-2*krsq
        mulpd   nb211_qqO(%esp),%xmm0
        mulpd   nb211_qqO(%esp),%xmm7
        addpd   %xmm7,%xmm2

        mulpd   %xmm2,%xmm4  ## total fsO in xmm4

        addpd   nb211_vctot(%esp),%xmm0
        movapd  %xmm3,nb211_Vvdwtot(%esp)
        movapd  %xmm0,nb211_vctot(%esp)

        ## scale the stored O displacement by fsO -> force components
        movapd  nb211_dxO(%esp),%xmm0
        movapd  nb211_dyO(%esp),%xmm1
        movapd  nb211_dzO(%esp),%xmm2
        mulpd   %xmm4,%xmm0
        mulpd   %xmm4,%xmm1
        mulpd   %xmm4,%xmm2

        ## update O forces
        movapd  nb211_fixO(%esp),%xmm3
        movapd  nb211_fiyO(%esp),%xmm4
        movapd  nb211_fizO(%esp),%xmm7
        addpd   %xmm0,%xmm3
        addpd   %xmm1,%xmm4
        addpd   %xmm2,%xmm7
        movapd  %xmm3,nb211_fixO(%esp)
        movapd  %xmm4,nb211_fiyO(%esp)
        movapd  %xmm7,nb211_fizO(%esp)
        ## update j forces with water O
        movapd  %xmm0,nb211_fjx(%esp)
        movapd  %xmm1,nb211_fjy(%esp)
        movapd  %xmm2,nb211_fjz(%esp)

        ## H1 interactions
        movapd  %xmm6,%xmm4
        mulpd   %xmm4,%xmm4  ## xmm6=rinv, xmm4=rinvsq
        movapd  %xmm6,%xmm7
        movapd  nb211_krsqH1(%esp),%xmm0
        addpd   %xmm0,%xmm6  ## xmm6=rinv+ krsq
        mulpd   nb211_two(%esp),%xmm0
        subpd   nb211_crf(%esp),%xmm6
        subpd   %xmm0,%xmm7  ## xmm7=rinv-2*krsq
        mulpd   nb211_qqH(%esp),%xmm6  ## vcoul
        mulpd   nb211_qqH(%esp),%xmm7
        mulpd   %xmm7,%xmm4  ## total fsH1 in xmm4

        addpd   nb211_vctot(%esp),%xmm6

        movapd  nb211_dxH1(%esp),%xmm0
        movapd  nb211_dyH1(%esp),%xmm1
        movapd  nb211_dzH1(%esp),%xmm2
        movapd  %xmm6,nb211_vctot(%esp)
        mulpd   %xmm4,%xmm0
        mulpd   %xmm4,%xmm1
        mulpd   %xmm4,%xmm2

        ## update H1 forces
        movapd  nb211_fixH1(%esp),%xmm3
        movapd  nb211_fiyH1(%esp),%xmm4
        movapd  nb211_fizH1(%esp),%xmm7
        addpd   %xmm0,%xmm3
        addpd   %xmm1,%xmm4
        addpd   %xmm2,%xmm7
        movapd  %xmm3,nb211_fixH1(%esp)
        movapd  %xmm4,nb211_fiyH1(%esp)
        movapd  %xmm7,nb211_fizH1(%esp)
        ## update j forces with water H1
        addpd   nb211_fjx(%esp),%xmm0
        addpd   nb211_fjy(%esp),%xmm1
        addpd   nb211_fjz(%esp),%xmm2
        movapd  %xmm0,nb211_fjx(%esp)
        movapd  %xmm1,nb211_fjy(%esp)
        movapd  %xmm2,nb211_fjz(%esp)

        ## H2 interactions
        movapd  %xmm5,%xmm4
        mulpd   %xmm4,%xmm4  ## xmm5=rinv, xmm4=rinvsq
        movapd  %xmm5,%xmm7
        movapd  nb211_krsqH2(%esp),%xmm0
        addpd   %xmm0,%xmm5  ## xmm5=rinv+ krsq
        mulpd   nb211_two(%esp),%xmm0
        subpd   nb211_crf(%esp),%xmm5
        subpd   %xmm0,%xmm7  ## xmm7=rinv-2*krsq
        mulpd   nb211_qqH(%esp),%xmm5  ## vcoul
        mulpd   nb211_qqH(%esp),%xmm7
        mulpd   %xmm7,%xmm4  ## total fsH2 in xmm4

        addpd   nb211_vctot(%esp),%xmm5

        movapd  nb211_dxH2(%esp),%xmm0
        movapd  nb211_dyH2(%esp),%xmm1
        movapd  nb211_dzH2(%esp),%xmm2
        movapd  %xmm5,nb211_vctot(%esp)
        mulpd   %xmm4,%xmm0
        mulpd   %xmm4,%xmm1
        mulpd   %xmm4,%xmm2

        ## update H2 forces
        movapd  nb211_fixH2(%esp),%xmm3
        movapd  nb211_fiyH2(%esp),%xmm4
        movapd  nb211_fizH2(%esp),%xmm7
        addpd   %xmm0,%xmm3
        addpd   %xmm1,%xmm4
        addpd   %xmm2,%xmm7
        movapd  %xmm3,nb211_fixH2(%esp)
        movapd  %xmm4,nb211_fiyH2(%esp)
        movapd  %xmm7,nb211_fizH2(%esp)

        movl    nb211_faction(%ebp),%edi
        ## update j forces
        addpd   nb211_fjx(%esp),%xmm0
        addpd   nb211_fjy(%esp),%xmm1
        addpd   nb211_fjz(%esp),%xmm2

        ## gather the two j force vectors: low lane via eax, high lane via ebx
        movlpd  (%edi,%eax,8),%xmm3
        movlpd  8(%edi,%eax,8),%xmm4
        movlpd  16(%edi,%eax,8),%xmm5
        movhpd  (%edi,%ebx,8),%xmm3
        movhpd  8(%edi,%ebx,8),%xmm4
        movhpd  16(%edi,%ebx,8),%xmm5
        ## the accumulated force is subtracted from the j entries
        subpd   %xmm0,%xmm3
        subpd   %xmm1,%xmm4
        subpd   %xmm2,%xmm5
        movlpd  %xmm3,(%edi,%eax,8)
        movlpd  %xmm4,8(%edi,%eax,8)
        movlpd  %xmm5,16(%edi,%eax,8)
        movhpd  %xmm3,(%edi,%ebx,8)
        movhpd  %xmm4,8(%edi,%ebx,8)
        movhpd  %xmm5,16(%edi,%ebx,8)

        ## should we do one more iteration?
        subl    $2,nb211_innerk(%esp)  ## innerk -= 2
        jl      _nb_kernel211_ia32_sse2.nb211_checksingle  ## negative: leave the unrolled loop
        jmp     _nb_kernel211_ia32_sse2.nb211_unroll_loop
_nb_kernel211_ia32_sse2.nb211_checksingle:
        movl    nb211_innerk(%esp),%edx
        andl    $1,%edx  ## low bit set -> one j entry still to process
        jnz     _nb_kernel211_ia32_sse2.nb211_dosingle
        jmp     _nb_kernel211_ia32_sse2.nb211_updateouterdata
_nb_kernel211_ia32_sse2.nb211_dosingle:
        movl    nb211_innerjjnr(%esp),%edx  ## pointer to jjnr[k]
        movl    (%edx),%eax
        addl    $4,nb211_innerjjnr(%esp)  ## advance the jjnr pointer by one 4-byte entry

        movl    nb211_charge(%ebp),%esi  ## base of charge[]
        xorpd   %xmm3,%xmm3  ## clear both lanes; only the low lane is loaded next
        movlpd  (%esi,%eax,8),%xmm3
        movapd  %xmm3,%xmm4
        mulpd   nb211_iqO(%esp),%xmm3
        mulpd   nb211_iqH(%esp),%xmm4

        movd    %eax,%mm0  ## use mmx registers as temp storage
        movapd  %xmm3,nb211_qqO(%esp)
        movapd  %xmm4,nb211_qqH(%esp)

        movl    nb211_type(%ebp),%esi
        movl    (%esi,%eax,4),%eax
        movl    nb211_vdwparam(%ebp),%esi
        shll    %eax  ## eax = 2*type[jnr]
        movl    nb211_ntia(%esp),%edi
        addl    %edi,%eax  ## ... + ntia -> index into vdwparam[]

        movlpd  (%esi,%eax,8),%xmm6  ## c6a
        movhpd  8(%esi,%eax,8),%xmm6  ## c6a c12a

        ## split the (c6,c12) pair: xmm4 = (c6, 0), xmm6 = (c12, 0)
        xorpd   %xmm7,%xmm7
        movapd  %xmm6,%xmm4
        unpcklpd %xmm7,%xmm4
        unpckhpd %xmm7,%xmm6

        movd    %mm0,%eax  ## restore jnr saved above
        movapd  %xmm4,nb211_c6(%esp)
        movapd  %xmm6,nb211_c12(%esp)

        movl    nb211_pos(%ebp),%esi  ## base of pos[]
        leal    (%eax,%eax,2),%eax  ## replace jnr with j3

        ## move coordinates to xmm0-xmm2
        movlpd  (%esi,%eax,8),%xmm0
        movlpd  8(%esi,%eax,8),%xmm1
        movlpd  16(%esi,%eax,8),%xmm2

        ## move ixO-izO to xmm4-xmm6
        movapd  nb211_ixO(%esp),%xmm4
        movapd  nb211_iyO(%esp),%xmm5
        movapd  nb211_izO(%esp),%xmm6

        ## calc dr (scalar ops: only the low lane is computed from here on)
        subsd   %xmm0,%xmm4
        subsd   %xmm1,%xmm5
        subsd   %xmm2,%xmm6

        ## store dr
        movapd  %xmm4,nb211_dxO(%esp)
        movapd  %xmm5,nb211_dyO(%esp)
        movapd  %xmm6,nb211_dzO(%esp)
        ## square it
        mulsd   %xmm4,%xmm4
        mulsd   %xmm5,%xmm5
        mulsd   %xmm6,%xmm6
        addsd   %xmm5,%xmm4
        addsd   %xmm6,%xmm4
        movapd  %xmm4,%xmm7  ## rsqO in xmm7

        ## move ixH1-izH1 to xmm4-xmm6
        movapd  nb211_ixH1(%esp),%xmm4
        movapd  nb211_iyH1(%esp),%xmm5
        movapd  nb211_izH1(%esp),%xmm6

        ## calc dr
        subsd   %xmm0,%xmm4
        subsd   %xmm1,%xmm5
        subsd   %xmm2,%xmm6

        ## store dr
        movapd  %xmm4,nb211_dxH1(%esp)
        movapd  %xmm5,nb211_dyH1(%esp)
        movapd  %xmm6,nb211_dzH1(%esp)
        ## square it
        mulsd   %xmm4,%xmm4
        mulsd   %xmm5,%xmm5
        mulsd   %xmm6,%xmm6
        addsd   %xmm5,%xmm6
        addsd   %xmm4,%xmm6
        ## rsqH1 in xmm6

        ## move ixH2-izH2 to xmm3-xmm5
        movapd  nb211_ixH2(%esp),%xmm3
        movapd  nb211_iyH2(%esp),%xmm4
        movapd  nb211_izH2(%esp),%xmm5

        ## calc dr
        subsd   %xmm0,%xmm3
        subsd   %xmm1,%xmm4
        subsd   %xmm2,%xmm5

        ## store dr
        movapd  %xmm3,nb211_dxH2(%esp)
        movapd  %xmm4,nb211_dyH2(%esp)
        movapd  %xmm5,nb211_dzH2(%esp)
        ## square it
        mulsd   %xmm3,%xmm3
        mulsd   %xmm4,%xmm4
        mulsd   %xmm5,%xmm5
        addsd   %xmm4,%xmm5
        addsd   %xmm3,%xmm5
        ## rsqH2 in xmm5, rsqH1 in xmm6, rsqO in xmm7
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?