nb_kernel211_ia32_sse2.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 1,984 行 · 第 1/5 页
S
1,984 行
## ---------------------------------------------------------------------
## Fragment of nb_kernel211_ia32_sse2 (GNU as, AT&T syntax, IA-32 + SSE2).
##
## Visible here: the scalar (one j particle, "sd" instructions) tail of
## the inner loop, the per-i-particle write-back (nb211_updateouterdata),
## the outer-loop exit tests and the start of the epilogue.
## NOT visible here: the routine entry, the definitions of the nb211_*
## stack/argument offsets, the labels nb211_outer and nb211_threadloop,
## and whatever follows the last popl below (nothing has been added for
## those parts).
##
## Register state assumed on entry to this fragment (inferred from the
## nb211_krsq* store names and the original comments - TODO confirm
## against the preceding code):
##   xmm5 / xmm6 / xmm7 = rsq for H2 / H1 / O
##   eax                = element index of the j particle in faction[]
##   ebp                = argument frame base, esp = aligned local frame
## ---------------------------------------------------------------------

        ## krsq = krf * rsq for the three i-sites, saved for later use
        movapd  %xmm5,%xmm0
        movapd  %xmm6,%xmm1
        movapd  %xmm7,%xmm2
        mulsd   nb211_krf(%esp),%xmm0
        mulsd   nb211_krf(%esp),%xmm1
        mulsd   nb211_krf(%esp),%xmm2
        movapd  %xmm0,nb211_krsqH2(%esp)
        movapd  %xmm1,nb211_krsqH1(%esp)
        movapd  %xmm2,nb211_krsqO(%esp)

        ## ---- 1/sqrt(rsqO) ------------------------------------------
        ## Single-precision rsqrtss seed, then two refinement steps
        ##   lu' = half * lu * (three - rsq*lu*lu)
        ## carried out in double precision.
        cvtsd2ss %xmm7,%xmm2
        rsqrtss %xmm2,%xmm2
        cvtss2sd %xmm2,%xmm2            ## xmm2 = seed lu

        movapd  %xmm2,%xmm3             ## keep lu
        mulsd   %xmm2,%xmm2             ## lu*lu
        movapd  nb211_three(%esp),%xmm4
        mulsd   %xmm7,%xmm2             ## rsq*lu*lu
        subsd   %xmm2,%xmm4             ## 3-rsq*lu*lu
        mulsd   %xmm3,%xmm4             ## lu*(3-rsq*lu*lu)
        mulsd   nb211_half(%esp),%xmm4  ## iter1 (new lu)

        movapd  %xmm4,%xmm3             ## keep lu
        mulsd   %xmm4,%xmm4             ## lu*lu
        mulsd   %xmm4,%xmm7             ## rsq*lu*lu (rsqO is consumed here)
        movapd  nb211_three(%esp),%xmm4
        subsd   %xmm7,%xmm4             ## 3-rsq*lu*lu
        mulsd   %xmm3,%xmm4             ## lu*(3-rsq*lu*lu)
        mulsd   nb211_half(%esp),%xmm4  ## rinv
        movapd  %xmm4,%xmm7             ## rinvO in xmm7

        ## ---- 1/sqrt(rsqH1) - same scheme, seed in xmm2 --------------
        cvtsd2ss %xmm6,%xmm2
        rsqrtss %xmm2,%xmm2
        cvtss2sd %xmm2,%xmm2

        movapd  %xmm2,%xmm3
        mulsd   %xmm2,%xmm2
        movapd  nb211_three(%esp),%xmm4
        mulsd   %xmm6,%xmm2             ## rsq*lu*lu
        subsd   %xmm2,%xmm4             ## 3-rsq*lu*lu
        mulsd   %xmm3,%xmm4             ## lu*(3-rsq*lu*lu)
        mulsd   nb211_half(%esp),%xmm4  ## iter1 (new lu)

        movapd  %xmm4,%xmm3
        mulsd   %xmm4,%xmm4             ## lu*lu
        mulsd   %xmm4,%xmm6             ## rsq*lu*lu
        movapd  nb211_three(%esp),%xmm4
        subsd   %xmm6,%xmm4             ## 3-rsq*lu*lu
        mulsd   %xmm3,%xmm4             ## lu*(3-rsq*lu*lu)
        mulsd   nb211_half(%esp),%xmm4  ## rinv
        movapd  %xmm4,%xmm6             ## rinvH1 in xmm6

        ## ---- 1/sqrt(rsqH2) - same scheme, seed in xmm2 --------------
        cvtsd2ss %xmm5,%xmm2
        rsqrtss %xmm2,%xmm2
        cvtss2sd %xmm2,%xmm2

        movapd  %xmm2,%xmm3
        mulsd   %xmm2,%xmm2
        movapd  nb211_three(%esp),%xmm4
        mulsd   %xmm5,%xmm2             ## rsq*lu*lu
        subsd   %xmm2,%xmm4             ## 3-rsq*lu*lu
        mulsd   %xmm3,%xmm4             ## lu*(3-rsq*lu*lu)
        mulsd   nb211_half(%esp),%xmm4  ## iter1 (new lu)

        movapd  %xmm4,%xmm3
        mulsd   %xmm4,%xmm4             ## lu*lu
        mulsd   %xmm4,%xmm5             ## rsq*lu*lu
        movapd  nb211_three(%esp),%xmm4
        subsd   %xmm5,%xmm4             ## 3-rsq*lu*lu
        mulsd   %xmm3,%xmm4             ## lu*(3-rsq*lu*lu)
        mulsd   nb211_half(%esp),%xmm4  ## rinv
        movapd  %xmm4,%xmm5             ## rinvH2 in xmm5

        ## ---- O interactions ----------------------------------------
        ## Vvdw  = c12*rinv^12 - c6*rinv^6
        ## vcoul = qqO*(rinv + krsq - crf)
        ## fs    = (twelve*c12*rinv^12 - six*c6*rinv^6
        ##          + qqO*(rinv - two*krsq)) * rinvsq
        movapd  %xmm7,%xmm4
        mulsd   %xmm4,%xmm4             ## xmm7=rinv, xmm4=rinvsq
        movapd  %xmm4,%xmm1
        mulsd   %xmm4,%xmm1
        mulsd   %xmm4,%xmm1             ## xmm1=rinvsix
        movapd  %xmm1,%xmm2
        mulsd   %xmm2,%xmm2             ## xmm2=rinvtwelve
        mulsd   nb211_c6(%esp),%xmm1
        mulsd   nb211_c12(%esp),%xmm2
        movapd  %xmm2,%xmm3
        subsd   %xmm1,%xmm3             ## Vvdw=Vvdw12-Vvdw6
        addsd   nb211_Vvdwtot(%esp),%xmm3
        mulsd   nb211_six(%esp),%xmm1
        mulsd   nb211_twelve(%esp),%xmm2
        subsd   %xmm1,%xmm2             ## nb part of fs

        movapd  %xmm7,%xmm0
        movapd  nb211_krsqO(%esp),%xmm1
        addsd   %xmm1,%xmm0
        mulsd   nb211_two(%esp),%xmm1
        subsd   nb211_crf(%esp),%xmm0   ## xmm0=rinv+krsq-crf
        subsd   %xmm1,%xmm7             ## xmm7=rinv-2*krsq
        mulsd   nb211_qqO(%esp),%xmm0   ## vcoul
        mulsd   nb211_qqO(%esp),%xmm7
        addsd   %xmm7,%xmm2
        mulsd   %xmm2,%xmm4             ## total fsO in xmm4

        addsd   nb211_vctot(%esp),%xmm0
        ## movlpd stores only the low quadword, so the high halves of the
        ## Vvdwtot/vctot slots are left as they were.
        movlpd  %xmm3,nb211_Vvdwtot(%esp)
        movlpd  %xmm0,nb211_vctot(%esp)

        movapd  nb211_dxO(%esp),%xmm0
        movapd  nb211_dyO(%esp),%xmm1
        movapd  nb211_dzO(%esp),%xmm2
        mulsd   %xmm4,%xmm0             ## fs*dx
        mulsd   %xmm4,%xmm1             ## fs*dy
        mulsd   %xmm4,%xmm2             ## fs*dz

        ## update O forces
        movapd  nb211_fixO(%esp),%xmm3
        movapd  nb211_fiyO(%esp),%xmm4
        movapd  nb211_fizO(%esp),%xmm7
        addsd   %xmm0,%xmm3
        addsd   %xmm1,%xmm4
        addsd   %xmm2,%xmm7
        movlpd  %xmm3,nb211_fixO(%esp)
        movlpd  %xmm4,nb211_fiyO(%esp)
        movlpd  %xmm7,nb211_fizO(%esp)

        ## start the j-force accumulator with the water O contribution
        movlpd  %xmm0,nb211_fjx(%esp)
        movlpd  %xmm1,nb211_fjy(%esp)
        movlpd  %xmm2,nb211_fjz(%esp)

        ## ---- H1 interactions (reaction-field term only, no c6/c12) --
        movapd  %xmm6,%xmm4
        mulsd   %xmm4,%xmm4             ## xmm6=rinv, xmm4=rinvsq
        movapd  %xmm6,%xmm7
        movapd  nb211_krsqH1(%esp),%xmm0
        addsd   %xmm0,%xmm6             ## xmm6=rinv+krsq
        mulsd   nb211_two(%esp),%xmm0
        subsd   nb211_crf(%esp),%xmm6
        subsd   %xmm0,%xmm7             ## xmm7=rinv-2*krsq
        mulsd   nb211_qqH(%esp),%xmm6   ## vcoul
        mulsd   nb211_qqH(%esp),%xmm7
        mulsd   %xmm7,%xmm4             ## total fsH1 in xmm4
        addsd   nb211_vctot(%esp),%xmm6

        movapd  nb211_dxH1(%esp),%xmm0
        movapd  nb211_dyH1(%esp),%xmm1
        movapd  nb211_dzH1(%esp),%xmm2
        movlpd  %xmm6,nb211_vctot(%esp)
        mulsd   %xmm4,%xmm0
        mulsd   %xmm4,%xmm1
        mulsd   %xmm4,%xmm2

        ## update H1 forces
        movapd  nb211_fixH1(%esp),%xmm3
        movapd  nb211_fiyH1(%esp),%xmm4
        movapd  nb211_fizH1(%esp),%xmm7
        addsd   %xmm0,%xmm3
        addsd   %xmm1,%xmm4
        addsd   %xmm2,%xmm7
        movlpd  %xmm3,nb211_fixH1(%esp)
        movlpd  %xmm4,nb211_fiyH1(%esp)
        movlpd  %xmm7,nb211_fizH1(%esp)

        ## update j forces with water H1
        addsd   nb211_fjx(%esp),%xmm0
        addsd   nb211_fjy(%esp),%xmm1
        addsd   nb211_fjz(%esp),%xmm2
        movlpd  %xmm0,nb211_fjx(%esp)
        movlpd  %xmm1,nb211_fjy(%esp)
        movlpd  %xmm2,nb211_fjz(%esp)

        ## ---- H2 interactions (reaction-field term only, no c6/c12) --
        movapd  %xmm5,%xmm4
        mulsd   %xmm4,%xmm4             ## xmm5=rinv, xmm4=rinvsq
        movapd  %xmm5,%xmm7
        movapd  nb211_krsqH2(%esp),%xmm0
        addsd   %xmm0,%xmm5             ## xmm5=rinv+krsq
        mulsd   nb211_two(%esp),%xmm0
        subsd   nb211_crf(%esp),%xmm5
        subsd   %xmm0,%xmm7             ## xmm7=rinv-2*krsq
        mulsd   nb211_qqH(%esp),%xmm5   ## vcoul
        mulsd   nb211_qqH(%esp),%xmm7
        mulsd   %xmm7,%xmm4             ## total fsH2 in xmm4
        addsd   nb211_vctot(%esp),%xmm5

        movapd  nb211_dxH2(%esp),%xmm0
        movapd  nb211_dyH2(%esp),%xmm1
        movapd  nb211_dzH2(%esp),%xmm2
        movlpd  %xmm5,nb211_vctot(%esp)
        mulsd   %xmm4,%xmm0
        mulsd   %xmm4,%xmm1
        mulsd   %xmm4,%xmm2

        ## update H2 forces
        movapd  nb211_fixH2(%esp),%xmm3
        movapd  nb211_fiyH2(%esp),%xmm4
        movapd  nb211_fizH2(%esp),%xmm7
        addsd   %xmm0,%xmm3
        addsd   %xmm1,%xmm4
        addsd   %xmm2,%xmm7
        movlpd  %xmm3,nb211_fixH2(%esp)
        movlpd  %xmm4,nb211_fiyH2(%esp)
        movlpd  %xmm7,nb211_fizH2(%esp)

        movl    nb211_faction(%ebp),%edi

        ## update j forces: add the H2 part to the accumulated O+H1
        ## part, then subtract the total from faction[] at index eax
        addsd   nb211_fjx(%esp),%xmm0
        addsd   nb211_fjy(%esp),%xmm1
        addsd   nb211_fjz(%esp),%xmm2
        movlpd  (%edi,%eax,8),%xmm3
        movlpd  8(%edi,%eax,8),%xmm4
        movlpd  16(%edi,%eax,8),%xmm5
        subsd   %xmm0,%xmm3
        subsd   %xmm1,%xmm4
        subsd   %xmm2,%xmm5
        movlpd  %xmm3,(%edi,%eax,8)
        movlpd  %xmm4,8(%edi,%eax,8)
        movlpd  %xmm5,16(%edi,%eax,8)
        ## falls through into nb211_updateouterdata

## ---------------------------------------------------------------------
## Write-back for the current i molecule: reduce the two lanes of each
## i-force accumulator, add the result to faction[] (three sites, nine
## consecutive doubles starting at element ii3) and to fshift[] (three
## doubles starting at element is3), then add the reduced vctot /
## Vvdwtot to Vc[ggid] / Vvdw[ggid].
## ---------------------------------------------------------------------
_nb_kernel211_ia32_sse2.nb211_updateouterdata:
        movl    nb211_ii3(%esp),%ecx
        movl    nb211_faction(%ebp),%edi
        movl    nb211_fshift(%ebp),%esi
        movl    nb211_is3(%esp),%edx

        ## accumulate Oi forces in xmm0, xmm1, xmm2
        movapd  nb211_fixO(%esp),%xmm0
        movapd  nb211_fiyO(%esp),%xmm1
        movapd  nb211_fizO(%esp),%xmm2

        movhlps %xmm0,%xmm3             ## high lane -> low xmm3
        movhlps %xmm1,%xmm4
        movhlps %xmm2,%xmm5
        addsd   %xmm3,%xmm0
        addsd   %xmm4,%xmm1
        addsd   %xmm5,%xmm2             ## sum is in low xmm0-xmm2

        ## increment i force.  movsd from memory replaces the whole
        ## destination register (low = value, high = 0), so xmm3-xmm5
        ## need no preparation beforehand.
        movsd   (%edi,%ecx,8),%xmm3
        movsd   8(%edi,%ecx,8),%xmm4
        movsd   16(%edi,%ecx,8),%xmm5
        addsd   %xmm0,%xmm3
        addsd   %xmm1,%xmm4
        addsd   %xmm2,%xmm5
        movsd   %xmm3,(%edi,%ecx,8)
        movsd   %xmm4,8(%edi,%ecx,8)
        movsd   %xmm5,16(%edi,%ecx,8)

        ## accumulate force for fshift: xmm6 = (x, y), low xmm7 = z
        movapd  %xmm0,%xmm6
        movsd   %xmm2,%xmm7
        unpcklpd %xmm1,%xmm6

        ## accumulate H1i forces in xmm0, xmm1, xmm2
        movapd  nb211_fixH1(%esp),%xmm0
        movapd  nb211_fiyH1(%esp),%xmm1
        movapd  nb211_fizH1(%esp),%xmm2

        movhlps %xmm0,%xmm3
        movhlps %xmm1,%xmm4
        movhlps %xmm2,%xmm5
        addsd   %xmm3,%xmm0
        addsd   %xmm4,%xmm1
        addsd   %xmm5,%xmm2             ## sum is in low xmm0-xmm2

        ## increment i force
        movsd   24(%edi,%ecx,8),%xmm3
        movsd   32(%edi,%ecx,8),%xmm4
        movsd   40(%edi,%ecx,8),%xmm5
        addsd   %xmm0,%xmm3
        addsd   %xmm1,%xmm4
        addsd   %xmm2,%xmm5
        movsd   %xmm3,24(%edi,%ecx,8)
        movsd   %xmm4,32(%edi,%ecx,8)
        movsd   %xmm5,40(%edi,%ecx,8)

        ## accumulate force in xmm6/xmm7 for fshift
        addsd   %xmm2,%xmm7
        unpcklpd %xmm1,%xmm0            ## xmm0 = (x, y)
        addpd   %xmm0,%xmm6

        ## accumulate H2i forces in xmm0, xmm1, xmm2
        movapd  nb211_fixH2(%esp),%xmm0
        movapd  nb211_fiyH2(%esp),%xmm1
        movapd  nb211_fizH2(%esp),%xmm2

        movhlps %xmm0,%xmm3
        movhlps %xmm1,%xmm4
        movhlps %xmm2,%xmm5
        addsd   %xmm3,%xmm0
        addsd   %xmm4,%xmm1
        addsd   %xmm5,%xmm2             ## sum is in low xmm0-xmm2

        ## increment i force
        movsd   48(%edi,%ecx,8),%xmm3
        movsd   56(%edi,%ecx,8),%xmm4
        movsd   64(%edi,%ecx,8),%xmm5
        addsd   %xmm0,%xmm3
        addsd   %xmm1,%xmm4
        addsd   %xmm2,%xmm5
        movsd   %xmm3,48(%edi,%ecx,8)
        movsd   %xmm4,56(%edi,%ecx,8)
        movsd   %xmm5,64(%edi,%ecx,8)

        ## accumulate force in xmm6/xmm7 for fshift
        addsd   %xmm2,%xmm7
        unpcklpd %xmm1,%xmm0            ## xmm0 = (x, y)
        addpd   %xmm0,%xmm6

        ## increment fshift force
        movlpd  (%esi,%edx,8),%xmm3
        movhpd  8(%esi,%edx,8),%xmm3
        movsd   16(%esi,%edx,8),%xmm4
        addpd   %xmm6,%xmm3
        addsd   %xmm7,%xmm4
        movlpd  %xmm3,(%esi,%edx,8)
        movhpd  %xmm3,8(%esi,%edx,8)
        movsd   %xmm4,16(%esi,%edx,8)

        ## get n from stack
        movl    nb211_n(%esp),%esi
        ## get group index for i particle
        movl    nb211_gid(%ebp),%edx    ## base of gid[]
        movl    (%edx,%esi,4),%edx      ## ggid=gid[n]

        ## accumulate total potential energy and update it
        movapd  nb211_vctot(%esp),%xmm7
        movhlps %xmm7,%xmm6
        addsd   %xmm6,%xmm7             ## low xmm7 has the sum now

        ## add earlier value from mem, then move back to mem
        movl    nb211_Vc(%ebp),%eax
        addsd   (%eax,%edx,8),%xmm7
        movsd   %xmm7,(%eax,%edx,8)

        ## accumulate total lj energy and update it
        movapd  nb211_Vvdwtot(%esp),%xmm7
        movhlps %xmm7,%xmm6
        addsd   %xmm6,%xmm7             ## low xmm7 has the sum now

        ## add earlier value from mem, then move back to mem
        movl    nb211_Vvdw(%ebp),%eax
        addsd   (%eax,%edx,8),%xmm7
        movsd   %xmm7,(%eax,%edx,8)

        ## finish if last: leave the outer loop when n+1 == nn1
        movl    nb211_nn1(%esp),%ecx
        ## esi already loaded with n
        incl    %esi
        subl    %esi,%ecx
        jz      _nb_kernel211_ia32_sse2.nb211_outerend

        ## not last, iterate outer loop once more!
        movl    %esi,nb211_n(%esp)
        jmp     _nb_kernel211_ia32_sse2.nb211_outer

_nb_kernel211_ia32_sse2.nb211_outerend:
        ## check if more outer neighborlists remain (done when n == nri)
        movl    nb211_nri(%esp),%ecx
        ## esi already loaded with n above
        subl    %esi,%ecx
        jz      _nb_kernel211_ia32_sse2.nb211_end
        ## non-zero, do one more workunit
        jmp     _nb_kernel211_ia32_sse2.nb211_threadloop

_nb_kernel211_ia32_sse2.nb211_end:
        emms

        ## report the iteration counters through the caller's pointers
        movl    nb211_nouter(%esp),%eax
        movl    nb211_ninner(%esp),%ebx
        movl    nb211_outeriter(%ebp),%ecx
        movl    nb211_inneriter(%ebp),%edx
        movl    %eax,(%ecx)
        movl    %ebx,(%edx)

        ## release the local frame: the saved alignment adjustment
        ## (nb211_salign) plus the fixed 812 bytes.
        ## NOTE(review): 812 presumably mirrors a matching subl in the
        ## prologue, which is outside this view - confirm.
        movl    nb211_salign(%esp),%eax
        addl    %eax,%esp
        addl    $812,%esp

        ## restore saved registers; the rest of the epilogue lies
        ## beyond this fragment
        popl    %edi
        popl    %esi
        popl    %edx
        popl    %ecx
        popl    %ebx
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?