nb_kernel110_ia32_sse2.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 1,175 行 · 第 1/3 页
S
1,175 行
## ---------------------------------------------------------------------------
## Tail of nb_kernel110nf_ia32_sse2  (ia32, SSE2, GNU as AT&T syntax).
##
## The routine's entry point, its stack-frame setup, the nb110nf_* offset
## definitions and the .nb110nf_threadloop / .nb110nf_spinlock labels lie
## above this excerpt and are only referenced here.
##
## Operand conventions visible in this excerpt:
##   nb110nf_xxx(%ebp)  values reached through the frame pointer
##                      (presumably the routine's arguments -- confirm
##                      against the prologue)
##   nb110nf_xxx(%esp)  scratch slots in the local stack area; the ones
##                      accessed with movapd must be 16-byte aligned
##
## What the visible code does:
##   1. finishes claiming a work unit [nn0,nn1) of outer-list indices with
##      lock cmpxchg, clamping nn1 to nri;
##   2. for every outer index n: builds the shifted i-particle position and
##      the scaled i charge, then walks jjnr[jindex[n] .. jindex[n+1]) two
##      entries per iteration (plus one single-entry remainder), adding
##      qq*rinv to vctot and c12*rinv^12 - c6*rinv^6 to Vvdwtot;
##   3. adds both sums into Vc[gid[n]] and Vvdw[gid[n]];
##   4. on exit stores the nouter/ninner counters and unwinds the stack.
## No force values are written anywhere in this excerpt.
## ---------------------------------------------------------------------------

        ## If *counter still equals %eax (nn0): write nn1 (%ebx) to *counter.
        ## Otherwise: reload *counter into %eax and leave ZF clear.
        lock
        cmpxchgl %ebx,(%esi)
        pause                                   ## -> better p4 performance
        jnz     _nb_kernel110nf_ia32_sse2.nb110nf_spinlock      ## lost the race, retry

        ## if(nn1>nri) nn1=nri
        movl    nb110nf_nri(%esp),%ecx
        movl    %ecx,%edx
        subl    %ebx,%ecx                       ## ecx = nri - nn1
        cmovlel %edx,%ebx                       ## nri <= nn1  ->  nn1 = nri

        ## Cleared the spinlock if we got here.
        ## eax contains nn0, ebx contains nn1.
        movl    %eax,nb110nf_n(%esp)
        movl    %ebx,nb110nf_nn1(%esp)
        subl    %eax,%ebx                       ## calc number of outer lists (sets flags)
        movl    %eax,%esi                       ## copy n to esi (movl leaves flags alone)
        jg      _nb_kernel110nf_ia32_sse2.nb110nf_outerstart
        jmp     _nb_kernel110nf_ia32_sse2.nb110nf_end

_nb_kernel110nf_ia32_sse2.nb110nf_outerstart:
        ## ebx contains number of outer iterations; add it to the running total
        addl    nb110nf_nouter(%esp),%ebx
        movl    %ebx,nb110nf_nouter(%esp)

_nb_kernel110nf_ia32_sse2.nb110nf_outer:
        ## --- one outer-list entry; esi = n on entry ---
        movl    nb110nf_shift(%ebp),%eax        ## eax = pointer into shift[]
        movl    (%eax,%esi,4),%ebx              ## ebx = shift[n]
        leal    (%ebx,%ebx,2),%ebx              ## ebx = 3*is

        movl    nb110nf_shiftvec(%ebp),%eax     ## eax = base of shiftvec[]
        movsd   (%eax,%ebx,8),%xmm0             ## shift x
        movsd   8(%eax,%ebx,8),%xmm1            ## shift y
        movsd   16(%eax,%ebx,8),%xmm2           ## shift z

        movl    nb110nf_iinr(%ebp),%ecx         ## ecx = pointer into iinr[]
        movl    (%ecx,%esi,4),%ebx              ## ebx = ii

        movl    nb110nf_charge(%ebp),%edx
        movsd   (%edx,%ebx,8),%xmm3             ## charge[ii]
        mulsd   nb110nf_facel(%esp),%xmm3       ## iq = facel*charge[ii]
        shufpd  $0,%xmm3,%xmm3                  ## copy iq into both halves

        movl    nb110nf_type(%ebp),%edx
        movl    (%edx,%ebx,4),%edx              ## type[ii]
        imull   nb110nf_ntype(%esp),%edx
        shll    %edx                            ## ntia = 2*ntype*type[ii]
        movl    %edx,nb110nf_ntia(%esp)

        leal    (%ebx,%ebx,2),%ebx              ## ebx = 3*ii = ii3
        movl    nb110nf_pos(%ebp),%eax          ## eax = base of pos[]

        addsd   (%eax,%ebx,8),%xmm0             ## ix = shift x + pos[ii3]
        addsd   8(%eax,%ebx,8),%xmm1            ## iy = shift y + pos[ii3+1]
        addsd   16(%eax,%ebx,8),%xmm2           ## iz = shift z + pos[ii3+2]

        movapd  %xmm3,nb110nf_iq(%esp)

        shufpd  $0,%xmm0,%xmm0                  ## copy each coordinate into
        shufpd  $0,%xmm1,%xmm1                  ## both halves for the
        shufpd  $0,%xmm2,%xmm2                  ## two-wide inner loop

        movapd  %xmm0,nb110nf_ix(%esp)
        movapd  %xmm1,nb110nf_iy(%esp)
        movapd  %xmm2,nb110nf_iz(%esp)

        movl    %ebx,nb110nf_ii3(%esp)

        ## clear vctot and Vvdwtot
        xorpd   %xmm4,%xmm4
        movapd  %xmm4,nb110nf_vctot(%esp)
        movapd  %xmm4,nb110nf_Vvdwtot(%esp)

        movl    nb110nf_jindex(%ebp),%eax
        movl    (%eax,%esi,4),%ecx              ## jindex[n]
        movl    4(%eax,%esi,4),%edx             ## jindex[n+1]
        subl    %ecx,%edx                       ## number of innerloop atoms

        ## NOTE(review): every path below reloads %esi before reading it,
        ## so this load looks dead; kept as in the original.
        movl    nb110nf_pos(%ebp),%esi
        movl    nb110nf_jjnr(%ebp),%eax
        shll    $2,%ecx                         ## byte offset of jjnr[nj0]
        addl    %ecx,%eax
        movl    %eax,nb110nf_innerjjnr(%esp)    ## pointer to jjnr[nj0]
        movl    %edx,%ecx
        subl    $2,%edx                         ## innerk = atoms - 2
        addl    nb110nf_ninner(%esp),%ecx       ## ninner += atoms
        movl    %ecx,nb110nf_ninner(%esp)
        addl    $0,%edx                         ## re-derive flags from innerk for the jge
        movl    %edx,nb110nf_innerk(%esp)       ## innerloop counter (movl keeps flags)
        jge     _nb_kernel110nf_ia32_sse2.nb110nf_unroll_loop
        jmp     _nb_kernel110nf_ia32_sse2.nb110nf_checksingle

_nb_kernel110nf_ia32_sse2.nb110nf_unroll_loop:
        ## --- twice unrolled innerloop: j atoms a (low half) and b (high half) ---
        movl    nb110nf_innerjjnr(%esp),%edx    ## pointer to jjnr[k]
        movl    (%edx),%eax                     ## jnr a
        movl    4(%edx),%ebx                    ## jnr b
        addl    $8,nb110nf_innerjjnr(%esp)      ## advance pointer (unrolled 2)

        movl    nb110nf_charge(%ebp),%esi       ## base of charge[]

        movlpd  (%esi,%eax,8),%xmm3             ## charge[a]
        movhpd  (%esi,%ebx,8),%xmm3             ## charge[b]

        movapd  nb110nf_iq(%esp),%xmm5
        mulpd   %xmm5,%xmm3                     ## qq

        movd    %eax,%mm0                       ## use mmx registers as temp storage
        movd    %ebx,%mm1                       ## (jnr a / jnr b, restored below)

        movl    nb110nf_type(%ebp),%esi
        movl    (%esi,%eax,4),%eax              ## type[a]
        movl    (%esi,%ebx,4),%ebx              ## type[b]
        movl    nb110nf_vdwparam(%ebp),%esi
        shll    %eax                            ## 2*type[a]
        shll    %ebx                            ## 2*type[b]
        movl    nb110nf_ntia(%esp),%edi
        addl    %edi,%eax                       ## vdwparam index for a
        addl    %edi,%ebx                       ## vdwparam index for b

        movlpd  (%esi,%eax,8),%xmm6             ## c6a
        movlpd  (%esi,%ebx,8),%xmm7             ## c6b
        movhpd  8(%esi,%eax,8),%xmm6            ## c6a c12a
        movhpd  8(%esi,%ebx,8),%xmm7            ## c6b c12b

        movapd  %xmm6,%xmm4
        unpcklpd %xmm7,%xmm4                    ## xmm4 = c6a  c6b
        unpckhpd %xmm7,%xmm6                    ## xmm6 = c12a c12b

        movd    %mm0,%eax                       ## restore jnr a
        movd    %mm1,%ebx                       ## restore jnr b
        movapd  %xmm4,nb110nf_c6(%esp)
        movapd  %xmm6,nb110nf_c12(%esp)

        movl    nb110nf_pos(%ebp),%esi          ## base of pos[]
        leal    (%eax,%eax,2),%eax              ## replace jnr with j3
        leal    (%ebx,%ebx,2),%ebx

        ## move two coordinates to xmm0-xmm2
        movlpd  (%esi,%eax,8),%xmm0
        movlpd  8(%esi,%eax,8),%xmm1
        movlpd  16(%esi,%eax,8),%xmm2
        movhpd  (%esi,%ebx,8),%xmm0
        movhpd  8(%esi,%ebx,8),%xmm1
        movhpd  16(%esi,%ebx,8),%xmm2

        ## move ix-iz to xmm4-xmm6
        movapd  nb110nf_ix(%esp),%xmm4
        movapd  nb110nf_iy(%esp),%xmm5
        movapd  nb110nf_iz(%esp),%xmm6

        ## calc dr
        subpd   %xmm0,%xmm4
        subpd   %xmm1,%xmm5
        subpd   %xmm2,%xmm6

        ## square it
        mulpd   %xmm4,%xmm4
        mulpd   %xmm5,%xmm5
        mulpd   %xmm6,%xmm6
        addpd   %xmm5,%xmm4
        addpd   %xmm6,%xmm4                     ## rsq in xmm4

        ## single-precision reciprocal-sqrt estimate, widened back to double
        cvtpd2ps %xmm4,%xmm5
        rsqrtps %xmm5,%xmm5
        cvtps2pd %xmm5,%xmm2                    ## lookup seed lu in xmm2

        ## refinement 1: lu = half * lu * (three - rsq*lu*lu)
        ## (presumably the three/half slots hold 3.0 and 0.5 -- they are
        ##  initialised outside this excerpt)
        movapd  %xmm2,%xmm5                     ## copy of lu
        mulpd   %xmm2,%xmm2                     ## lu*lu
        movapd  nb110nf_three(%esp),%xmm1
        mulpd   %xmm4,%xmm2                     ## rsq*lu*lu
        movapd  nb110nf_half(%esp),%xmm0
        subpd   %xmm2,%xmm1                     ## three - rsq*lu*lu
        mulpd   %xmm5,%xmm1
        mulpd   %xmm0,%xmm1                     ## xmm1 = iter1 of rinv (new lu)

        ## refinement 2: same update once more, result lands in xmm0
        movapd  %xmm1,%xmm5                     ## copy of lu
        mulpd   %xmm1,%xmm1                     ## lu*lu
        movapd  nb110nf_three(%esp),%xmm2
        mulpd   %xmm4,%xmm1                     ## rsq*lu*lu
        movapd  nb110nf_half(%esp),%xmm0
        subpd   %xmm1,%xmm2                     ## three - rsq*lu*lu
        mulpd   %xmm5,%xmm2
        mulpd   %xmm2,%xmm0                     ## xmm0 = rinv

        movapd  %xmm0,%xmm4
        mulpd   %xmm4,%xmm4                     ## xmm4 = rinvsq
        movapd  %xmm4,%xmm1
        mulpd   %xmm4,%xmm1
        mulpd   %xmm4,%xmm1                     ## xmm1 = rinvsix
        movapd  %xmm1,%xmm2
        mulpd   %xmm2,%xmm2                     ## xmm2 = rinvtwelve
        mulpd   %xmm0,%xmm3                     ## xmm3 = vcoul = qq*rinv
        mulpd   nb110nf_c6(%esp),%xmm1          ## Vvdw6  = c6*rinvsix
        mulpd   nb110nf_c12(%esp),%xmm2         ## Vvdw12 = c12*rinvtwelve
        movapd  %xmm2,%xmm5
        subpd   %xmm1,%xmm5                     ## Vvdw = Vvdw12-Vvdw6
        addpd   nb110nf_Vvdwtot(%esp),%xmm5
        addpd   nb110nf_vctot(%esp),%xmm3
        movapd  %xmm3,nb110nf_vctot(%esp)
        movapd  %xmm5,nb110nf_Vvdwtot(%esp)

        ## should we do one more iteration?
        subl    $2,nb110nf_innerk(%esp)
        jl      _nb_kernel110nf_ia32_sse2.nb110nf_checksingle
        jmp     _nb_kernel110nf_ia32_sse2.nb110nf_unroll_loop

_nb_kernel110nf_ia32_sse2.nb110nf_checksingle:
        ## innerk is odd here exactly when one j atom is still unprocessed
        movl    nb110nf_innerk(%esp),%edx
        andl    $1,%edx
        jnz     _nb_kernel110nf_ia32_sse2.nb110nf_dosingle
        jmp     _nb_kernel110nf_ia32_sse2.nb110nf_updateouterdata

_nb_kernel110nf_ia32_sse2.nb110nf_dosingle:
        ## --- remainder: one j atom, scalar (low-half) arithmetic only ---
        movl    nb110nf_charge(%ebp),%esi
        ## NOTE(review): %edi is reloaded with ntia below before it is read,
        ## so this load looks dead; kept as in the original.
        movl    nb110nf_pos(%ebp),%edi
        movl    nb110nf_innerjjnr(%esp),%ecx
        movl    (%ecx),%eax                     ## jnr

        xorpd   %xmm3,%xmm3
        movlpd  (%esi,%eax,8),%xmm3             ## charge[jnr]
        movapd  nb110nf_iq(%esp),%xmm5
        mulsd   %xmm5,%xmm3                     ## qq

        movd    %eax,%mm0                       ## use mmx registers as temp storage
        movl    nb110nf_type(%ebp),%esi
        movl    (%esi,%eax,4),%eax              ## type[jnr]
        movl    nb110nf_vdwparam(%ebp),%esi
        shll    %eax                            ## 2*type[jnr]
        movl    nb110nf_ntia(%esp),%edi
        addl    %edi,%eax                       ## vdwparam index

        movlpd  (%esi,%eax,8),%xmm6             ## c6a
        movhpd  8(%esi,%eax,8),%xmm6            ## c6a c12a
        xorpd   %xmm7,%xmm7
        movapd  %xmm6,%xmm4
        unpcklpd %xmm7,%xmm4                    ## xmm4 = c6a  0
        unpckhpd %xmm7,%xmm6                    ## xmm6 = c12a 0

        movd    %mm0,%eax                       ## restore jnr
        movapd  %xmm4,nb110nf_c6(%esp)
        movapd  %xmm6,nb110nf_c12(%esp)

        movl    nb110nf_pos(%ebp),%esi          ## base of pos[]
        leal    (%eax,%eax,2),%eax              ## replace jnr with j3

        ## move the single j coordinate to the low halves of xmm0-xmm2
        movlpd  (%esi,%eax,8),%xmm0
        movlpd  8(%esi,%eax,8),%xmm1
        movlpd  16(%esi,%eax,8),%xmm2

        ## move ix-iz to xmm4-xmm6
        movapd  nb110nf_ix(%esp),%xmm4
        movapd  nb110nf_iy(%esp),%xmm5
        movapd  nb110nf_iz(%esp),%xmm6

        ## calc dr
        subsd   %xmm0,%xmm4
        subsd   %xmm1,%xmm5
        subsd   %xmm2,%xmm6

        ## square it
        mulsd   %xmm4,%xmm4
        mulsd   %xmm5,%xmm5
        mulsd   %xmm6,%xmm6
        addsd   %xmm5,%xmm4
        addsd   %xmm6,%xmm4                     ## rsq in xmm4

        ## single-precision reciprocal-sqrt estimate, widened back to double
        cvtsd2ss %xmm4,%xmm5
        rsqrtss %xmm5,%xmm5
        cvtss2sd %xmm5,%xmm2                    ## lookup seed lu in low xmm2

        ## refinement 1: lu = half * lu * (three - rsq*lu*lu)
        movapd  %xmm2,%xmm5                     ## copy of lu
        mulsd   %xmm2,%xmm2                     ## lu*lu
        movapd  nb110nf_three(%esp),%xmm1
        mulsd   %xmm4,%xmm2                     ## rsq*lu*lu
        movapd  nb110nf_half(%esp),%xmm0
        subsd   %xmm2,%xmm1                     ## three - rsq*lu*lu
        mulsd   %xmm5,%xmm1
        mulsd   %xmm0,%xmm1                     ## xmm1 = iter1 of rinv (new lu)

        ## refinement 2: same update once more, result lands in xmm0
        movapd  %xmm1,%xmm5                     ## copy of lu
        mulsd   %xmm1,%xmm1                     ## lu*lu
        movapd  nb110nf_three(%esp),%xmm2
        mulsd   %xmm4,%xmm1                     ## rsq*lu*lu
        movapd  nb110nf_half(%esp),%xmm0
        subsd   %xmm1,%xmm2                     ## three - rsq*lu*lu
        mulsd   %xmm5,%xmm2
        mulsd   %xmm2,%xmm0                     ## xmm0 = rinv

        movapd  %xmm0,%xmm4
        mulsd   %xmm4,%xmm4                     ## xmm4 = rinvsq
        movapd  %xmm4,%xmm1
        mulsd   %xmm4,%xmm1
        mulsd   %xmm4,%xmm1                     ## xmm1 = rinvsix
        movapd  %xmm1,%xmm2
        mulsd   %xmm2,%xmm2                     ## xmm2 = rinvtwelve
        mulsd   %xmm0,%xmm3                     ## xmm3 = vcoul = qq*rinv
        mulsd   nb110nf_c6(%esp),%xmm1          ## Vvdw6  = c6*rinvsix
        mulsd   nb110nf_c12(%esp),%xmm2         ## Vvdw12 = c12*rinvtwelve
        movapd  %xmm2,%xmm5
        subsd   %xmm1,%xmm5                     ## Vvdw = Vvdw12-Vvdw6
        addsd   nb110nf_Vvdwtot(%esp),%xmm5
        addsd   nb110nf_vctot(%esp),%xmm3
        ## store only the low halves; the high halves of both accumulators
        ## in memory keep the sums from the two-wide loop
        movlpd  %xmm3,nb110nf_vctot(%esp)
        movlpd  %xmm5,nb110nf_Vvdwtot(%esp)

_nb_kernel110nf_ia32_sse2.nb110nf_updateouterdata:
        ## get n from stack
        movl    nb110nf_n(%esp),%esi
        ## get group index for i particle
        movl    nb110nf_gid(%ebp),%edx          ## base of gid[]
        movl    (%edx,%esi,4),%edx              ## ggid = gid[n]

        ## accumulate total potential energy and update it
        movapd  nb110nf_vctot(%esp),%xmm7
        movhlps %xmm7,%xmm6                     ## low xmm6 = high half of xmm7
        addsd   %xmm6,%xmm7                     ## low xmm7 has the sum now

        ## add earlier value from mem
        movl    nb110nf_Vc(%ebp),%eax
        addsd   (%eax,%edx,8),%xmm7
        ## move back to mem
        movsd   %xmm7,(%eax,%edx,8)             ## Vc[ggid] += vctot

        ## accumulate total lj energy and update it
        movapd  nb110nf_Vvdwtot(%esp),%xmm7
        movhlps %xmm7,%xmm6                     ## low xmm6 = high half of xmm7
        addsd   %xmm6,%xmm7                     ## low xmm7 has the sum now

        ## add earlier value from mem
        movl    nb110nf_Vvdw(%ebp),%eax
        addsd   (%eax,%edx,8),%xmm7
        ## move back to mem
        movsd   %xmm7,(%eax,%edx,8)             ## Vvdw[ggid] += Vvdwtot

        ## finish if last
        movl    nb110nf_nn1(%esp),%ecx
        ## esi already loaded with n
        incl    %esi
        subl    %esi,%ecx                       ## nn1 - (n+1)
        jz      _nb_kernel110nf_ia32_sse2.nb110nf_outerend

        ## not last, iterate outer loop once more!
        movl    %esi,nb110nf_n(%esp)
        jmp     _nb_kernel110nf_ia32_sse2.nb110nf_outer

_nb_kernel110nf_ia32_sse2.nb110nf_outerend:
        ## check if more outer neighborlists remain
        movl    nb110nf_nri(%esp),%ecx
        ## esi already loaded with n above
        subl    %esi,%ecx                       ## nri - n
        jz      _nb_kernel110nf_ia32_sse2.nb110nf_end
        ## non-zero, do one more workunit
        jmp     _nb_kernel110nf_ia32_sse2.nb110nf_threadloop

_nb_kernel110nf_ia32_sse2.nb110nf_end:
        emms                                    ## leave MMX state (mm0/mm1 were used as scratch)

        ## report the iteration counters through the outeriter/inneriter pointers
        movl    nb110nf_nouter(%esp),%eax
        movl    nb110nf_ninner(%esp),%ebx
        movl    nb110nf_outeriter(%ebp),%ecx
        movl    nb110nf_inneriter(%ebp),%edx
        movl    %eax,(%ecx)
        movl    %ebx,(%edx)

        ## Unwind the stack. Presumably salign is the alignment padding and
        ## 192 the size of the local area reserved by the prologue, and the
        ## pops mirror the prologue's pushes -- confirm against the entry code,
        ## which is outside this excerpt.
        movl    nb110nf_salign(%esp),%eax
        addl    %eax,%esp
        addl    $192,%esp
        popl    %edi
        popl    %esi
        popl    %edx
        popl    %ecx
        popl    %ebx
        popl    %eax
        leave
        ret
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?