nb_kernel110_ia32_sse2.s
来自「最著名最快的分子模拟软件」· S 代码 · 共 1,175 行 · 第 1/3 页
S
1,175 行
## ---------------------------------------------------------------------
## Chunk of nb_kernel110_ia32_sse2.s (GNU as, AT&T syntax, IA-32 + SSE2),
## restored to one statement per line.  The chunk starts inside the
## unrolled inner loop of nb_kernel110_ia32_sse2 (the j-force write-back)
## and ends just after the spinlock label of nb_kernel110nf_ia32_sse2.
## ---------------------------------------------------------------------

        ## finish the j-force update for the two j particles:
        ## low halves belong to index eax, high halves to index ebx
        movlpd 16(%edi,%eax,8),%xmm5
        movhpd (%edi,%ebx,8),%xmm3
        movhpd 8(%edi,%ebx,8),%xmm4
        movhpd 16(%edi,%ebx,8),%xmm5
        subpd %xmm0,%xmm3
        subpd %xmm1,%xmm4
        subpd %xmm2,%xmm5
        movlpd %xmm3,(%edi,%eax,8)
        movlpd %xmm4,8(%edi,%eax,8)
        movlpd %xmm5,16(%edi,%eax,8)
        movhpd %xmm3,(%edi,%ebx,8)
        movhpd %xmm4,8(%edi,%ebx,8)
        movhpd %xmm5,16(%edi,%ebx,8)

        ## should we do one more iteration?
        subl $2,nb110_innerk(%esp)
        jl _nb_kernel110_ia32_sse2.nb110_checksingle
        jmp _nb_kernel110_ia32_sse2.nb110_unroll_loop

_nb_kernel110_ia32_sse2.nb110_checksingle:
        ## an odd remaining count means one j particle is left over
        movl nb110_innerk(%esp),%edx
        andl $1,%edx
        jnz _nb_kernel110_ia32_sse2.nb110_dosingle
        jmp _nb_kernel110_ia32_sse2.nb110_updateouterdata

_nb_kernel110_ia32_sse2.nb110_dosingle:
        ## scalar (low-half only) version of the interaction for one j particle
        movl nb110_charge(%ebp),%esi
        movl nb110_pos(%ebp),%edi
        movl nb110_innerjjnr(%esp),%ecx
        movl (%ecx),%eax                        ## eax = jnr
        xorpd %xmm3,%xmm3
        movlpd (%esi,%eax,8),%xmm3              ## charge[jnr]
        movapd nb110_iq(%esp),%xmm5
        mulsd %xmm5,%xmm3                       ## qq

        movd %eax,%mm0                          ## use mmx registers as temp storage (keeps jnr)
        movl nb110_type(%ebp),%esi
        movl (%esi,%eax,4),%eax                 ## type[jnr]
        movl nb110_vdwparam(%ebp),%esi
        shll %eax                               ## two parameters (c6, c12) per entry
        movl nb110_ntia(%esp),%edi
        addl %edi,%eax

        movlpd (%esi,%eax,8),%xmm6              ## c6a
        movhpd 8(%esi,%eax,8),%xmm6             ## c6a c12a
        xorpd %xmm7,%xmm7
        movapd %xmm6,%xmm4
        unpcklpd %xmm7,%xmm4                    ## xmm4 = (c6, 0)
        unpckhpd %xmm7,%xmm6                    ## xmm6 = (c12, 0)
        movd %mm0,%eax                          ## restore jnr
        movapd %xmm4,nb110_c6(%esp)
        movapd %xmm6,nb110_c12(%esp)

        movl nb110_pos(%ebp),%esi               ## base of pos[]
        leal (%eax,%eax,2),%eax                 ## replace jnr with j3

        ## move the j particle's x/y/z to the low halves of xmm0-xmm2
        movlpd (%esi,%eax,8),%xmm0
        movlpd 8(%esi,%eax,8),%xmm1
        movlpd 16(%esi,%eax,8),%xmm2

        ## move ix-iz to xmm4-xmm6
        movapd nb110_ix(%esp),%xmm4
        movapd nb110_iy(%esp),%xmm5
        movapd nb110_iz(%esp),%xmm6

        ## calc dr
        subsd %xmm0,%xmm4
        subsd %xmm1,%xmm5
        subsd %xmm2,%xmm6

        ## store dr
        movapd %xmm4,nb110_dx(%esp)
        movapd %xmm5,nb110_dy(%esp)
        movapd %xmm6,nb110_dz(%esp)

        ## square it
        mulsd %xmm4,%xmm4
        mulsd %xmm5,%xmm5
        mulsd %xmm6,%xmm6
        addsd %xmm5,%xmm4
        addsd %xmm6,%xmm4
        ## rsq in xmm4

        ## single-precision reciprocal-sqrt estimate, widened back to double
        cvtsd2ss %xmm4,%xmm5
        rsqrtss %xmm5,%xmm5
        cvtss2sd %xmm5,%xmm2                    ## lu in low xmm2

        ## lookup seed in xmm2; first refinement step:
        ## lu <- half * lu * (three - rsq*lu*lu)
        movapd %xmm2,%xmm5                      ## copy of lu
        mulsd %xmm2,%xmm2                       ## lu*lu
        movapd nb110_three(%esp),%xmm1
        mulsd %xmm4,%xmm2                       ## rsq*lu*lu
        movapd nb110_half(%esp),%xmm0
        subsd %xmm2,%xmm1                       ## three-rsq*lu*lu
        mulsd %xmm5,%xmm1
        mulsd %xmm0,%xmm1                       ## xmm1=iter1 of rinv (new lu)

        ## second refinement step, same formula
        movapd %xmm1,%xmm5                      ## copy of lu
        mulsd %xmm1,%xmm1                       ## lu*lu
        movapd nb110_three(%esp),%xmm2
        mulsd %xmm4,%xmm1                       ## rsq*lu*lu
        movapd nb110_half(%esp),%xmm0
        subsd %xmm1,%xmm2                       ## three-rsq*lu*lu
        mulsd %xmm5,%xmm2
        mulsd %xmm2,%xmm0                       ## xmm0=rinv

        movapd %xmm0,%xmm4
        mulsd %xmm4,%xmm4                       ## xmm4=rinvsq
        movapd %xmm4,%xmm1
        mulsd %xmm4,%xmm1
        mulsd %xmm4,%xmm1                       ## xmm1=rinvsix
        movapd %xmm1,%xmm2
        mulsd %xmm2,%xmm2                       ## xmm2=rinvtwelve
        mulsd %xmm0,%xmm3                       ## xmm3=vcoul
        mulsd nb110_c6(%esp),%xmm1              ## Vvdw6
        mulsd nb110_c12(%esp),%xmm2             ## Vvdw12
        movapd %xmm2,%xmm5
        subsd %xmm1,%xmm5                       ## Vvdw=Vvdw12-Vvdw6
        addsd nb110_Vvdwtot(%esp),%xmm5
        mulsd nb110_six(%esp),%xmm1
        mulsd nb110_twelve(%esp),%xmm2
        subsd %xmm1,%xmm2
        addsd %xmm3,%xmm2
        mulsd %xmm2,%xmm4                       ## xmm4=total fscal
        addsd nb110_vctot(%esp),%xmm3

        movapd nb110_dx(%esp),%xmm0
        movapd nb110_dy(%esp),%xmm1
        movapd nb110_dz(%esp),%xmm2

        movlpd %xmm3,nb110_vctot(%esp)
        movlpd %xmm5,nb110_Vvdwtot(%esp)

        movl nb110_faction(%ebp),%edi
        mulsd %xmm4,%xmm0
        mulsd %xmm4,%xmm1
        mulsd %xmm4,%xmm2
        ## xmm0-xmm2 contains tx-tz (partial force)

        ## now update f_i
        movlpd nb110_fix(%esp),%xmm3
        movlpd nb110_fiy(%esp),%xmm4
        movlpd nb110_fiz(%esp),%xmm5
        addsd %xmm0,%xmm3
        addsd %xmm1,%xmm4
        addsd %xmm2,%xmm5
        movlpd %xmm3,nb110_fix(%esp)
        movlpd %xmm4,nb110_fiy(%esp)
        movlpd %xmm5,nb110_fiz(%esp)

        ## the fj's - start by accumulating forces from memory
        movlpd (%edi,%eax,8),%xmm3
        movlpd 8(%edi,%eax,8),%xmm4
        movlpd 16(%edi,%eax,8),%xmm5
        subsd %xmm0,%xmm3
        subsd %xmm1,%xmm4
        subsd %xmm2,%xmm5
        movlpd %xmm3,(%edi,%eax,8)
        movlpd %xmm4,8(%edi,%eax,8)
        movlpd %xmm5,16(%edi,%eax,8)

_nb_kernel110_ia32_sse2.nb110_updateouterdata:
        movl nb110_ii3(%esp),%ecx
        movl nb110_faction(%ebp),%edi
        movl nb110_fshift(%ebp),%esi
        movl nb110_is3(%esp),%edx

        ## accumulate i forces in xmm0, xmm1, xmm2
        movapd nb110_fix(%esp),%xmm0
        movapd nb110_fiy(%esp),%xmm1
        movapd nb110_fiz(%esp),%xmm2

        ## fold the high half of each accumulator onto its low half
        movhlps %xmm0,%xmm3
        movhlps %xmm1,%xmm4
        movhlps %xmm2,%xmm5
        addsd %xmm3,%xmm0
        addsd %xmm4,%xmm1
        addsd %xmm5,%xmm2                       ## sum is in low xmm0-xmm2

        ## increment i force
        movsd (%edi,%ecx,8),%xmm3
        movsd 8(%edi,%ecx,8),%xmm4
        movsd 16(%edi,%ecx,8),%xmm5
        addsd %xmm0,%xmm3
        addsd %xmm1,%xmm4
        addsd %xmm2,%xmm5
        movsd %xmm3,(%edi,%ecx,8)
        movsd %xmm4,8(%edi,%ecx,8)
        movsd %xmm5,16(%edi,%ecx,8)

        ## increment fshift force
        movsd (%esi,%edx,8),%xmm3
        movsd 8(%esi,%edx,8),%xmm4
        movsd 16(%esi,%edx,8),%xmm5
        addsd %xmm0,%xmm3
        addsd %xmm1,%xmm4
        addsd %xmm2,%xmm5
        movsd %xmm3,(%esi,%edx,8)
        movsd %xmm4,8(%esi,%edx,8)
        movsd %xmm5,16(%esi,%edx,8)

        ## get n from stack
        movl nb110_n(%esp),%esi
        ## get group index for i particle
        movl nb110_gid(%ebp),%edx               ## base of gid[]
        movl (%edx,%esi,4),%edx                 ## ggid=gid[n]

        ## accumulate total potential energy and update it
        movapd nb110_vctot(%esp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6
        addsd %xmm6,%xmm7                       ## low xmm7 has the sum now

        ## add earlier value from mem
        movl nb110_Vc(%ebp),%eax
        addsd (%eax,%edx,8),%xmm7
        ## move back to mem
        movsd %xmm7,(%eax,%edx,8)

        ## accumulate total lj energy and update it
        movapd nb110_Vvdwtot(%esp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6
        addsd %xmm6,%xmm7                       ## low xmm7 has the sum now

        ## add earlier value from mem
        movl nb110_Vvdw(%ebp),%eax
        addsd (%eax,%edx,8),%xmm7
        ## move back to mem
        movsd %xmm7,(%eax,%edx,8)

        ## finish if last
        movl nb110_nn1(%esp),%ecx
        ## esi already loaded with n
        incl %esi
        subl %esi,%ecx
        jz _nb_kernel110_ia32_sse2.nb110_outerend

        ## not last, iterate outer loop once more!
        movl %esi,nb110_n(%esp)
        jmp _nb_kernel110_ia32_sse2.nb110_outer

_nb_kernel110_ia32_sse2.nb110_outerend:
        ## check if more outer neighborlists remain
        movl nb110_nri(%esp),%ecx
        ## esi already loaded with n above
        subl %esi,%ecx
        jz _nb_kernel110_ia32_sse2.nb110_end
        ## non-zero, do one more workunit
        jmp _nb_kernel110_ia32_sse2.nb110_threadloop

_nb_kernel110_ia32_sse2.nb110_end:
        emms                                    ## mm0 was used as scratch above

        ## report the iteration counters through the caller's pointers
        movl nb110_nouter(%esp),%eax
        movl nb110_ninner(%esp),%ebx
        movl nb110_outeriter(%ebp),%ecx
        movl nb110_inneriter(%ebp),%edx
        movl %eax,(%ecx)
        movl %ebx,(%edx)

        ## undo the alignment adjustment, then the fixed local frame
        movl nb110_salign(%esp),%eax
        addl %eax,%esp
        addl $352,%esp
        popl %edi
        popl %esi
        popl %edx
        popl %ecx
        popl %ebx
        popl %eax
        leave
        ret


## ---------------------------------------------------------------------
## nb_kernel110nf_ia32_sse2
##
## Exported under both the plain and the underscore-prefixed symbol name.
## All arguments are read from the stack relative to %ebp (offsets 8..128
## below); the ones named p_* are pointers that the prologue dereferences
## and copies into the local frame.
## NOTE(review): presumably the energy-only ("no force") variant of the
## kernel above - its local frame declares no force slots - confirm
## against the rest of the file.
## ---------------------------------------------------------------------
.globl nb_kernel110nf_ia32_sse2
.globl _nb_kernel110nf_ia32_sse2
nb_kernel110nf_ia32_sse2:
_nb_kernel110nf_ia32_sse2:
        ## argument offsets relative to %ebp
.set nb110nf_p_nri, 8
.set nb110nf_iinr, 12
.set nb110nf_jindex, 16
.set nb110nf_jjnr, 20
.set nb110nf_shift, 24
.set nb110nf_shiftvec, 28
.set nb110nf_fshift, 32
.set nb110nf_gid, 36
.set nb110nf_pos, 40
.set nb110nf_faction, 44
.set nb110nf_charge, 48
.set nb110nf_p_facel, 52
.set nb110nf_argkrf, 56
.set nb110nf_argcrf, 60
.set nb110nf_Vc, 64
.set nb110nf_type, 68
.set nb110nf_p_ntype, 72
.set nb110nf_vdwparam, 76
.set nb110nf_Vvdw, 80
.set nb110nf_p_tabscale, 84
.set nb110nf_VFtab, 88
.set nb110nf_invsqrta, 92
.set nb110nf_dvda, 96
.set nb110nf_p_gbtabscale, 100
.set nb110nf_GBtab, 104
.set nb110nf_p_nthreads, 108
.set nb110nf_count, 112
.set nb110nf_mtx, 116
.set nb110nf_outeriter, 120
.set nb110nf_inneriter, 124
.set nb110nf_work, 128
        ## stack offsets for local variables
        ## bottom of stack is cache-aligned for sse2 use
.set nb110nf_ix, 0
.set nb110nf_iy, 16
.set nb110nf_iz, 32
.set nb110nf_iq, 48
.set nb110nf_c6, 64
.set nb110nf_c12, 80
.set nb110nf_vctot, 96
.set nb110nf_Vvdwtot, 112
.set nb110nf_half, 128
.set nb110nf_three, 144
.set nb110nf_is3, 160
.set nb110nf_ii3, 164
.set nb110nf_ntia, 168
.set nb110nf_innerjjnr, 172
.set nb110nf_innerk, 176
.set nb110nf_n, 180
.set nb110nf_nn1, 184
.set nb110nf_nri, 188
.set nb110nf_facel, 192                         ## uses 8 bytes
.set nb110nf_ntype, 200
.set nb110nf_nouter, 204
.set nb110nf_ninner, 208
.set nb110nf_salign, 212

        pushl %ebp
        movl %esp,%ebp
        pushl %eax
        pushl %ebx
        pushl %ecx
        pushl %edx
        pushl %esi
        pushl %edi
        ## NOTE(review): only 192 bytes are reserved, but the local offsets
        ## above run up to nb110nf_salign = 212 (+4 bytes). Even with the
        ## alignment adjustment (at most 15 bytes) the stores to the slots
        ## from 192 upward appear to land on the registers pushed just above.
        ## A frame of at least 216 bytes looks necessary; the matching addl
        ## in the epilogue (outside this chunk) would have to change with it.
        subl $192,%esp                          ## local stack space
        movl %esp,%eax
        andl $0xf,%eax
        subl %eax,%esp                          ## round esp down to a 16-byte boundary
        movl %eax,nb110nf_salign(%esp)          ## remember the adjustment for the epilogue

        emms

        ## Move args passed by reference to stack
        movl nb110nf_p_nri(%ebp),%ecx
        movl nb110nf_p_facel(%ebp),%esi
        movl nb110nf_p_ntype(%ebp),%edi
        movl (%ecx),%ecx
        movsd (%esi),%xmm7
        movl (%edi),%edi
        movl %ecx,nb110nf_nri(%esp)
        movsd %xmm7,nb110nf_facel(%esp)
        movl %edi,nb110nf_ntype(%esp)

        ## zero iteration counters
        movl $0,%eax
        movl %eax,nb110nf_nouter(%esp)
        movl %eax,nb110nf_ninner(%esp)

        ## create constant floating-point factors on stack
        movl $0x00000000,%eax                   ## lower half of double 0.5 IEEE (hex)
        movl $0x3fe00000,%ebx                   ## upper half of double 0.5
        movl %eax,nb110nf_half(%esp)
        movl %ebx,nb110nf_half+4(%esp)
        movsd nb110nf_half(%esp),%xmm1
        shufpd $0,%xmm1,%xmm1                   ## splat to all elements
        movapd %xmm1,%xmm3
        addpd %xmm3,%xmm3                       ## 1.0
        movapd %xmm3,%xmm2
        addpd %xmm2,%xmm2                       ## 2.0
        addpd %xmm2,%xmm3                       ## 3.0
        movapd %xmm1,nb110nf_half(%esp)
        movapd %xmm3,nb110nf_three(%esp)

_nb_kernel110nf_ia32_sse2.nb110nf_threadloop:
        movl nb110nf_count(%ebp),%esi           ## pointer to sync counter
        movl (%esi),%eax
_nb_kernel110nf_ia32_sse2.nb110nf_spinlock:
        movl %eax,%ebx                          ## ebx=*count=nn0
        addl $1,%ebx                            ## ebx=nn1=nn0+1
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?