nb_kernel110_ia32_sse2.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 1,175 行 · 第 1/3 页

S
1,175
字号
        movlpd 16(%edi,%eax,8),%xmm5        movhpd (%edi,%ebx,8),%xmm3        movhpd 8(%edi,%ebx,8),%xmm4        movhpd 16(%edi,%ebx,8),%xmm5        subpd %xmm0,%xmm3        subpd %xmm1,%xmm4        subpd %xmm2,%xmm5        movlpd %xmm3,(%edi,%eax,8)        movlpd %xmm4,8(%edi,%eax,8)        movlpd %xmm5,16(%edi,%eax,8)        movhpd %xmm3,(%edi,%ebx,8)        movhpd %xmm4,8(%edi,%ebx,8)        movhpd %xmm5,16(%edi,%ebx,8)        ## should we do one more iteration?         subl $2,nb110_innerk(%esp)        jl    _nb_kernel110_ia32_sse2.nb110_checksingle        jmp   _nb_kernel110_ia32_sse2.nb110_unroll_loop_nb_kernel110_ia32_sse2.nb110_checksingle:         movl  nb110_innerk(%esp),%edx        andl  $1,%edx        jnz    _nb_kernel110_ia32_sse2.nb110_dosingle        jmp    _nb_kernel110_ia32_sse2.nb110_updateouterdata_nb_kernel110_ia32_sse2.nb110_dosingle:         movl nb110_charge(%ebp),%esi        movl nb110_pos(%ebp),%edi        movl nb110_innerjjnr(%esp),%ecx        movl  (%ecx),%eax        xorpd %xmm3,%xmm3        movlpd (%esi,%eax,8),%xmm3        movapd nb110_iq(%esp),%xmm5        mulsd %xmm5,%xmm3               ## qq         movd  %eax,%mm0         ## use mmx registers as temp storage         movl nb110_type(%ebp),%esi        movl (%esi,%eax,4),%eax        movl nb110_vdwparam(%ebp),%esi        shll %eax        movl nb110_ntia(%esp),%edi        addl %edi,%eax        movlpd (%esi,%eax,8),%xmm6      ## c6a        movhpd 8(%esi,%eax,8),%xmm6     ## c6a c12a         xorpd %xmm7,%xmm7        movapd %xmm6,%xmm4        unpcklpd %xmm7,%xmm4        unpckhpd %xmm7,%xmm6        movd  %mm0,%eax        movapd %xmm4,nb110_c6(%esp)        movapd %xmm6,nb110_c12(%esp)        movl nb110_pos(%ebp),%esi        ## base of pos[]         leal  (%eax,%eax,2),%eax     ## replace jnr with j3         ## move two coordinates to xmm0-xmm2            movlpd (%esi,%eax,8),%xmm0        movlpd 8(%esi,%eax,8),%xmm1        movlpd 16(%esi,%eax,8),%xmm2        ## move ix-iz to xmm4-xmm6         
movapd nb110_ix(%esp),%xmm4        movapd nb110_iy(%esp),%xmm5        movapd nb110_iz(%esp),%xmm6        ## calc dr         subsd %xmm0,%xmm4        subsd %xmm1,%xmm5        subsd %xmm2,%xmm6        ## store dr         movapd %xmm4,nb110_dx(%esp)        movapd %xmm5,nb110_dy(%esp)        movapd %xmm6,nb110_dz(%esp)        ## square it         mulsd %xmm4,%xmm4        mulsd %xmm5,%xmm5        mulsd %xmm6,%xmm6        addsd %xmm5,%xmm4        addsd %xmm6,%xmm4        ## rsq in xmm4         cvtsd2ss %xmm4,%xmm5        rsqrtss %xmm5,%xmm5        cvtss2sd %xmm5,%xmm2    ## lu in low xmm2         ## lookup seed in xmm2         movapd %xmm2,%xmm5      ## copy of lu         mulsd %xmm2,%xmm2       ## lu*lu         movapd nb110_three(%esp),%xmm1        mulsd %xmm4,%xmm2       ## rsq*lu*lu                            movapd nb110_half(%esp),%xmm0        subsd %xmm2,%xmm1       ## 30-rsq*lu*lu         mulsd %xmm5,%xmm1        mulsd %xmm0,%xmm1       ## xmm0=iter1 of rinv (new lu)         movapd %xmm1,%xmm5      ## copy of lu         mulsd %xmm1,%xmm1       ## lu*lu         movapd nb110_three(%esp),%xmm2        mulsd %xmm4,%xmm1       ## rsq*lu*lu                            movapd nb110_half(%esp),%xmm0        subsd %xmm1,%xmm2       ## 30-rsq*lu*lu         mulsd %xmm5,%xmm2        mulsd %xmm2,%xmm0       ## xmm0=rinv         movapd %xmm0,%xmm4        mulsd  %xmm4,%xmm4      ## xmm4=rinvsq         movapd %xmm4,%xmm1        mulsd  %xmm4,%xmm1        mulsd  %xmm4,%xmm1      ## xmm1=rinvsix         movapd %xmm1,%xmm2        mulsd  %xmm2,%xmm2      ## xmm2=rinvtwelve         mulsd  %xmm0,%xmm3      ## xmm3=vcoul         mulsd  nb110_c6(%esp),%xmm1        mulsd  nb110_c12(%esp),%xmm2        movapd %xmm2,%xmm5        subsd  %xmm1,%xmm5      ## Vvdw=Vvdw12-Vvdw6         addsd  nb110_Vvdwtot(%esp),%xmm5        mulsd  nb110_six(%esp),%xmm1        mulsd  nb110_twelve(%esp),%xmm2        subsd  %xmm1,%xmm2        addsd  %xmm3,%xmm2        mulsd  %xmm2,%xmm4      ## xmm4=total fscal         
addsd  nb110_vctot(%esp),%xmm3        movapd nb110_dx(%esp),%xmm0        movapd nb110_dy(%esp),%xmm1        movapd nb110_dz(%esp),%xmm2        movlpd %xmm3,nb110_vctot(%esp)        movlpd %xmm5,nb110_Vvdwtot(%esp)        movl   nb110_faction(%ebp),%edi        mulsd  %xmm4,%xmm0        mulsd  %xmm4,%xmm1        mulsd  %xmm4,%xmm2        ## xmm0-xmm2 contains tx-tz (partial force)         ## now update f_i         movlpd nb110_fix(%esp),%xmm3        movlpd nb110_fiy(%esp),%xmm4        movlpd nb110_fiz(%esp),%xmm5        addsd  %xmm0,%xmm3        addsd  %xmm1,%xmm4        addsd  %xmm2,%xmm5        movlpd %xmm3,nb110_fix(%esp)        movlpd %xmm4,nb110_fiy(%esp)        movlpd %xmm5,nb110_fiz(%esp)        ## the fj's - start by accumulating forces from memory         movlpd (%edi,%eax,8),%xmm3        movlpd 8(%edi,%eax,8),%xmm4        movlpd 16(%edi,%eax,8),%xmm5        subsd %xmm0,%xmm3        subsd %xmm1,%xmm4        subsd %xmm2,%xmm5        movlpd %xmm3,(%edi,%eax,8)        movlpd %xmm4,8(%edi,%eax,8)        movlpd %xmm5,16(%edi,%eax,8)_nb_kernel110_ia32_sse2.nb110_updateouterdata:         movl  nb110_ii3(%esp),%ecx        movl  nb110_faction(%ebp),%edi        movl  nb110_fshift(%ebp),%esi        movl  nb110_is3(%esp),%edx        ## accumulate i forces in xmm0, xmm1, xmm2         movapd nb110_fix(%esp),%xmm0        movapd nb110_fiy(%esp),%xmm1        movapd nb110_fiz(%esp),%xmm2        movhlps %xmm0,%xmm3        movhlps %xmm1,%xmm4        movhlps %xmm2,%xmm5        addsd  %xmm3,%xmm0        addsd  %xmm4,%xmm1        addsd  %xmm5,%xmm2 ## sum is in low xmm0-xmm2         ## increment i force         movsd  (%edi,%ecx,8),%xmm3        movsd  8(%edi,%ecx,8),%xmm4        movsd  16(%edi,%ecx,8),%xmm5        addsd  %xmm0,%xmm3        addsd  %xmm1,%xmm4        addsd  %xmm2,%xmm5        movsd  %xmm3,(%edi,%ecx,8)        movsd  %xmm4,8(%edi,%ecx,8)        movsd  %xmm5,16(%edi,%ecx,8)        ## increment fshift force          movsd  (%esi,%edx,8),%xmm3        movsd  
8(%esi,%edx,8),%xmm4        movsd  16(%esi,%edx,8),%xmm5        addsd  %xmm0,%xmm3        addsd  %xmm1,%xmm4        addsd  %xmm2,%xmm5        movsd  %xmm3,(%esi,%edx,8)        movsd  %xmm4,8(%esi,%edx,8)        movsd  %xmm5,16(%esi,%edx,8)        ## get n from stack        movl nb110_n(%esp),%esi        ## get group index for i particle         movl  nb110_gid(%ebp),%edx              ## base of gid[]        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]        ## accumulate total potential energy and update it         movapd nb110_vctot(%esp),%xmm7        ## accumulate         movhlps %xmm7,%xmm6        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now         ## add earlier value from mem         movl  nb110_Vc(%ebp),%eax        addsd (%eax,%edx,8),%xmm7        ## move back to mem         movsd %xmm7,(%eax,%edx,8)        ## accumulate total lj energy and update it         movapd nb110_Vvdwtot(%esp),%xmm7        ## accumulate         movhlps %xmm7,%xmm6        addsd  %xmm6,%xmm7      ## low xmm7 has the sum now         ## add earlier value from mem         movl  nb110_Vvdw(%ebp),%eax        addsd (%eax,%edx,8),%xmm7        ## move back to mem         movsd %xmm7,(%eax,%edx,8)       ## finish if last         movl nb110_nn1(%esp),%ecx        ## esi already loaded with n        incl %esi        subl %esi,%ecx        jz _nb_kernel110_ia32_sse2.nb110_outerend        ## not last, iterate outer loop once more!          
movl %esi,nb110_n(%esp)        jmp _nb_kernel110_ia32_sse2.nb110_outer_nb_kernel110_ia32_sse2.nb110_outerend:         ## check if more outer neighborlists remain        movl  nb110_nri(%esp),%ecx        ## esi already loaded with n above        subl  %esi,%ecx        jz _nb_kernel110_ia32_sse2.nb110_end        ## non-zero, do one more workunit        jmp   _nb_kernel110_ia32_sse2.nb110_threadloop_nb_kernel110_ia32_sse2.nb110_end:         emms        movl nb110_nouter(%esp),%eax        movl nb110_ninner(%esp),%ebx        movl nb110_outeriter(%ebp),%ecx        movl nb110_inneriter(%ebp),%edx        movl %eax,(%ecx)        movl %ebx,(%edx)        movl nb110_salign(%esp),%eax        addl %eax,%esp        addl $352,%esp        popl %edi        popl %esi        popl %edx        popl %ecx        popl %ebx        popl %eax        leave        ret.globl nb_kernel110nf_ia32_sse2.globl _nb_kernel110nf_ia32_sse2nb_kernel110nf_ia32_sse2:       _nb_kernel110nf_ia32_sse2:      .set nb110nf_p_nri, 8.set nb110nf_iinr, 12.set nb110nf_jindex, 16.set nb110nf_jjnr, 20.set nb110nf_shift, 24.set nb110nf_shiftvec, 28.set nb110nf_fshift, 32.set nb110nf_gid, 36.set nb110nf_pos, 40.set nb110nf_faction, 44.set nb110nf_charge, 48.set nb110nf_p_facel, 52.set nb110nf_argkrf, 56.set nb110nf_argcrf, 60.set nb110nf_Vc, 64.set nb110nf_type, 68.set nb110nf_p_ntype, 72.set nb110nf_vdwparam, 76.set nb110nf_Vvdw, 80.set nb110nf_p_tabscale, 84.set nb110nf_VFtab, 88.set nb110nf_invsqrta, 92.set nb110nf_dvda, 96.set nb110nf_p_gbtabscale, 100.set nb110nf_GBtab, 104.set nb110nf_p_nthreads, 108.set nb110nf_count, 112.set nb110nf_mtx, 116.set nb110nf_outeriter, 120.set nb110nf_inneriter, 124.set nb110nf_work, 128        ## stack offsets for local variables          ## bottom of stack is cache-aligned for sse2 use .set nb110nf_ix, 0.set nb110nf_iy, 16.set nb110nf_iz, 32.set nb110nf_iq, 48.set nb110nf_c6, 64.set nb110nf_c12, 80.set nb110nf_vctot, 96.set nb110nf_Vvdwtot, 112.set nb110nf_half, 128.set 
nb110nf_three, 144.set nb110nf_is3, 160.set nb110nf_ii3, 164.set nb110nf_ntia, 168.set nb110nf_innerjjnr, 172.set nb110nf_innerk, 176.set nb110nf_n, 180.set nb110nf_nn1, 184.set nb110nf_nri, 188.set nb110nf_facel, 192                       ## uses 8 bytes.set nb110nf_ntype, 200.set nb110nf_nouter, 204.set nb110nf_ninner, 208.set nb110nf_salign, 212        pushl %ebp        movl %esp,%ebp        pushl %eax        pushl %ebx        pushl %ecx        pushl %edx        pushl %esi        pushl %edi        subl $192,%esp          ## local stack space         movl %esp,%eax        andl $0xf,%eax        subl %eax,%esp        movl %eax,nb110nf_salign(%esp)        emms        ## Move args passed by reference to stack        movl nb110nf_p_nri(%ebp),%ecx        movl nb110nf_p_facel(%ebp),%esi        movl nb110nf_p_ntype(%ebp),%edi        movl (%ecx),%ecx        movsd (%esi),%xmm7        movl (%edi),%edi        movl %ecx,nb110nf_nri(%esp)        movsd %xmm7,nb110nf_facel(%esp)        movl %edi,nb110nf_ntype(%esp)        ## zero iteration counters        movl $0,%eax        movl %eax,nb110nf_nouter(%esp)        movl %eax,nb110nf_ninner(%esp)        ## create constant floating-point factors on stack        movl $0x00000000,%eax   ## lower half of double 0.5 IEEE (hex)        movl $0x3fe00000,%ebx        movl %eax,nb110nf_half(%esp)        movl %ebx,nb110nf_half+4(%esp)        movsd nb110nf_half(%esp),%xmm1        shufpd $0,%xmm1,%xmm1  ## splat to all elements        movapd %xmm1,%xmm3        addpd  %xmm3,%xmm3      ## 1.0        movapd %xmm3,%xmm2        addpd  %xmm2,%xmm2      ## 2.0        addpd  %xmm2,%xmm3      ## 3.0        movapd %xmm1,nb110nf_half(%esp)        movapd %xmm3,nb110nf_three(%esp)_nb_kernel110nf_ia32_sse2.nb110nf_threadloop:         movl  nb110nf_count(%ebp),%esi          ## pointer to sync counter        movl  (%esi),%eax_nb_kernel110nf_ia32_sse2.nb110nf_spinlock:         movl  %eax,%ebx                         ## ebx=*count=nn0        addl  $1,%ebx          
                 ## ebx=nn1=nn0+1

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?