nb_kernel110_ia32_sse2.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 1,175 行 · 第 1/3 页

S
1,175
字号
        ## --------------------------------------------------------------
        ## Tail of the nb110nf kernel (IA-32, SSE2, AT&T syntax).
        ## The routine's prologue, the nb110nf_* offset definitions and the
        ## nb110nf_threadloop / nb110nf_spinlock labels lie before this
        ## point and are only referenced here.
        ##
        ## Addressing conventions used below:
        ##   nb110nf_xxx(%ebp)  - slots addressed off the frame pointer
        ##                        (pointers such as pos, charge, type, ...)
        ##   nb110nf_xxx(%esp)  - local scratch slots on the stack
        ## --------------------------------------------------------------

        ## ---- finish grabbing a work unit [nn0,nn1) -------------------
        lock cmpxchgl %ebx,(%esi)               ## write nn1 to *counter,
                                                ## if it hasnt changed.
                                                ## or reread *counter to eax.
        pause                                   ## -> better p4 performance
        jnz   _nb_kernel110nf_ia32_sse2.nb110nf_spinlock

        ## if(nn1>nri) nn1=nri
        movl  nb110nf_nri(%esp),%ecx
        movl  %ecx,%edx
        subl  %ebx,%ecx                         ## flags from nri-nn1
        cmovlel %edx,%ebx                       ## if(nn1>=nri) nn1=nri
        ## Cleared the spinlock if we got here.
        ## eax contains nn0, ebx contains nn1.
        movl  %eax,nb110nf_n(%esp)
        movl  %ebx,nb110nf_nn1(%esp)
        subl  %eax,%ebx                         ## calc number of outer lists
        movl  %eax,%esi                         ## copy n to esi (movl keeps the flags of subl)
        jg    _nb_kernel110nf_ia32_sse2.nb110nf_outerstart
        jmp   _nb_kernel110nf_ia32_sse2.nb110nf_end

_nb_kernel110nf_ia32_sse2.nb110nf_outerstart:
        ## ebx contains number of outer iterations
        addl  nb110nf_nouter(%esp),%ebx
        movl  %ebx,nb110nf_nouter(%esp)

        ## ---- outer loop: one i particle per pass, esi = n ------------
_nb_kernel110nf_ia32_sse2.nb110nf_outer:
        movl  nb110nf_shift(%ebp),%eax          ## eax = pointer into shift[]
        movl  (%eax,%esi,4),%ebx                ## ebx=shift[n]

        leal  (%ebx,%ebx,2),%ebx                ## ebx=3*is
        movl  nb110nf_shiftvec(%ebp),%eax       ## eax = base of shiftvec[]

        movsd (%eax,%ebx,8),%xmm0               ## shift x
        movsd 8(%eax,%ebx,8),%xmm1              ## shift y
        movsd 16(%eax,%ebx,8),%xmm2             ## shift z

        movl  nb110nf_iinr(%ebp),%ecx           ## ecx = pointer into iinr[]
        movl  (%ecx,%esi,4),%ebx                ## ebx =ii

        movl  nb110nf_charge(%ebp),%edx
        movsd (%edx,%ebx,8),%xmm3               ## charge[ii]
        mulsd nb110nf_facel(%esp),%xmm3         ## iq = facel*charge[ii]
        shufpd $0,%xmm3,%xmm3                   ## duplicate low double into both halves

        movl  nb110nf_type(%ebp),%edx
        movl  (%edx,%ebx,4),%edx                ## type[ii]
        imull nb110nf_ntype(%esp),%edx
        shll  %edx                              ## ntia = 2*ntype*type[ii]
        movl  %edx,nb110nf_ntia(%esp)

        leal  (%ebx,%ebx,2),%ebx                ## ebx = 3*ii=ii3
        movl  nb110nf_pos(%ebp),%eax            ## eax = base of pos[]

        addsd (%eax,%ebx,8),%xmm0               ## ix = shift x + pos[ii3]
        addsd 8(%eax,%ebx,8),%xmm1              ## iy
        addsd 16(%eax,%ebx,8),%xmm2             ## iz

        movapd %xmm3,nb110nf_iq(%esp)

        shufpd $0,%xmm0,%xmm0                   ## duplicate ix/iy/iz into both halves
        shufpd $0,%xmm1,%xmm1
        shufpd $0,%xmm2,%xmm2

        movapd %xmm0,nb110nf_ix(%esp)
        movapd %xmm1,nb110nf_iy(%esp)
        movapd %xmm2,nb110nf_iz(%esp)

        movl  %ebx,nb110nf_ii3(%esp)

        ## clear vctot and Vvdwtot
        xorpd %xmm4,%xmm4
        movapd %xmm4,nb110nf_vctot(%esp)
        movapd %xmm4,nb110nf_Vvdwtot(%esp)

        movl  nb110nf_jindex(%ebp),%eax
        movl  (%eax,%esi,4),%ecx                ## jindex[n]
        movl  4(%eax,%esi,4),%edx               ## jindex[n+1]
        subl  %ecx,%edx                         ## number of innerloop atoms

        movl  nb110nf_pos(%ebp),%esi            ## esi no longer holds n; n is re-read from the stack later
        movl  nb110nf_jjnr(%ebp),%eax
        shll  $2,%ecx                           ## byte offset of jjnr[nj0]
        addl  %ecx,%eax
        movl  %eax,nb110nf_innerjjnr(%esp)      ## pointer to jjnr[nj0]
        movl  %edx,%ecx
        subl  $2,%edx
        addl  nb110nf_ninner(%esp),%ecx
        movl  %ecx,nb110nf_ninner(%esp)
        addl  $0,%edx                           ## re-derive flags from edx (the addl above clobbered them)
        movl  %edx,nb110nf_innerk(%esp)         ## number of innerloop atoms, minus 2
        jge   _nb_kernel110nf_ia32_sse2.nb110nf_unroll_loop
        jmp   _nb_kernel110nf_ia32_sse2.nb110nf_checksingle

        ## ---- inner loop, two j particles per pass --------------------
_nb_kernel110nf_ia32_sse2.nb110nf_unroll_loop:
        ## twice unrolled innerloop here
        movl  nb110nf_innerjjnr(%esp),%edx      ## pointer to jjnr[k]
        movl  (%edx),%eax                       ## eax = jnr of first j atom
        movl  4(%edx),%ebx                      ## ebx = jnr of second j atom
        addl  $8,nb110nf_innerjjnr(%esp)        ## advance pointer (unrolled 2)

        movl  nb110nf_charge(%ebp),%esi         ## base of charge[]

        movlpd (%esi,%eax,8),%xmm3
        movhpd (%esi,%ebx,8),%xmm3              ## xmm3 = both j charges

        movapd nb110nf_iq(%esp),%xmm5
        mulpd %xmm5,%xmm3                       ## qq

        movd  %eax,%mm0                         ## use mmx registers as temp storage
        movd  %ebx,%mm1

        movl  nb110nf_type(%ebp),%esi
        movl  (%esi,%eax,4),%eax                ## type of first j atom
        movl  (%esi,%ebx,4),%ebx                ## type of second j atom
        movl  nb110nf_vdwparam(%ebp),%esi
        shll  %eax
        shll  %ebx
        movl  nb110nf_ntia(%esp),%edi
        addl  %edi,%eax                         ## index of (c6,c12) pair for atom a
        addl  %edi,%ebx                         ## index of (c6,c12) pair for atom b

        movlpd (%esi,%eax,8),%xmm6              ## c6a
        movlpd (%esi,%ebx,8),%xmm7              ## c6b
        movhpd 8(%esi,%eax,8),%xmm6             ## c6a c12a
        movhpd 8(%esi,%ebx,8),%xmm7             ## c6b c12b

        movapd %xmm6,%xmm4
        unpcklpd %xmm7,%xmm4                    ## xmm4 = c6a  c6b
        unpckhpd %xmm7,%xmm6                    ## xmm6 = c12a c12b

        movd  %mm0,%eax                         ## restore both jnr values
        movd  %mm1,%ebx
        movapd %xmm4,nb110nf_c6(%esp)
        movapd %xmm6,nb110nf_c12(%esp)

        movl  nb110nf_pos(%ebp),%esi            ## base of pos[]

        leal  (%eax,%eax,2),%eax                ## replace jnr with j3
        leal  (%ebx,%ebx,2),%ebx

        ## move two coordinates to xmm0-xmm2
        movlpd (%esi,%eax,8),%xmm0
        movlpd 8(%esi,%eax,8),%xmm1
        movlpd 16(%esi,%eax,8),%xmm2
        movhpd (%esi,%ebx,8),%xmm0
        movhpd 8(%esi,%ebx,8),%xmm1
        movhpd 16(%esi,%ebx,8),%xmm2

        ## move ix-iz to xmm4-xmm6
        movapd nb110nf_ix(%esp),%xmm4
        movapd nb110nf_iy(%esp),%xmm5
        movapd nb110nf_iz(%esp),%xmm6

        ## calc dr
        subpd %xmm0,%xmm4
        subpd %xmm1,%xmm5
        subpd %xmm2,%xmm6

        ## square it
        mulpd %xmm4,%xmm4
        mulpd %xmm5,%xmm5
        mulpd %xmm6,%xmm6
        addpd %xmm5,%xmm4
        addpd %xmm6,%xmm4
        ## rsq in xmm4

        ## Single-precision rsqrt estimate, then two refinement passes of
        ##   lu = half * lu * (three - rsq*lu*lu)
        ## NOTE(review): presumably Newton-Raphson with three=3.0 and
        ## half=0.5; both constants are initialised outside this view.
        cvtpd2ps %xmm4,%xmm5
        rsqrtps %xmm5,%xmm5
        cvtps2pd %xmm5,%xmm2                    ## lu in xmm2

        ## lookup seed in xmm2
        movapd %xmm2,%xmm5                      ## copy of lu
        mulpd %xmm2,%xmm2                       ## lu*lu
        movapd nb110nf_three(%esp),%xmm1
        mulpd %xmm4,%xmm2                       ## rsq*lu*lu
        movapd nb110nf_half(%esp),%xmm0
        subpd %xmm2,%xmm1                       ## three-rsq*lu*lu
        mulpd %xmm5,%xmm1
        mulpd %xmm0,%xmm1                       ## xmm1=iter1 of rinv (new lu)

        movapd %xmm1,%xmm5                      ## copy of lu
        mulpd %xmm1,%xmm1                       ## lu*lu
        movapd nb110nf_three(%esp),%xmm2
        mulpd %xmm4,%xmm1                       ## rsq*lu*lu
        movapd nb110nf_half(%esp),%xmm0
        subpd %xmm1,%xmm2                       ## three-rsq*lu*lu
        mulpd %xmm5,%xmm2
        mulpd %xmm2,%xmm0                       ## xmm0=rinv

        movapd %xmm0,%xmm4
        mulpd  %xmm4,%xmm4                      ## xmm4=rinvsq
        movapd %xmm4,%xmm1
        mulpd  %xmm4,%xmm1
        mulpd  %xmm4,%xmm1                      ## xmm1=rinvsix
        movapd %xmm1,%xmm2
        mulpd  %xmm2,%xmm2                      ## xmm2=rinvtwelve
        mulpd  %xmm0,%xmm3                      ## xmm3=vcoul
        mulpd  nb110nf_c6(%esp),%xmm1           ## Vvdw6
        mulpd  nb110nf_c12(%esp),%xmm2          ## Vvdw12
        movapd %xmm2,%xmm5
        subpd  %xmm1,%xmm5                      ## Vvdw=Vvdw12-Vvdw6
        addpd  nb110nf_Vvdwtot(%esp),%xmm5
        addpd  nb110nf_vctot(%esp),%xmm3
        movapd %xmm3,nb110nf_vctot(%esp)
        movapd %xmm5,nb110nf_Vvdwtot(%esp)

        ## should we do one more iteration?
        subl  $2,nb110nf_innerk(%esp)
        jl    _nb_kernel110nf_ia32_sse2.nb110nf_checksingle
        jmp   _nb_kernel110nf_ia32_sse2.nb110nf_unroll_loop

        ## ---- odd remainder: is there one j particle left? ------------
_nb_kernel110nf_ia32_sse2.nb110nf_checksingle:
        movl  nb110nf_innerk(%esp),%edx
        andl  $1,%edx
        jnz   _nb_kernel110nf_ia32_sse2.nb110nf_dosingle
        jmp   _nb_kernel110nf_ia32_sse2.nb110nf_updateouterdata

        ## ---- single j particle, scalar (low-double) arithmetic -------
_nb_kernel110nf_ia32_sse2.nb110nf_dosingle:
        movl  nb110nf_charge(%ebp),%esi
        movl  nb110nf_pos(%ebp),%edi            ## NOTE(review): edi is overwritten with ntia below before it is read
        movl  nb110nf_innerjjnr(%esp),%ecx
        movl  (%ecx),%eax                       ## eax = jnr

        xorpd %xmm3,%xmm3                       ## high half stays zero
        movlpd (%esi,%eax,8),%xmm3              ## charge[jnr]

        movapd nb110nf_iq(%esp),%xmm5
        mulsd %xmm5,%xmm3                       ## qq

        movd  %eax,%mm0                         ## use mmx registers as temp storage
        movl  nb110nf_type(%ebp),%esi
        movl  (%esi,%eax,4),%eax                ## type[jnr]
        movl  nb110nf_vdwparam(%ebp),%esi
        shll  %eax
        movl  nb110nf_ntia(%esp),%edi
        addl  %edi,%eax                         ## index of (c6,c12) pair

        movlpd (%esi,%eax,8),%xmm6              ## c6a
        movhpd 8(%esi,%eax,8),%xmm6             ## c6a c12a

        xorpd %xmm7,%xmm7
        movapd %xmm6,%xmm4
        unpcklpd %xmm7,%xmm4                    ## xmm4 = c6a  0
        unpckhpd %xmm7,%xmm6                    ## xmm6 = c12a 0

        movd  %mm0,%eax                         ## restore jnr
        movapd %xmm4,nb110nf_c6(%esp)
        movapd %xmm6,nb110nf_c12(%esp)

        movl  nb110nf_pos(%ebp),%esi            ## base of pos[]

        leal  (%eax,%eax,2),%eax                ## replace jnr with j3

        ## move the single j coordinate to the low halves of xmm0-xmm2
        movlpd (%esi,%eax,8),%xmm0
        movlpd 8(%esi,%eax,8),%xmm1
        movlpd 16(%esi,%eax,8),%xmm2

        ## move ix-iz to xmm4-xmm6
        movapd nb110nf_ix(%esp),%xmm4
        movapd nb110nf_iy(%esp),%xmm5
        movapd nb110nf_iz(%esp),%xmm6

        ## calc dr
        subsd %xmm0,%xmm4
        subsd %xmm1,%xmm5
        subsd %xmm2,%xmm6

        ## square it
        mulsd %xmm4,%xmm4
        mulsd %xmm5,%xmm5
        mulsd %xmm6,%xmm6
        addsd %xmm5,%xmm4
        addsd %xmm6,%xmm4
        ## rsq in xmm4

        ## Same estimate-plus-two-refinements scheme as in the unrolled loop.
        cvtsd2ss %xmm4,%xmm5
        rsqrtss %xmm5,%xmm5
        cvtss2sd %xmm5,%xmm2                    ## lu in low xmm2

        ## lookup seed in xmm2
        movapd %xmm2,%xmm5                      ## copy of lu
        mulsd %xmm2,%xmm2                       ## lu*lu
        movapd nb110nf_three(%esp),%xmm1
        mulsd %xmm4,%xmm2                       ## rsq*lu*lu
        movapd nb110nf_half(%esp),%xmm0
        subsd %xmm2,%xmm1                       ## three-rsq*lu*lu
        mulsd %xmm5,%xmm1
        mulsd %xmm0,%xmm1                       ## xmm1=iter1 of rinv (new lu)

        movapd %xmm1,%xmm5                      ## copy of lu
        mulsd %xmm1,%xmm1                       ## lu*lu
        movapd nb110nf_three(%esp),%xmm2
        mulsd %xmm4,%xmm1                       ## rsq*lu*lu
        movapd nb110nf_half(%esp),%xmm0
        subsd %xmm1,%xmm2                       ## three-rsq*lu*lu
        mulsd %xmm5,%xmm2
        mulsd %xmm2,%xmm0                       ## xmm0=rinv

        movapd %xmm0,%xmm4
        mulsd  %xmm4,%xmm4                      ## xmm4=rinvsq
        movapd %xmm4,%xmm1
        mulsd  %xmm4,%xmm1
        mulsd  %xmm4,%xmm1                      ## xmm1=rinvsix
        movapd %xmm1,%xmm2
        mulsd  %xmm2,%xmm2                      ## xmm2=rinvtwelve
        mulsd  %xmm0,%xmm3                      ## xmm3=vcoul
        mulsd  nb110nf_c6(%esp),%xmm1           ## Vvdw6
        mulsd  nb110nf_c12(%esp),%xmm2          ## Vvdw12
        movapd %xmm2,%xmm5
        subsd  %xmm1,%xmm5                      ## Vvdw=Vvdw12-Vvdw6
        addsd  nb110nf_Vvdwtot(%esp),%xmm5
        addsd  nb110nf_vctot(%esp),%xmm3
        movlpd %xmm3,nb110nf_vctot(%esp)        ## store low halves only; the high
        movlpd %xmm5,nb110nf_Vvdwtot(%esp)      ## halves in memory are left as they were

        ## ---- fold the two lanes and add into Vc[ggid] / Vvdw[ggid] ---
_nb_kernel110nf_ia32_sse2.nb110nf_updateouterdata:
        ## get n from stack
        movl  nb110nf_n(%esp),%esi
        ## get group index for i particle
        movl  nb110nf_gid(%ebp),%edx            ## base of gid[]
        movl  (%edx,%esi,4),%edx                ## ggid=gid[n]

        ## accumulate total potential energy and update it
        movapd nb110nf_vctot(%esp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6                     ## high double of xmm7 -> low double of xmm6
        addsd  %xmm6,%xmm7                      ## low xmm7 has the sum now

        ## add earlier value from mem
        movl  nb110nf_Vc(%ebp),%eax
        addsd (%eax,%edx,8),%xmm7
        ## move back to mem
        movsd %xmm7,(%eax,%edx,8)

        ## accumulate total lj energy and update it
        movapd nb110nf_Vvdwtot(%esp),%xmm7
        ## accumulate
        movhlps %xmm7,%xmm6
        addsd  %xmm6,%xmm7                      ## low xmm7 has the sum now

        ## add earlier value from mem
        movl  nb110nf_Vvdw(%ebp),%eax
        addsd (%eax,%edx,8),%xmm7
        ## move back to mem
        movsd %xmm7,(%eax,%edx,8)

        ## finish if last
        movl  nb110nf_nn1(%esp),%ecx
        ## esi already loaded with n
        incl  %esi
        subl  %esi,%ecx                         ## nn1-(n+1)
        jz    _nb_kernel110nf_ia32_sse2.nb110nf_outerend

        ## not last, iterate outer loop once more!
        movl  %esi,nb110nf_n(%esp)
        jmp   _nb_kernel110nf_ia32_sse2.nb110nf_outer

_nb_kernel110nf_ia32_sse2.nb110nf_outerend:
        ## check if more outer neighborlists remain
        movl  nb110nf_nri(%esp),%ecx
        ## esi already loaded with n above
        subl  %esi,%ecx
        jz    _nb_kernel110nf_ia32_sse2.nb110nf_end
        ## non-zero, do one more workunit
        jmp   _nb_kernel110nf_ia32_sse2.nb110nf_threadloop

        ## ---- epilogue: report iteration counts, unwind, return -------
_nb_kernel110nf_ia32_sse2.nb110nf_end:
        emms                                    ## leave MMX state (mm0/mm1 were used as scratch)

        movl  nb110nf_nouter(%esp),%eax
        movl  nb110nf_ninner(%esp),%ebx
        movl  nb110nf_outeriter(%ebp),%ecx
        movl  nb110nf_inneriter(%ebp),%edx
        movl  %eax,(%ecx)                       ## *outeriter = nouter
        movl  %ebx,(%edx)                       ## *inneriter = ninner

        movl  nb110nf_salign(%esp),%eax
        addl  %eax,%esp                         ## undo the alignment adjustment stored in salign
        addl  $192,%esp                         ## release 192 bytes of locals (NOTE(review): must match the prologue, not visible here)
        popl  %edi
        popl  %esi
        popl  %edx
        popl  %ecx
        popl  %ebx
        popl  %eax
        leave
        ret

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?