nb_kernel313_x86_64_sse.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,144 行 · 第 1/5 页

S
2,144
字号
    ## ------------------------------------------------------------------
    ## NOTE: this span starts inside the unrolled inner loop and ends
    ## inside nb313_updateouterdata.  The instruction stream is unchanged;
    ## only line structure, label separation and comments were repaired
    ## (one statement per line so that `##` comments no longer swallow
    ## the instructions that follow them).
    ## Syntax: AT&T / GAS, `##` comments.  nb313_* symbols are offsets
    ## defined elsewhere in the file.
    ## ------------------------------------------------------------------

    ## tail of the unrolled loop: stores through %rdi indexed by
    ## %rax/%rbx/%rcx/%rdx (xmm11-xmm14 are produced above this fragment)
    movhps %xmm11,(%rdi,%rbx,4)
    movlps %xmm12,(%rdi,%rcx,4)
    movhps %xmm12,(%rdi,%rdx,4)
    movss  %xmm13,8(%rdi,%rax,4)
    movss  %xmm14,8(%rdi,%rcx,4)
    shufps $1,%xmm13,%xmm13
    shufps $1,%xmm14,%xmm14
    movss  %xmm13,8(%rdi,%rbx,4)
    movss  %xmm14,8(%rdi,%rdx,4)

    ## should we do one more iteration?
    subl $4,nb313_innerk(%rsp)
    jl    _nb_kernel313_x86_64_sse.nb313_odd_inner
    jmp   _nb_kernel313_x86_64_sse.nb313_unroll_loop

    ## ------------------------------------------------------------------
    ## nb313_odd_inner: innerk went negative after the "subl $4" above.
    ## Add the 4 back; a non-zero result is the number of leftover
    ## j entries to process one at a time in nb313_odd_loop.
    ## ------------------------------------------------------------------
_nb_kernel313_x86_64_sse.nb313_odd_inner:
    addl $4,nb313_innerk(%rsp)
    jnz   _nb_kernel313_x86_64_sse.nb313_odd_loop
    jmp   _nb_kernel313_x86_64_sse.nb313_updateouterdata

    ## ------------------------------------------------------------------
    ## nb313_odd_loop: one j entry per iteration against the four i sites,
    ## packed as SIMD elements [O H1 H2 M].
    ##   element 0 (O)      : non-tabulated L-J only (its charge slot is 0)
    ##   elements 1-3       : tabulated coulomb (VFtab lookup)
    ## Register roles inside the loop:
    ##   %rax  j index, later 3*j (coordinate/force offset)
    ##   %rsi  base pointer for charge/type/vdwparam/pos/VFtab in turn
    ##   %rdi  faction base
    ##   %mm6/%mm7  table indices (MMX registers are used here)
    ## ------------------------------------------------------------------
_nb_kernel313_x86_64_sse.nb313_odd_loop:
    movq  nb313_innerjjnr(%rsp),%rdx        ## pointer to jjnr[k]
    movl  (%rdx),%eax
    addq $4,nb313_innerjjnr(%rsp)           ## advance to next jjnr entry

    ## build the charge products qq = qi * qj for [O H1 H2 M]
    xorps %xmm4,%xmm4                       ## clear reg.
    movss nb313_iqM(%rsp),%xmm4
    movq nb313_charge(%rbp),%rsi
    movhps nb313_iqH(%rsp),%xmm4            ## [qM  0  qH  qH]
    shufps $41,%xmm4,%xmm4                  ## [0 qH qH qM]

    movss (%rsi,%rax,4),%xmm3               ## charge in xmm3
    shufps $0,%xmm3,%xmm3                   ## broadcast to all four elements
    mulps %xmm4,%xmm3
    movaps %xmm3,nb313_qqM(%rsp)            ## use dummy qq for storage

    ## fetch the two L-J parameters at vdwparam[2*type[j] + ntia]
    xorps %xmm6,%xmm6
    movq nb313_type(%rbp),%rsi
    movl (%rsi,%rax,4),%ebx
    movq nb313_vdwparam(%rbp),%rsi
    shll %ebx                               ## ebx *= 2
    addl nb313_ntia(%rsp),%ebx
    movlps (%rsi,%rbx,4),%xmm6              ## low half = the parameter pair
    movaps %xmm6,%xmm7
    shufps $252,%xmm6,%xmm6                 ## 11111100: keep 1st param in element 0
    shufps $253,%xmm7,%xmm7                 ## 11111101: move 2nd param to element 0
    movaps %xmm6,nb313_c6(%rsp)
    movaps %xmm7,nb313_c12(%rsp)

    movq nb313_pos(%rbp),%rsi
    lea (%rax,%rax,2),%rax                  ## rax = 3*j

    ## pack the i coordinates as [O H1 H2 M] in xmm0-xmm2
    movss nb313_ixO(%rsp),%xmm0
    movss nb313_iyO(%rsp),%xmm1
    movss nb313_izO(%rsp),%xmm2
    movss nb313_ixH1(%rsp),%xmm3
    movss nb313_iyH1(%rsp),%xmm4
    movss nb313_izH1(%rsp),%xmm5
    unpcklps nb313_ixH2(%rsp),%xmm0         ## ixO ixH2 - -
    unpcklps nb313_iyH2(%rsp),%xmm1         ## iyO iyH2 - -
    unpcklps nb313_izH2(%rsp),%xmm2         ## izO izH2 - -
    unpcklps nb313_ixM(%rsp),%xmm3          ## ixH1 ixM - -
    unpcklps nb313_iyM(%rsp),%xmm4          ## iyH1 iyM - -
    unpcklps nb313_izM(%rsp),%xmm5          ## izH1 izM - -
    unpcklps %xmm3,%xmm0                    ## ixO ixH1 ixH2 ixM
    unpcklps %xmm4,%xmm1                    ## same for y
    unpcklps %xmm5,%xmm2                    ## same for z

    ## load j coords into xmm3-xmm5, broadcast, and form d = j - i
    movss (%rsi,%rax,4),%xmm3
    movss 4(%rsi,%rax,4),%xmm4
    movss 8(%rsi,%rax,4),%xmm5
    shufps $0,%xmm3,%xmm3
    shufps $0,%xmm4,%xmm4
    shufps $0,%xmm5,%xmm5
    subps %xmm0,%xmm3
    subps %xmm1,%xmm4
    subps %xmm2,%xmm5

    ## use M distances for storage
    movaps %xmm3,nb313_dxM(%rsp)
    movaps %xmm4,nb313_dyM(%rsp)
    movaps %xmm5,nb313_dzM(%rsp)

    mulps  %xmm3,%xmm3
    mulps  %xmm4,%xmm4
    mulps  %xmm5,%xmm5
    addps  %xmm3,%xmm4
    addps  %xmm5,%xmm4
    ## rsq in xmm4

    ## rinv: rsqrtps seed refined once as half*lu*(three - rsq*lu*lu)
    rsqrtps %xmm4,%xmm5
    ## lookup seed in xmm5
    movaps %xmm5,%xmm2
    mulps %xmm5,%xmm5
    movaps nb313_three(%rsp),%xmm1
    mulps %xmm4,%xmm5                       ## rsq*lu*lu
    movaps nb313_half(%rsp),%xmm0
    subps %xmm5,%xmm1                       ## three - rsq*lu*lu
    mulps %xmm2,%xmm1
    mulps %xmm1,%xmm0                       ## xmm0=rinv
    movaps %xmm0,nb313_rinvM(%rsp)

    mulps  %xmm0,%xmm4                      ## r
    mulps nb313_tsc(%rsp),%xmm4             ## r * tsc

    ## split r*tsc into integer table index and fractional part eps
    movhlps %xmm4,%xmm7
    cvttps2pi %xmm4,%mm6
    cvttps2pi %xmm7,%mm7                    ## mm6/mm7 contain lu indices
    cvtpi2ps %mm6,%xmm3
    cvtpi2ps %mm7,%xmm7
    movlhps %xmm7,%xmm3
    subps   %xmm3,%xmm4
    movaps %xmm4,%xmm1                      ## xmm1=eps
    movaps %xmm1,%xmm2
    mulps  %xmm2,%xmm2                      ## xmm2=eps2

    pslld $2,%mm6                           ## index *= 4
    pslld $2,%mm7

    ## extract indices for elements 1-3 only (element 0 is not looked up)
    movq nb313_VFtab(%rbp),%rsi
    psrlq $32,%mm6
    movd %mm6,%ebx                          ## index of element 1
    movd %mm7,%ecx                          ## index of element 2
    psrlq $32,%mm7
    movd %mm7,%edx                          ## index of element 3

    xorps  %xmm5,%xmm5
    movlps (%rsi,%rcx,4),%xmm3              ## data: Y3 F3  -  -
    movhps (%rsi,%rbx,4),%xmm5              ## data:  0  0 Y2 F2
    movhps (%rsi,%rdx,4),%xmm3              ## data: Y3 F3 Y4 F4

    movaps %xmm5,%xmm4                      ## data:  0  0 Y2 F2
    shufps $0x88,%xmm3,%xmm4                ## data:  0 Y2 Y3 Y4
    shufps $0xDD,%xmm3,%xmm5                ## data:  0 F2 F3 F4

    xorps  %xmm7,%xmm7
    movlps 8(%rsi,%rcx,4),%xmm3             ## data: G3 H3  -  -
    movhps 8(%rsi,%rbx,4),%xmm7             ## data:  0  0 G2 H2
    movhps 8(%rsi,%rdx,4),%xmm3             ## data: G3 H3 G4 H4

    movaps %xmm7,%xmm6                      ## data:  0  0 G2 H2
    shufps $0x88,%xmm3,%xmm6                ## data:  0 G2 G3 G4
    shufps $0xDD,%xmm3,%xmm7                ## data:  0 H2 H3 H4

    ## xmm4 =  0  Y2 Y3 Y4
    ## xmm5 =  0  F2 F3 F4
    ## xmm6 =  0  G2 G3 G4
    ## xmm7 =  0  H2 H3 H4

    ## coulomb table ready, in xmm4-xmm7
    mulps  %xmm1,%xmm6                      ## xmm6=Geps
    mulps  %xmm2,%xmm7                      ## xmm7=Heps2
    addps  %xmm6,%xmm5
    addps  %xmm7,%xmm5                      ## xmm5=Fp
    mulps  nb313_two(%rsp),%xmm7            ## two*Heps2
    movaps nb313_qqM(%rsp),%xmm0
    addps  %xmm6,%xmm7
    addps  %xmm5,%xmm7                      ## xmm7=FF
    mulps  %xmm1,%xmm5                      ## xmm5=eps*Fp
    addps  %xmm4,%xmm5                      ## xmm5=VV
    mulps  %xmm0,%xmm5                      ## vcoul=qq*VV
    mulps  %xmm7,%xmm0                      ## fijC=FF*qq
    ## at this point xmm5 contains vcoul and xmm0 fijC
    ## increment vcoul - then we can get rid of xmm5
    addps  nb313_vctot(%rsp),%xmm5
    movaps %xmm5,nb313_vctot(%rsp)

    ## do nontable L-J  in first element only.
    movaps nb313_rinvM(%rsp),%xmm2
    mulss  %xmm2,%xmm2                      ## element 0 = rinvsq
    movaps %xmm2,%xmm1
    mulss  %xmm1,%xmm1
    mulss  %xmm2,%xmm1                      ## xmm1=rinvsix
    xorps  %xmm4,%xmm4                      ## upper elements of xmm4 stay 0
    movss  %xmm1,%xmm4
    mulss  %xmm4,%xmm4                      ## xmm4=rinvtwelve
    mulss  nb313_c6(%rsp),%xmm1
    mulss  nb313_c12(%rsp),%xmm4
    movaps %xmm4,%xmm3
    subss  %xmm1,%xmm3                      ## xmm3=Vvdw12-Vvdw6
    mulss  nb313_six(%rsp),%xmm1
    mulss  nb313_twelve(%rsp),%xmm4
    subss  %xmm1,%xmm4
    addss  nb313_Vvdwtot(%rsp),%xmm3
    mulss  nb313_rinvM(%rsp),%xmm4
    ## add back coul stuff from memory, and work on all elements again
    mulps  nb313_tsc(%rsp),%xmm0
    subps  %xmm0,%xmm4
    movss %xmm3,nb313_Vvdwtot(%rsp)
    mulps  nb313_rinvM(%rsp),%xmm4          ## xmm4 = scalar force factor per site

    movaps nb313_dxM(%rsp),%xmm0
    movaps nb313_dyM(%rsp),%xmm1
    movaps nb313_dzM(%rsp),%xmm2
    mulps  %xmm4,%xmm0
    mulps  %xmm4,%xmm1
    mulps  %xmm4,%xmm2                      ## xmm0-xmm2 now contains tx-tz (partial force)

    ## element 0 -> O accumulator
    movss  nb313_fixO(%rsp),%xmm3
    movss  nb313_fiyO(%rsp),%xmm4
    movss  nb313_fizO(%rsp),%xmm5
    addss  %xmm0,%xmm3
    addss  %xmm1,%xmm4
    addss  %xmm2,%xmm5
    movss  %xmm3,nb313_fixO(%rsp)
    movss  %xmm4,nb313_fiyO(%rsp)
    movss  %xmm5,nb313_fizO(%rsp)           ## updated the O force now do the H's

    ## rotate element 1 into position 0 -> H1 accumulator
    movaps %xmm0,%xmm3
    movaps %xmm1,%xmm4
    movaps %xmm2,%xmm5
    shufps $0x39,%xmm3,%xmm3                ## shift right
    shufps $0x39,%xmm4,%xmm4
    shufps $0x39,%xmm5,%xmm5
    addss  nb313_fixH1(%rsp),%xmm3
    addss  nb313_fiyH1(%rsp),%xmm4
    addss  nb313_fizH1(%rsp),%xmm5
    movss  %xmm3,nb313_fixH1(%rsp)
    movss  %xmm4,nb313_fiyH1(%rsp)
    movss  %xmm5,nb313_fizH1(%rsp)          ## updated the H1 force

    ## rotate again: element 2 -> H2 accumulator
    shufps $0x39,%xmm3,%xmm3
    shufps $0x39,%xmm4,%xmm4
    shufps $0x39,%xmm5,%xmm5
    addss  nb313_fixH2(%rsp),%xmm3
    addss  nb313_fiyH2(%rsp),%xmm4
    addss  nb313_fizH2(%rsp),%xmm5
    movss  %xmm3,nb313_fixH2(%rsp)
    movss  %xmm4,nb313_fiyH2(%rsp)
    movss  %xmm5,nb313_fizH2(%rsp)          ## updated the H2 force

    movq nb313_faction(%rbp),%rdi

    ## rotate again: element 3 -> M accumulator
    shufps $0x39,%xmm3,%xmm3
    shufps $0x39,%xmm4,%xmm4
    shufps $0x39,%xmm5,%xmm5
    addss  nb313_fixM(%rsp),%xmm3
    addss  nb313_fiyM(%rsp),%xmm4
    addss  nb313_fizM(%rsp),%xmm5
    movss  %xmm3,nb313_fixM(%rsp)
    movss  %xmm4,nb313_fiyM(%rsp)
    movss  %xmm5,nb313_fizM(%rsp)           ## updated the M force

    ## the fj's - load from mem, then sum tx/ty/tz horizontally in xmm0-xmm2
    movlps (%rdi,%rax,4),%xmm6
    movss  8(%rdi,%rax,4),%xmm7

    movhlps %xmm0,%xmm3
    movhlps %xmm1,%xmm4
    movhlps %xmm2,%xmm5
    addps   %xmm0,%xmm3
    addps   %xmm1,%xmm4
    addps   %xmm2,%xmm5
    movaps  %xmm3,%xmm0
    movaps  %xmm4,%xmm1
    movaps  %xmm5,%xmm2

    shufps $0x39,%xmm3,%xmm3                ## shift right
    shufps $0x39,%xmm4,%xmm4
    shufps $0x39,%xmm5,%xmm5
    addss  %xmm3,%xmm0
    addss  %xmm4,%xmm1
    addss  %xmm5,%xmm2
    unpcklps %xmm1,%xmm0                    ## x,y sum in xmm0, z sum in xmm2

    addps    %xmm0,%xmm6
    addss    %xmm2,%xmm7

    movlps %xmm6,(%rdi,%rax,4)
    movss  %xmm7,8(%rdi,%rax,4)

    decl nb313_innerk(%rsp)
    jz    _nb_kernel313_x86_64_sse.nb313_updateouterdata
    jmp   _nb_kernel313_x86_64_sse.nb313_odd_loop

    ## ------------------------------------------------------------------
    ## nb313_updateouterdata: reduce each i-site accumulator to a scalar,
    ## subtract it from faction at 3*ii (+0/12/24/36 bytes per site), and
    ## collect the per-site sums in xmm6 (x,y) / xmm7 (z) for fshift.
    ## (This block continues past the end of this span.)
    ## ------------------------------------------------------------------
_nb_kernel313_x86_64_sse.nb313_updateouterdata:
    movl  nb313_ii3(%rsp),%ecx
    movq  nb313_faction(%rbp),%rdi
    movq  nb313_fshift(%rbp),%rsi
    movl  nb313_is3(%rsp),%edx

    ## accumulate  Oi forces in xmm0, xmm1, xmm2
    movaps nb313_fixO(%rsp),%xmm0
    movaps nb313_fiyO(%rsp),%xmm1
    movaps nb313_fizO(%rsp),%xmm2

    movhlps %xmm0,%xmm3
    movhlps %xmm1,%xmm4
    movhlps %xmm2,%xmm5
    addps  %xmm3,%xmm0
    addps  %xmm4,%xmm1
    addps  %xmm5,%xmm2                      ## sum is in 1/2 in xmm0-xmm2

    movaps %xmm0,%xmm3
    movaps %xmm1,%xmm4
    movaps %xmm2,%xmm5

    shufps $1,%xmm3,%xmm3
    shufps $1,%xmm4,%xmm4
    shufps $1,%xmm5,%xmm5
    addss  %xmm3,%xmm0
    addss  %xmm4,%xmm1
    addss  %xmm5,%xmm2                      ## xmm0-xmm2 has single force in pos0

    ## update i force (the accumulated value is subtracted)
    movss  (%rdi,%rcx,4),%xmm3
    movss  4(%rdi,%rcx,4),%xmm4
    movss  8(%rdi,%rcx,4),%xmm5
    subss  %xmm0,%xmm3
    subss  %xmm1,%xmm4
    subss  %xmm2,%xmm5
    movss  %xmm3,(%rdi,%rcx,4)
    movss  %xmm4,4(%rdi,%rcx,4)
    movss  %xmm5,8(%rdi,%rcx,4)

    ## accumulate force in xmm6/xmm7 for fshift
    movaps %xmm0,%xmm6
    movss %xmm2,%xmm7
    movlhps %xmm1,%xmm6
    shufps $8,%xmm6,%xmm6                   ## 00001000

    ## accumulate H1i forces in xmm0, xmm1, xmm2
    movaps nb313_fixH1(%rsp),%xmm0
    movaps nb313_fiyH1(%rsp),%xmm1
    movaps nb313_fizH1(%rsp),%xmm2

    movhlps %xmm0,%xmm3
    movhlps %xmm1,%xmm4
    movhlps %xmm2,%xmm5
    addps  %xmm3,%xmm0
    addps  %xmm4,%xmm1
    addps  %xmm5,%xmm2                      ## sum is in 1/2 in xmm0-xmm2

    movaps %xmm0,%xmm3
    movaps %xmm1,%xmm4
    movaps %xmm2,%xmm5

    shufps $1,%xmm3,%xmm3
    shufps $1,%xmm4,%xmm4
    shufps $1,%xmm5,%xmm5
    addss  %xmm3,%xmm0
    addss  %xmm4,%xmm1
    addss  %xmm5,%xmm2                      ## xmm0-xmm2 has single force in pos0

    ## update i force (the accumulated value is subtracted)
    movss  12(%rdi,%rcx,4),%xmm3
    movss  16(%rdi,%rcx,4),%xmm4
    movss  20(%rdi,%rcx,4),%xmm5
    subss  %xmm0,%xmm3
    subss  %xmm1,%xmm4
    subss  %xmm2,%xmm5
    movss  %xmm3,12(%rdi,%rcx,4)
    movss  %xmm4,16(%rdi,%rcx,4)
    movss  %xmm5,20(%rdi,%rcx,4)

    ## accumulate force in xmm6/xmm7 for fshift
    addss %xmm2,%xmm7
    movlhps %xmm1,%xmm0
    shufps $8,%xmm0,%xmm0                   ## 00001000
    addps   %xmm0,%xmm6

    ## accumulate H2i forces in xmm0, xmm1, xmm2
    movaps nb313_fixH2(%rsp),%xmm0
    movaps nb313_fiyH2(%rsp),%xmm1
    movaps nb313_fizH2(%rsp),%xmm2

    movhlps %xmm0,%xmm3
    movhlps %xmm1,%xmm4
    movhlps %xmm2,%xmm5
    addps  %xmm3,%xmm0
    addps  %xmm4,%xmm1
    addps  %xmm5,%xmm2                      ## sum is in 1/2 in xmm0-xmm2

    movaps %xmm0,%xmm3
    movaps %xmm1,%xmm4
    movaps %xmm2,%xmm5

    shufps $1,%xmm3,%xmm3
    shufps $1,%xmm4,%xmm4
    shufps $1,%xmm5,%xmm5
    addss  %xmm3,%xmm0
    addss  %xmm4,%xmm1
    addss  %xmm5,%xmm2                      ## xmm0-xmm2 has single force in pos0

    ## update i force (the accumulated value is subtracted)
    movss  24(%rdi,%rcx,4),%xmm3
    movss  28(%rdi,%rcx,4),%xmm4
    movss  32(%rdi,%rcx,4),%xmm5
    subss  %xmm0,%xmm3
    subss  %xmm1,%xmm4
    subss  %xmm2,%xmm5
    movss  %xmm3,24(%rdi,%rcx,4)
    movss  %xmm4,28(%rdi,%rcx,4)
    movss  %xmm5,32(%rdi,%rcx,4)

    ## accumulate force in xmm6/xmm7 for fshift
    addss %xmm2,%xmm7
    movlhps %xmm1,%xmm0
    shufps $8,%xmm0,%xmm0                   ## 00001000
    addps   %xmm0,%xmm6

    ## accumulate Mi forces in xmm0, xmm1, xmm2
    movaps nb313_fixM(%rsp),%xmm0
    movaps nb313_fiyM(%rsp),%xmm1
    movaps nb313_fizM(%rsp),%xmm2

    movhlps %xmm0,%xmm3
    movhlps %xmm1,%xmm4
    movhlps %xmm2,%xmm5
    addps  %xmm3,%xmm0
    addps  %xmm4,%xmm1
    addps  %xmm5,%xmm2                      ## sum is in 1/2 in xmm0-xmm2

    movaps %xmm0,%xmm3
    movaps %xmm1,%xmm4
    movaps %xmm2,%xmm5

    shufps $1,%xmm3,%xmm3
    shufps $1,%xmm4,%xmm4
    shufps $1,%xmm5,%xmm5
    addss  %xmm3,%xmm0
    addss  %xmm4,%xmm1
    addss  %xmm5,%xmm2                      ## xmm0-xmm2 has single force in pos0

    ## update i force (load of the current M-site values; continues below)
    movss  36(%rdi,%rcx,4),%xmm3
    movss  40(%rdi,%rcx,4),%xmm4
    movss  44(%rdi,%rcx,4),%xmm5

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?